From e5c121b2ba797256dbd27dbd6e662462c7d9ee73 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 29 Nov 2023 12:28:43 +0000 Subject: [PATCH 001/134] Enable pragma unroll for open-source DPC++ --- include/oneapi/dpl/pstl/onedpl_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h index b5641fde37c..8101dc671a6 100644 --- a/include/oneapi/dpl/pstl/onedpl_config.h +++ b/include/oneapi/dpl/pstl/onedpl_config.h @@ -117,7 +117,7 @@ // Enable loop unrolling pragmas where supported #if (__INTEL_LLVM_COMPILER || __INTEL_COMPILER || \ - (!defined(__INTEL_LLVM_COMPILER) && !defined(__INTEL_COMPILER) && _ONEDPL_GCC_VERSION >= 80000)) + (!defined(__INTEL_LLVM_COMPILER) && !defined(__INTEL_COMPILER) && ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 40000)))) # define _ONEDPL_PRAGMA_UNROLL _ONEDPL_PRAGMA(unroll) #else //no pragma unroll # define _ONEDPL_PRAGMA_UNROLL From 7c1cb0faec3f2462c99acd1ec7fab6f2d7615e78 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 29 Nov 2023 14:28:07 +0000 Subject: [PATCH 002/134] clang-format --- include/oneapi/dpl/pstl/onedpl_config.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h index 8101dc671a6..d860e2661f7 100644 --- a/include/oneapi/dpl/pstl/onedpl_config.h +++ b/include/oneapi/dpl/pstl/onedpl_config.h @@ -117,7 +117,8 @@ // Enable loop unrolling pragmas where supported #if (__INTEL_LLVM_COMPILER || __INTEL_COMPILER || \ - (!defined(__INTEL_LLVM_COMPILER) && !defined(__INTEL_COMPILER) && ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 40000)))) + (!defined(__INTEL_LLVM_COMPILER) && !defined(__INTEL_COMPILER) && \ + ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 40000)))) # define _ONEDPL_PRAGMA_UNROLL _ONEDPL_PRAGMA(unroll) #else //no pragma unroll # define _ONEDPL_PRAGMA_UNROLL From 154161f12f07e140b3feace770fd08d3cbd2009a Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Fri, 18 Aug 2023 14:52:41 -0500 Subject: [PATCH 003/134] Start of single-pass scan kernel template --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 + .../hetero/dpcpp/parallel_backend_sycl_scan.h | 150 ++++++++++++++++++ .../numeric/numeric.ops/scan_kt.pass.cpp | 30 ++++ 3 files changed, 182 insertions(+) create mode 100644 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h create mode 100644 test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 2335bad252e..2299a0e26d8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -40,6 +40,8 @@ # include "parallel_backend_sycl_radix_sort.h" #endif +#include "parallel_backend_sycl_scan.h" + namespace oneapi { namespace dpl diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h new file mode 100644 index 00000000000..4fc2dbe4d44 --- /dev/null +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -0,0 +1,150 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered 
by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_parallel_backend_sycl_scan_H +#define _ONEDPL_parallel_backend_sycl_scan_H + +namespace oneapi::dpl::experimental::igpu +{ + +template +struct __scan_status_flag +{ + using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>; + static constexpr std::uint32_t partial_mask = 1 << (sizeof(std::uint32_t)*8 - 2); + static constexpr std::uint32_t full_mask = 1 << (sizeof(std::uint32_t)*8 - 1); + static constexpr std::uint32_t value_mask = ~(partial_mask | full_mask); + + __scan_status_flag(std::uint32_t* flags_begin, const std::uint32_t tile_id) + : atomic_flag(*(flags_begin + tile_id)) + { + + } + + void set_partial(std::uint32_t val) + { + atomic_flag.store(val | partial_mask); + } + + void set_full(std::uint32_t val) + { + atomic_flag.store(val | full_mask); + } + + _T lookback(const std::uint32_t tile_id, std::uint32_t* flags_begin) + { + _T sum = 0; + int i = 0; + for (std::int32_t tile = static_cast(tile_id) - 1; tile >= 0; --tile) + { + _AtomicRefT tile_atomic(*(flags_begin + tile)); + std::uint32_t tile_val = 0; + do { + tile_val = tile_atomic.load(); + } while (tile_val == 0); + + sum += tile_val & value_mask; + + // If this was a full value, we can stop looking at previous tiles. Otherwise, + // keep going through tiles until we either find a full tile or we've completely + // recomputed the prefix using partial values + if (tile_val & full_mask) + break; + } + return sum; + } + + _AtomicRefT atomic_flag; +}; + +template +void +single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + + const ::std::size_t n = __in_rng.size(); + auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + std::size_t num_wgs = __max_cu; + + std::size_t wgsize = n/__max_cu; + + std::uint32_t status_flags_buf_size = num_wgs+1; + sycl::buffer status_flags_buf(status_flags_buf_size); + + // TODO: this probably isn't the best way to do this + sycl::host_accessor status_flags(status_flags_buf); + for (std::size_t i = 0; i < status_flags_buf_size; ++i) + status_flags[i] = 0; + + + auto event = __exec.queue().submit([&](sycl::handler& hdl) { + auto status_flags = sycl::accessor(status_flags_buf, hdl); + auto tile_id_lacc = sycl::accessor(sycl::range<1>{1}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); + hdl.parallel_for(sycl::nd_range<1>(n, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { + auto item_id = item.get_local_linear_id(); + auto group = item.get_group(); + + //std::uint32_t elems_in_tile = elems_per_item*wgsize; + std::uint32_t elems_in_tile = wgsize; + + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (group.leader()) + { + sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_buf_size-1]); + tile_id_lacc[0] = idx_atomic.fetch_add(1); + } + sycl::group_barrier(group); + std::uint32_t tile_id = tile_id_lacc[0]; + + auto in_begin = __in_rng.begin() + 
(tile_id*elems_in_tile); + auto in_end = __in_rng.begin() + ((tile_id+1)*elems_in_tile); + auto out_begin = __out_rng.begin() + (tile_id*elems_in_tile); + + auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); + + __scan_status_flag<_Type> flag(status_flags.get_pointer(), tile_id); + flag.set_partial(local_sum); + + auto prev_sum = flag.lookback(tile_id, status_flags.get_pointer()); + flag.set_full(prev_sum + local_sum); + + sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); + }); + }); + + event.wait(); +} + +template +void +single_pass_inclusive_scan(_Policy&& __exec, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) +{ + auto __n = __in_end - __in_begin; + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + single_pass_scan_impl(__exec, __buf1.all_view(), __buf2.all_view(), __binary_op); +} + +} // namespace oneapi::dpl::experimental::igpu + +#endif /* _ONEDPL_parallel_backend_sycl_scan_H */ diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp new file mode 100644 index 00000000000..71a725563d4 --- /dev/null +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -0,0 +1,30 @@ +// -*- C++ -*- +//===-- scan.pass.cpp -----------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#include "support/test_config.h" + +#include _PSTL_TEST_HEADER(execution) +#include _PSTL_TEST_HEADER(numeric) + +int +main() +{ + int n = 1 << 16; + sycl::queue q; + int* in_ptr = sycl::malloc_device(n, q); + int* out_ptr = sycl::malloc_device(n, q); + oneapi::dpl::experimental::igpu::single_pass_inclusive_scan(oneapi::dpl::execution::dpcpp_default, in_ptr, in_ptr+n, out_ptr, ::std::plus()); + return 0; +} From 16ec5adce45e1e35109e0e15cad0c9174678bcdc Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 24 Aug 2023 08:48:42 -0500 Subject: [PATCH 004/134] Fix hang in inclusive scan --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 44 ++++++++++++++----- .../numeric/numeric.ops/scan_kt.pass.cpp | 30 ++++++++++++- 2 files changed, 61 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 4fc2dbe4d44..e71398a44b7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -16,9 +16,11 @@ #ifndef _ONEDPL_parallel_backend_sycl_scan_H #define _ONEDPL_parallel_backend_sycl_scan_H -namespace oneapi::dpl::experimental::igpu +namespace oneapi::dpl::experimental::kt { +inline namespace igpu { + template struct __scan_status_flag { @@ -69,28 +71,36 @@ struct __scan_status_flag _AtomicRefT atomic_flag; }; -template +template void -single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) +single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); + const ::std::size_t n = __in_rng.size(); - auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); - std::size_t num_wgs = __max_cu; + auto __max_cu = __queue.get_device().template get_info(); + //std::size_t num_wgs = __max_cu; + std::size_t num_wgs = 64; - std::size_t wgsize = n/__max_cu; + // TODO: use wgsize and iters per item from _KernelParam + std::size_t wgsize = n/num_wgs; std::uint32_t status_flags_buf_size = num_wgs+1; sycl::buffer status_flags_buf(status_flags_buf_size); // TODO: this probably isn't the best way to do this + { sycl::host_accessor status_flags(status_flags_buf); for (std::size_t i = 0; i < status_flags_buf_size; ++i) status_flags[i] = 0; + } + +// printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%lu\n", n, num_wgs, wgsize, __max_cu); - auto event = __exec.queue().submit([&](sycl::handler& hdl) { + auto event = __queue.submit([&](sycl::handler& hdl) { auto status_flags = sycl::accessor(status_flags_buf, hdl); auto tile_id_lacc = sycl::accessor(sycl::range<1>{1}, hdl); @@ -121,6 +131,7 @@ single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_r flag.set_partial(local_sum); auto prev_sum = flag.lookback(tile_id, status_flags.get_pointer()); + //auto prev_sum = 0; flag.set_full(prev_sum + local_sum); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); @@ -130,9 +141,18 @@ single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_r event.wait(); } -template +// The generic structure for configuring a kernel +template +struct kernel_param +{ + static constexpr std::uint16_t 
data_per_workitem = DataPerWorkItem; + static constexpr std::uint16_t workgroup_size = WorkGroupSize; + using kernel_name = KernelName; +}; + +template void -single_pass_inclusive_scan(_Policy&& __exec, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) +single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) { auto __n = __in_end - __in_begin; auto __keep1 = @@ -142,9 +162,11 @@ single_pass_inclusive_scan(_Policy&& __exec, _InIterator __in_begin, _InIterator oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_scan_impl(__exec, __buf1.all_view(), __buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } -} // namespace oneapi::dpl::experimental::igpu +} // inline namespace igpu + +} // namespace oneapi::dpl::experimental::kt #endif /* _ONEDPL_parallel_backend_sycl_scan_H */ diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index 71a725563d4..4ae83a92041 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -22,9 +22,35 @@ int main() { int n = 1 << 16; + std::vector v(n, 1); sycl::queue q; int* in_ptr = sycl::malloc_device(n, q); int* out_ptr = sycl::malloc_device(n, q); - oneapi::dpl::experimental::igpu::single_pass_inclusive_scan(oneapi::dpl::execution::dpcpp_default, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - return 0; + + + q.copy(v.data(), in_ptr, n); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; + oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); + + std::vector tmp(n, 0); + q.copy(out_ptr, tmp.data(), n); + + std::inclusive_scan(v.begin(), v.end(), v.begin()); + + bool passed = true; + for (size_t i = 0; i < n; ++i) + { + if (tmp[i] != v[i]) + { + passed = false; + std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + } + } + + if (passed) + std::cout << "passed" << std::endl; + else + std::cout << "failed" << std::endl; + + return !passed; } From bd8960153adb9d62090530479a4c5e7a51d6f142 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 31 Aug 2023 06:18:55 -0700 Subject: [PATCH 005/134] Debug statements for scan kernel template --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 81 +++++++++++++------ 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index e71398a44b7..c70bbabb82b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -82,56 +82,67 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou const ::std::size_t n = __in_rng.size(); auto __max_cu = __queue.get_device().template get_info(); //std::size_t num_wgs = __max_cu; - std::size_t num_wgs = 64; + std::size_t num_wgs = 256; // TODO: use wgsize and iters per item from _KernelParam - std::size_t wgsize = n/num_wgs; + //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; + constexpr ::std::size_t __elems_per_item = 2; + std::size_t 
wgsize = n/num_wgs/__elems_per_item; + std::size_t num_items = n/__elems_per_item; - std::uint32_t status_flags_buf_size = num_wgs+1; - sycl::buffer status_flags_buf(status_flags_buf_size); - // TODO: this probably isn't the best way to do this - { - sycl::host_accessor status_flags(status_flags_buf); - for (std::size_t i = 0; i < status_flags_buf_size; ++i) - status_flags[i] = 0; - } + std::uint32_t status_flags_size = num_wgs+1; + + uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); + __queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); -// printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%lu\n", n, num_wgs, wgsize, __max_cu); + //printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + /*printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug4 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue);*/ auto event = __queue.submit([&](sycl::handler& hdl) { - auto status_flags = sycl::accessor(status_flags_buf, hdl); - auto tile_id_lacc = sycl::accessor(sycl::range<1>{1}, hdl); + auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(n, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { - auto item_id = item.get_local_linear_id(); + hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { auto group = item.get_group(); - //std::uint32_t elems_in_tile = elems_per_item*wgsize; - std::uint32_t elems_in_tile = wgsize; + std::uint32_t elems_in_tile = wgsize*__elems_per_item; // Obtain unique ID for this work-group that will be used in decoupled lookback if (group.leader()) { - sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_buf_size-1]); + sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_size-1]); tile_id_lacc[0] = idx_atomic.fetch_add(1); } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; + //debug5[group.get_local_id()] = tile_id; - auto in_begin = __in_rng.begin() + (tile_id*elems_in_tile); - auto in_end = __in_rng.begin() + ((tile_id+1)*elems_in_tile); - auto out_begin = __out_rng.begin() + (tile_id*elems_in_tile); + auto current_offset = (tile_id*elems_in_tile); + auto next_offset = ((tile_id+1)*elems_in_tile); + auto in_begin = __in_rng.begin() + current_offset; + auto in_end = __in_rng.begin() + next_offset; + auto out_begin = __out_rng.begin() + current_offset; + + //debug3[tile_id] = current_offset; + //debug4[tile_id] = next_offset; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); + //auto local_sum = 0; + ///debug1[tile_id] = local_sum; - __scan_status_flag<_Type> flag(status_flags.get_pointer(), tile_id); + __scan_status_flag<_Type> flag(status_flags, tile_id); flag.set_partial(local_sum); - auto prev_sum = flag.lookback(tile_id, 
status_flags.get_pointer()); + auto prev_sum = flag.lookback(tile_id, status_flags); //auto prev_sum = 0; + //debug2[tile_id] = prev_sum; flag.set_full(prev_sum + local_sum); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); @@ -139,6 +150,30 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); event.wait(); + +#if 0 + std::vector debug1v(status_flags_size); + std::vector debug2v(status_flags_size); + std::vector debug3v(status_flags_size); + std::vector debug4v(status_flags_size); + std::vector debug5v(status_flags_size); + __queue.memcpy(debug1v.data(), debug1, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug2v.data(), debug2, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug3v.data(), debug3, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug4v.data(), debug4, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug5v.data(), debug5, status_flags_size * sizeof(uint32_t)); + + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "local_sum " << i << " " << debug1v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "lookback " << i << " " << debug2v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "offset " << i << " " << debug3v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "end " << i << " " << debug4v[i] << std::endl; +#endif + + sycl::free(status_flags, __queue); } // The generic structure for configuring a kernel From 60a69fcdf20ccfd0341dd72084211096a8965f2f Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Wed, 6 Sep 2023 08:46:10 -0500 Subject: [PATCH 006/134] Update scan kernel template test --- .../numeric/numeric.ops/scan_kt.pass.cpp | 54 +++++++++++-------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index 4ae83a92041..de5ecafc25b 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -21,36 +21,44 @@ int main() { - int n = 1 << 16; - std::vector v(n, 1); - sycl::queue q; - int* in_ptr = sycl::malloc_device(n, q); - int* out_ptr = sycl::malloc_device(n, q); + bool all_passed = true; + for (int logn : {4, 8, 11, 16, 19, 21}) + { + std::cout << "Testing 2^" << logn << '\n'; + int n = 1 << logn; + std::vector v(n, 1); + sycl::queue q; + int* in_ptr = sycl::malloc_device(n, q); + int* out_ptr = sycl::malloc_device(n, q); - q.copy(v.data(), in_ptr, n); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; - oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - std::vector tmp(n, 0); - q.copy(out_ptr, tmp.data(), n); + q.copy(v.data(), in_ptr, n); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; + oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - std::inclusive_scan(v.begin(), v.end(), v.begin()); + std::vector tmp(n, 0); + q.copy(out_ptr, tmp.data(), n); - bool passed = true; - for (size_t i = 0; i < n; ++i) - { - if (tmp[i] != v[i]) + std::inclusive_scan(v.begin(), v.end(), v.begin()); + + bool passed = true; + for (size_t i = 0; i < n; ++i) { - passed = false; - std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + if (tmp[i] != v[i]) + { + passed = false; + 
std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + } } - } - if (passed) - std::cout << "passed" << std::endl; - else - std::cout << "failed" << std::endl; + if (passed) + std::cout << "passed" << std::endl; + else + std::cout << "failed" << std::endl; + + all_passed &= passed; + } - return !passed; + return !all_passed; } From d526f0431baf85cdf96143d332f2f6a24c44fec9 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 14 Sep 2023 09:08:55 -0700 Subject: [PATCH 007/134] Only have a single work-item per group query for previous tile status --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index c70bbabb82b..b01f56ac539 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -86,7 +86,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // TODO: use wgsize and iters per item from _KernelParam //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; - constexpr ::std::size_t __elems_per_item = 2; + constexpr ::std::size_t __elems_per_item = 16; std::size_t wgsize = n/num_wgs/__elems_per_item; std::size_t num_items = n/__elems_per_item; @@ -96,14 +96,16 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); __queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); - //printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); - /*printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); +#if SCAN_KT_DEBUG + printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug4 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue);*/ + uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue); +#endif auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); @@ -138,12 +140,21 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou ///debug1[tile_id] = local_sum; __scan_status_flag<_Type> flag(status_flags, tile_id); - flag.set_partial(local_sum); - auto prev_sum = flag.lookback(tile_id, status_flags); - //auto prev_sum = 0; + if (group.leader()) + flag.set_partial(local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + //sycl::reduce_over_group(item.get_subgroup()) + + auto prev_sum = 0; + + if (group.leader()) + prev_sum = flag.lookback(tile_id, status_flags); //debug2[tile_id] = prev_sum; - flag.set_full(prev_sum + local_sum); + + if (group.leader()) + flag.set_full(prev_sum + local_sum); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); }); From 
09e9bbf4329623afa46fbe3ed6e6029835094157 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Mon, 18 Sep 2023 08:06:43 -0700 Subject: [PATCH 008/134] First attempt at parallel lookback --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 106 +++++++++++++++--- 1 file changed, 89 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index b01f56ac539..27fdc1d09b4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -24,13 +24,21 @@ inline namespace igpu { template struct __scan_status_flag { + // 00xxxx - not computed + // 01xxxx - partial + // 10xxxx - full + // 110000 - out of bounds + using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>; static constexpr std::uint32_t partial_mask = 1 << (sizeof(std::uint32_t)*8 - 2); static constexpr std::uint32_t full_mask = 1 << (sizeof(std::uint32_t)*8 - 1); static constexpr std::uint32_t value_mask = ~(partial_mask | full_mask); + static constexpr std::uint32_t oob_value = partial_mask | full_mask; + + static constexpr int padding = 32; __scan_status_flag(std::uint32_t* flags_begin, const std::uint32_t tile_id) - : atomic_flag(*(flags_begin + tile_id)) + : atomic_flag(*(flags_begin + tile_id + padding)) { } @@ -42,16 +50,57 @@ struct __scan_status_flag void set_full(std::uint32_t val) { - atomic_flag.store(val | full_mask); + atomic_flag.store((val ^ partial_mask) | full_mask); + } + + template + _T cooperative_lookback(const std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) + { + _T sum = 0; + int offset = -1; + int i = 0; + int local_id = subgroup.get_local_id(); + + for (int tile = static_cast(tile_id) + offset; tile >= 0; offset -= 32) + { + _AtomicRefT tile_atomic(*(flags_begin + tile + padding - local_id)); + std::uint32_t tile_val = 0; + do { + tile_val = tile_atomic.load(); + + //} while (!sycl::all_of_group(subgroup, tile_val != 0)); + } while (0); + + bool is_full = (tile_val & full_mask) && ((tile_val & partial_mask) == 0); + auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); + ::std::uint32_t is_full_ballot_bits{}; + is_full_ballot.extract_bits(is_full_ballot_bits); + + auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); + _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{}; + + // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) + sum += sycl::reduce_over_group(subgroup, contribution, bin_op); + + // If we found a full value, we can stop looking at previous tiles. 
Otherwise, + // keep going through tiles until we either find a full tile or we've completely + // recomputed the prefix using partial values + if (is_full_ballot_bits) + break; + + //if (i++ > 10) break; + } + return sum; } +#if 0 _T lookback(const std::uint32_t tile_id, std::uint32_t* flags_begin) { _T sum = 0; int i = 0; for (std::int32_t tile = static_cast(tile_id) - 1; tile >= 0; --tile) { - _AtomicRefT tile_atomic(*(flags_begin + tile)); + _AtomicRefT tile_atomic(*(flags_begin + tile + padding)); std::uint32_t tile_val = 0; do { tile_val = tile_atomic.load(); @@ -67,6 +116,7 @@ struct __scan_status_flag } return sum; } +#endif _AtomicRefT atomic_flag; }; @@ -86,15 +136,28 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // TODO: use wgsize and iters per item from _KernelParam //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; - constexpr ::std::size_t __elems_per_item = 16; +#ifdef _ONEDPL_SCAN_ITER_SIZE + constexpr ::std::size_t __elems_per_item = _ONEDPL_SCAN_ITER_SIZE; +#else + constexpr ::std::size_t __elems_per_item = 8; +#endif std::size_t wgsize = n/num_wgs/__elems_per_item; std::size_t num_items = n/__elems_per_item; - std::uint32_t status_flags_size = num_wgs+1; + constexpr int status_flag_padding = 32; + std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); - __queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); + //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); + + auto fill_event = __queue.submit([&](sycl::handler& hdl) { + + hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { + int id = item.get_linear_id(); + status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::oob_value : 0; + }); + }); #if SCAN_KT_DEBUG printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); @@ -109,10 +172,12 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { auto group = item.get_group(); + auto subgroup = item.get_sub_group(); std::uint32_t elems_in_tile = wgsize*__elems_per_item; @@ -139,23 +204,30 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou //auto local_sum = 0; ///debug1[tile_id] = local_sum; - __scan_status_flag<_Type> flag(status_flags, tile_id); + auto prev_sum = 0; - if (group.leader()) - flag.set_partial(local_sum); + // The first sub-group will query the previous tiles to find a prefix + if (subgroup.get_group_id() == 0) + { + __scan_status_flag<_Type> flag(status_flags, tile_id); - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - //sycl::reduce_over_group(item.get_subgroup()) + if (group.leader()) + flag.set_partial(local_sum); - auto prev_sum = 0; + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + //sycl::reduce_over_group(item.get_subgroup()) - if (group.leader()) - prev_sum = flag.lookback(tile_id, status_flags); - //debug2[tile_id] = prev_sum; - if (group.leader()) - flag.set_full(prev_sum + local_sum); + prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags); + //if (group.leader()) + // prev_sum = flag.lookback(tile_id, status_flags); + //debug2[tile_id] = prev_sum; + + if (group.leader()) + flag.set_full(prev_sum + local_sum); + } + prev_sum = sycl::group_broadcast(group, prev_sum, 0); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); }); }); From 30e0da7811689d75de697aafefeec7bac2ec8526 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Fri, 22 Sep 2023 11:42:33 -0700 Subject: [PATCH 009/134] Working cooperative lookback --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 61 +++++++++++++------ 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 27fdc1d09b4..963de2952e6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -50,26 +50,26 @@ struct __scan_status_flag void set_full(std::uint32_t val) { - atomic_flag.store((val ^ partial_mask) | full_mask); + atomic_flag.store(val | full_mask); } template - _T cooperative_lookback(const std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) + _T cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) { _T sum = 0; int offset = -1; int i = 0; int local_id = subgroup.get_local_id(); - for (int tile = static_cast(tile_id) + offset; tile >= 0; offset -= 32) + for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= 32) { _AtomicRefT tile_atomic(*(flags_begin 
+ tile + padding - local_id)); std::uint32_t tile_val = 0; do { tile_val = tile_atomic.load(); - //} while (!sycl::all_of_group(subgroup, tile_val != 0)); - } while (0); + } while (!sycl::all_of_group(subgroup, tile_val != 0)); + //} while (0); bool is_full = (tile_val & full_mask) && ((tile_val & partial_mask) == 0); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); @@ -77,7 +77,7 @@ struct __scan_status_flag is_full_ballot.extract_bits(is_full_ballot_bits); auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); - _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{}; + _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) sum += sycl::reduce_over_group(subgroup, contribution, bin_op); @@ -132,6 +132,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou const ::std::size_t n = __in_rng.size(); auto __max_cu = __queue.get_device().template get_info(); //std::size_t num_wgs = __max_cu; + //std::size_t num_wgs = 448; std::size_t num_wgs = 256; // TODO: use wgsize and iters per item from _KernelParam @@ -143,26 +144,31 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou #endif std::size_t wgsize = n/num_wgs/__elems_per_item; std::size_t num_items = n/__elems_per_item; + // + //std::size_t wgsize = 256; + //std::size_t num_items = 114688; constexpr int status_flag_padding = 32; std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; + printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); + uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); auto fill_event = __queue.submit([&](sycl::handler& hdl) { - hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { + hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { int id = item.get_linear_id(); status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::oob_value : 0; }); }); -#if SCAN_KT_DEBUG - printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); - printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + std::uint32_t elems_in_tile = wgsize*__elems_per_item; +#define SCAN_KT_DEBUG 1 +#if SCAN_KT_DEBUG uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); @@ -175,11 +181,10 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { + hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { auto group = item.get_group(); auto subgroup = item.get_sub_group(); - std::uint32_t elems_in_tile = wgsize*__elems_per_item; // Obtain unique ID for this work-group that will be used in decoupled lookback if (group.leader()) @@ -189,7 +194,9 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; - //debug5[group.get_local_id()] = tile_id; +#if SCAN_KT_DEBUG + debug5[group.get_group_linear_id()] = tile_id; +#endif auto current_offset = (tile_id*elems_in_tile); auto next_offset = ((tile_id+1)*elems_in_tile); @@ -197,12 +204,15 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto in_end = __in_rng.begin() + next_offset; auto out_begin = __out_rng.begin() + current_offset; - //debug3[tile_id] = current_offset; - //debug4[tile_id] = next_offset; +#if SCAN_KT_DEBUG + debug3[tile_id] = current_offset; + debug4[tile_id] = next_offset; +#endif auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); - //auto local_sum = 0; - ///debug1[tile_id] = local_sum; +#if SCAN_KT_DEBUG + debug1[tile_id] = local_sum; +#endif auto prev_sum = 0; @@ -221,7 +231,9 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags); //if (group.leader()) // prev_sum = flag.lookback(tile_id, status_flags); - //debug2[tile_id] = prev_sum; +#if SCAN_KT_DEBUG + debug2[tile_id] = prev_sum; +#endif if (group.leader()) flag.set_full(prev_sum + local_sum); @@ -234,20 +246,31 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou event.wait(); -#if 0 +#if SCAN_KT_DEBUG std::vector debug1v(status_flags_size); std::vector debug2v(status_flags_size); std::vector debug3v(status_flags_size); std::vector debug4v(status_flags_size); std::vector debug5v(status_flags_size); + std::vector debug6v(status_flags_size); __queue.memcpy(debug1v.data(), debug1, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug2v.data(), debug2, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug3v.data(), debug3, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug4v.data(), debug4, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug5v.data(), debug5, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug6v.data(), status_flags, status_flags_size * sizeof(uint32_t)); + for (int i = 0; i < status_flags_size-1; ++i) 
+ std::cout << "tile " << i << " " << debug5v[i] << std::endl; for (int i = 0; i < status_flags_size-1; ++i) std::cout << "local_sum " << i << " " << debug1v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + { + auto val = (debug6v[i] & __scan_status_flag<_Type>::value_mask); + int a = val / elems_in_tile; + int b = val % elems_in_tile; + std::cout << "flags " << i << " " << std::bitset<32>(debug6v[i]) << " (" << val<< " = " << a << "/" << elems_in_tile << "+" << b <<")" << std::endl; + } for (int i = 0; i < status_flags_size-1; ++i) std::cout << "lookback " << i << " " << debug2v[i] << std::endl; for (int i = 0; i < status_flags_size-1; ++i) From 2311929486e18d8b4eee18208e932f20409e4489 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Wed, 25 Oct 2023 11:13:53 -0700 Subject: [PATCH 010/134] Fix correctness issue with non-power-of-2 sizes --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 52 ++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 963de2952e6..7aaf3f2a255 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -133,7 +133,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto __max_cu = __queue.get_device().template get_info(); //std::size_t num_wgs = __max_cu; //std::size_t num_wgs = 448; - std::size_t num_wgs = 256; + //std::size_t num_wgs = 256; // TODO: use wgsize and iters per item from _KernelParam //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; @@ -142,8 +142,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou #else constexpr ::std::size_t __elems_per_item = 8; #endif - std::size_t wgsize = n/num_wgs/__elems_per_item; - std::size_t num_items = n/__elems_per_item; + // Next power of 2 greater than or equal to __n + auto __n_uniform = n; + if ((__n_uniform & (__n_uniform - 1)) != 0) + __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(n) << 1; + //std::size_t wgsize = n/num_wgs/__elems_per_item; + std::size_t wgsize = 256; + std::size_t num_items = __n_uniform/__elems_per_item; + std::size_t num_wgs = num_items/wgsize; // //std::size_t wgsize = 256; //std::size_t num_items = 114688; @@ -152,7 +158,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr int status_flag_padding = 32; std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; - printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); + //printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); @@ -165,10 +171,17 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); + std::uint32_t elems_in_tile = wgsize*__elems_per_item; -#define SCAN_KT_DEBUG 1 +#define SCAN_KT_DEBUG 0 #if SCAN_KT_DEBUG + std::vector debug11v(status_flags_size); + __queue.memcpy(debug11v.data(), status_flags, status_flags_size * sizeof(uint32_t)); + + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "flag_before " << i << " " << debug11v[i] << 
std::endl; + uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); @@ -200,21 +213,27 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto current_offset = (tile_id*elems_in_tile); auto next_offset = ((tile_id+1)*elems_in_tile); + if (next_offset > n) + next_offset = n; auto in_begin = __in_rng.begin() + current_offset; auto in_end = __in_rng.begin() + next_offset; auto out_begin = __out_rng.begin() + current_offset; + #if SCAN_KT_DEBUG debug3[tile_id] = current_offset; debug4[tile_id] = next_offset; #endif + if (current_offset >= n) + return; + auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); #if SCAN_KT_DEBUG debug1[tile_id] = local_sum; #endif - auto prev_sum = 0; + _Type prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) @@ -296,6 +315,17 @@ void single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) { auto __n = __in_end - __in_begin; + +#if SCAN_KT_DEBUG + using _Type = std::remove_pointer_t<_InIterator>; + std::vector<_Type> in_debug(__n); + __queue.memcpy(in_debug.data(), __in_begin, __n * sizeof(_Type)); + + for (int i = 0; i < __n; ++i) + std::cout << "input_before " << i << " " << in_debug[i] << std::endl; +#endif + + //printf("KERNEL_TEMPLATE %lu\n", __n); auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); @@ -304,6 +334,16 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera auto __buf2 = __keep2(__out_begin, __out_begin + __n); single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + +#if SCAN_KT_DEBUG + std::vector<_Type> in_debug2(__n); + __queue.memcpy(in_debug2.data(), __in_begin, __n * sizeof(_Type)); + + for (int i = 0; i < __n; ++i) + std::cout << "input_after " << i << " " << in_debug2[i] << std::endl; +#endif + + //printf("KERNEL_TEMPLATE DONE %lu\n", __n); } } // inline namespace igpu From 0f58c07c24ea397f1d63eabfe7ae7dac82cdf14f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 7 Nov 2023 10:51:21 +0000 Subject: [PATCH 011/134] Scan_kt Flags and Values separated (#15) Atomic flags and the values used in Scan_kt separated to avoid truncating the range to 30bit values, and prepare for a more general scan implementation. 
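
As an aside for reviewers, a minimal host-side C++ sketch of the separated
layout (illustrative only: the name tile_state, its members, and the use of
std::atomic are stand-ins for the SYCL atomics and USM buffers in the actual
patch):

    #include <atomic>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // One 32-bit word per tile carries only a status code; the partial and
    // full sums live in a separate value array (partial at [tile], full at
    // [tile + num_tiles]), so values are no longer squeezed into the 30 bits
    // left over after the status bits.
    template <typename T>
    struct tile_state
    {
        static constexpr std::uint32_t NOT_READY = 0, PARTIAL = 1, FULL = 2;

        std::vector<std::atomic<std::uint32_t>> flags; // status only
        std::vector<T> sums; // [0, n): partial sums, [n, 2n): full sums

        explicit tile_state(std::size_t n) : flags(n), sums(2 * n)
        {
            for (auto& f : flags)
                f.store(NOT_READY, std::memory_order_relaxed);
        }

        void set_partial(std::size_t tile, T v)
        {
            sums[tile] = v;
            flags[tile].store(PARTIAL, std::memory_order_release);
        }

        void set_full(std::size_t tile, T v)
        {
            sums[tile + flags.size()] = v;
            flags[tile].store(FULL, std::memory_order_release);
        }
    };

A reader that loads the flag with acquire semantics and only then reads the
corresponding sum observes a consistent value, which is why the device-side
flag atomic in this patch moves from relaxed to acq_rel ordering.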
--- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 242 +++++------------- .../numeric/numeric.ops/scan_kt.pass.cpp | 14 +- 2 files changed, 75 insertions(+), 181 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 7aaf3f2a255..f52e4ef532f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -21,63 +21,75 @@ namespace oneapi::dpl::experimental::kt inline namespace igpu { +constexpr size_t SUBGROUP_SIZE = 32; + template struct __scan_status_flag { - // 00xxxx - not computed - // 01xxxx - partial - // 10xxxx - full - // 110000 - out of bounds - - using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>; - static constexpr std::uint32_t partial_mask = 1 << (sizeof(std::uint32_t)*8 - 2); - static constexpr std::uint32_t full_mask = 1 << (sizeof(std::uint32_t)*8 - 1); - static constexpr std::uint32_t value_mask = ~(partial_mask | full_mask); - static constexpr std::uint32_t oob_value = partial_mask | full_mask; - - static constexpr int padding = 32; - - __scan_status_flag(std::uint32_t* flags_begin, const std::uint32_t tile_id) - : atomic_flag(*(flags_begin + tile_id + padding)) + using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::acq_rel, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + static constexpr std::uint32_t NOT_READY = 0; + static constexpr std::uint32_t PARTIAL_MASK = 1; + static constexpr std::uint32_t FULL_MASK = 2; + static constexpr std::uint32_t OUT_OF_BOUNDS = 4; + + static constexpr int padding = SUBGROUP_SIZE; + + __scan_status_flag(const std::uint32_t tile_id, std::uint32_t* flags_begin, _T* tile_sums, + size_t num_elements) + : atomic_flag(*(flags_begin + tile_id + padding)), scanned_partial_value(tile_sums + tile_id + padding), + scanned_full_value(tile_sums + tile_id + padding + num_elements), num_elements{num_elements} { - } - void set_partial(std::uint32_t val) + void + set_partial(_T val) { - atomic_flag.store(val | partial_mask); + (*scanned_partial_value) = val; + atomic_flag.store(PARTIAL_MASK); } - void set_full(std::uint32_t val) + void + set_full(_T val) { - atomic_flag.store(val | full_mask); + (*scanned_full_value) = val; + atomic_flag.store(FULL_MASK); } - template - _T cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) + template + _T + cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin, + _T* tile_sums) { _T sum = 0; int offset = -1; int i = 0; int local_id = subgroup.get_local_id(); - for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= 32) + for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { _AtomicRefT tile_atomic(*(flags_begin + tile + padding - local_id)); - std::uint32_t tile_val = 0; - do { - tile_val = tile_atomic.load(); + std::uint32_t flag; + do + { + flag = tile_atomic.load(); + } while (!sycl::all_of_group(subgroup, flag != NOT_READY)); // Loop till all ready - } while (!sycl::all_of_group(subgroup, tile_val != 0)); - //} while (0); + bool is_full = flag == FULL_MASK; - bool is_full = (tile_val & full_mask) && ((tile_val & partial_mask) == 0); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); 
::std::uint32_t is_full_ballot_bits{}; is_full_ballot.extract_bits(is_full_ballot_bits); auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); - _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{0}; + + // The partial scan results and the full scan sum values are in contiguous memory. + // Each section of the memory is of size num_elements. + // The partial sum for a tile is at [i] and the full sum is at [i + num_elements] + // is_full * num_elements allows to select between the two values without branching the code. + size_t contrib_offset = tile + padding - local_id + is_full * num_elements; + _T val = *(tile_sums + contrib_offset); + _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) ? val : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) sum += sycl::reduce_over_group(subgroup, contribution, bin_op); @@ -88,37 +100,16 @@ struct __scan_status_flag if (is_full_ballot_bits) break; - //if (i++ > 10) break; } - return sum; - } - -#if 0 - _T lookback(const std::uint32_t tile_id, std::uint32_t* flags_begin) - { - _T sum = 0; - int i = 0; - for (std::int32_t tile = static_cast(tile_id) - 1; tile >= 0; --tile) - { - _AtomicRefT tile_atomic(*(flags_begin + tile + padding)); - std::uint32_t tile_val = 0; - do { - tile_val = tile_atomic.load(); - } while (tile_val == 0); - - sum += tile_val & value_mask; - // If this was a full value, we can stop looking at previous tiles. Otherwise, - // keep going through tiles until we either find a full tile or we've completely - // recomputed the prefix using partial values - if (tile_val & full_mask) - break; - } return sum; } -#endif _AtomicRefT atomic_flag; + _T* scanned_partial_value; + _T* scanned_full_value; + + size_t num_elements; }; template @@ -130,86 +121,57 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); const ::std::size_t n = __in_rng.size(); - auto __max_cu = __queue.get_device().template get_info(); - //std::size_t num_wgs = __max_cu; - //std::size_t num_wgs = 448; - //std::size_t num_wgs = 256; - - // TODO: use wgsize and iters per item from _KernelParam - //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; #ifdef _ONEDPL_SCAN_ITER_SIZE - constexpr ::std::size_t __elems_per_item = _ONEDPL_SCAN_ITER_SIZE; + constexpr ::std::size_t __elems_per_workitem = _ONEDPL_SCAN_ITER_SIZE; #else - constexpr ::std::size_t __elems_per_item = 8; + constexpr ::std::size_t __elems_per_workitem = 8; #endif // Next power of 2 greater than or equal to __n auto __n_uniform = n; if ((__n_uniform & (__n_uniform - 1)) != 0) __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(n) << 1; - //std::size_t wgsize = n/num_wgs/__elems_per_item; - std::size_t wgsize = 256; - std::size_t num_items = __n_uniform/__elems_per_item; - std::size_t num_wgs = num_items/wgsize; - // - //std::size_t wgsize = 256; - //std::size_t num_items = 114688; - + std::size_t num_workitems = __n_uniform / __elems_per_workitem; + std::size_t wgsize = num_workitems > 256 ? 
256 : num_workitems; + std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_workitems, wgsize); - constexpr int status_flag_padding = 32; - std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; - - //printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); + constexpr int status_flag_padding = SUBGROUP_SIZE; + std::uint32_t status_flags_size = num_wgs + status_flag_padding + 1; + std::uint32_t tile_sums_size = num_wgs + status_flag_padding; uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); - //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); + // First status_flags_size elements: partial scanned values (using _BinaryOp) of each workgroup + // Second status_flags_size elements: full scanned values, i.e. sum of the previous and current workgroup's partial sums + _Type* tile_sums = sycl::malloc_device<_Type>(tile_sums_size * 2, __queue); auto fill_event = __queue.submit([&](sycl::handler& hdl) { - hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { int id = item.get_linear_id(); - status_flags[id] = id < status_flag_padding ? __scan_status_flag<_Type>::oob_value : 0; + status_flags[id] = id < status_flag_padding ? __scan_status_flag<_Type>::OUT_OF_BOUNDS + : __scan_status_flag<_Type>::NOT_READY; }); }); - - std::uint32_t elems_in_tile = wgsize*__elems_per_item; - -#define SCAN_KT_DEBUG 0 -#if SCAN_KT_DEBUG - std::vector debug11v(status_flags_size); - __queue.memcpy(debug11v.data(), status_flags, status_flags_size * sizeof(uint32_t)); - - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "flag_before " << i << " " << debug11v[i] << std::endl; - - uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug4 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue); -#endif + std::uint32_t elems_in_tile = wgsize*__elems_per_workitem; auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); auto subgroup = item.get_sub_group(); - // Obtain unique ID for this work-group that will be used in decoupled lookback if (group.leader()) { - sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_size-1]); + sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + idx_atomic(status_flags[status_flags_size - 1]); tile_id_lacc[0] = idx_atomic.fetch_add(1); } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; -#if SCAN_KT_DEBUG - debug5[group.get_group_linear_id()] = tile_id; -#endif auto current_offset = (tile_id*elems_in_tile); auto next_offset = ((tile_id+1)*elems_in_tile); @@ -219,40 +181,22 @@ 
single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto in_end = __in_rng.begin() + next_offset; auto out_begin = __out_rng.begin() + current_offset; - -#if SCAN_KT_DEBUG - debug3[tile_id] = current_offset; - debug4[tile_id] = next_offset; -#endif - if (current_offset >= n) return; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); -#if SCAN_KT_DEBUG - debug1[tile_id] = local_sum; -#endif - _Type prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) { - __scan_status_flag<_Type> flag(status_flags, tile_id); + __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, status_flags_size); if (group.leader()) flag.set_partial(local_sum); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - //sycl::reduce_over_group(item.get_subgroup()) - - - prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags); - //if (group.leader()) - // prev_sum = flag.lookback(tile_id, status_flags); -#if SCAN_KT_DEBUG - debug2[tile_id] = prev_sum; -#endif + prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags, tile_sums); if (group.leader()) flag.set_full(prev_sum + local_sum); @@ -265,40 +209,8 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou event.wait(); -#if SCAN_KT_DEBUG - std::vector debug1v(status_flags_size); - std::vector debug2v(status_flags_size); - std::vector debug3v(status_flags_size); - std::vector debug4v(status_flags_size); - std::vector debug5v(status_flags_size); - std::vector debug6v(status_flags_size); - __queue.memcpy(debug1v.data(), debug1, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug2v.data(), debug2, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug3v.data(), debug3, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug4v.data(), debug4, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug5v.data(), debug5, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug6v.data(), status_flags, status_flags_size * sizeof(uint32_t)); - - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "tile " << i << " " << debug5v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "local_sum " << i << " " << debug1v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - { - auto val = (debug6v[i] & __scan_status_flag<_Type>::value_mask); - int a = val / elems_in_tile; - int b = val % elems_in_tile; - std::cout << "flags " << i << " " << std::bitset<32>(debug6v[i]) << " (" << val<< " = " << a << "/" << elems_in_tile << "+" << b <<")" << std::endl; - } - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "lookback " << i << " " << debug2v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "offset " << i << " " << debug3v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "end " << i << " " << debug4v[i] << std::endl; -#endif - sycl::free(status_flags, __queue); + sycl::free(tile_sums, __queue); } // The generic structure for configuring a kernel @@ -316,16 +228,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera { auto __n = __in_end - __in_begin; -#if SCAN_KT_DEBUG - using _Type = std::remove_pointer_t<_InIterator>; - std::vector<_Type> in_debug(__n); - __queue.memcpy(in_debug.data(), __in_begin, __n * sizeof(_Type)); - - for 
(int i = 0; i < __n; ++i) - std::cout << "input_before " << i << " " << in_debug[i] << std::endl; -#endif - - //printf("KERNEL_TEMPLATE %lu\n", __n); auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); @@ -334,16 +236,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera auto __buf2 = __keep2(__out_begin, __out_begin + __n); single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); - -#if SCAN_KT_DEBUG - std::vector<_Type> in_debug2(__n); - __queue.memcpy(in_debug2.data(), __in_begin, __n * sizeof(_Type)); - - for (int i = 0; i < __n; ++i) - std::cout << "input_after " << i << " " << in_debug2[i] << std::endl; -#endif - - //printf("KERNEL_TEMPLATE DONE %lu\n", __n); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index de5ecafc25b..38a82b026d7 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -22,23 +22,23 @@ int main() { bool all_passed = true; + sycl::queue q; for (int logn : {4, 8, 11, 16, 19, 21}) { - std::cout << "Testing 2^" << logn << '\n'; + std::cout << "Testing 2^" << logn << std::endl; int n = 1 << logn; std::vector v(n, 1); - sycl::queue q; int* in_ptr = sycl::malloc_device(n, q); int* out_ptr = sycl::malloc_device(n, q); - - q.copy(v.data(), in_ptr, n); + q.copy(v.data(), in_ptr, n).wait(); using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); std::vector tmp(n, 0); q.copy(out_ptr, tmp.data(), n); + q.wait(); std::inclusive_scan(v.begin(), v.end(), v.begin()); @@ -53,11 +53,13 @@ main() } if (passed) - std::cout << "passed" << std::endl; + std::cout << " passed" << std::endl; else - std::cout << "failed" << std::endl; + std::cout << " failed" << std::endl; all_passed &= passed; + sycl::free(in_ptr, q); + sycl::free(out_ptr, q); } return !all_passed; From 8af98d60dc7c7ed9072a235235efc1b934e63a43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 7 Nov 2023 13:07:04 +0000 Subject: [PATCH 012/134] Refactored Scan_kt code (#16) * Improved Scan_kt: templated parameters, ballot, wgsize calculation. - Changed number of workgroups calculation from next power of two to next multiple of wgsize - Improved group_ballot by using the class member functions - Using kernel_param struct to determine wgsize and elems per work item. 
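Illustrative sketch (an editor's example, not part of the patch): the sizing change described above replaces the power-of-two padding of n with whole tiles of workgroup_size * elems_per_workitem elements, so the work-group count becomes a ceiling division. The ceil_div helper and the sample kernel_param values below are assumptions for illustration only.

    #include <cassert>
    #include <cstddef>

    // One tile covers workgroup_size * elems_per_workitem input elements.
    constexpr std::size_t ceil_div(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

    int main()
    {
        constexpr std::size_t wgsize = 128;           // example kernel_param workgroup size
        constexpr std::size_t elems_per_workitem = 8; // example kernel_param elements per work-item
        constexpr std::size_t elems_in_tile = wgsize * elems_per_workitem;

        std::size_t n = 1000;                             // input size, no longer rounded to a power of two
        std::size_t num_wgs = ceil_div(n, elems_in_tile); // next multiple of a tile
        std::size_t num_workitems = num_wgs * wgsize;     // global size handed to the nd_range

        assert(num_wgs * elems_in_tile >= n); // tile padding, not power-of-two rounding, absorbs non-uniform n
        return 0;
    }
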
--- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 34 +++++++------------ .../numeric/numeric.ops/scan_kt.pass.cpp | 2 +- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index f52e4ef532f..e7a0ca345e6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -76,12 +76,8 @@ struct __scan_status_flag } while (!sycl::all_of_group(subgroup, flag != NOT_READY)); // Loop till all ready bool is_full = flag == FULL_MASK; - auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); - ::std::uint32_t is_full_ballot_bits{}; - is_full_ballot.extract_bits(is_full_ballot_bits); - - auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); + auto lowest_item_with_full = is_full_ballot.find_low(); // The partial scan results and the full scan sum values are in contiguous memory. // Each section of the memory is of size num_elements. @@ -97,7 +93,7 @@ struct __scan_status_flag // If we found a full value, we can stop looking at previous tiles. Otherwise, // keep going through tiles until we either find a full tile or we've completely // recomputed the prefix using partial values - if (is_full_ballot_bits) + if (is_full_ballot.any()) break; } @@ -121,18 +117,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); const ::std::size_t n = __in_rng.size(); -#ifdef _ONEDPL_SCAN_ITER_SIZE - constexpr ::std::size_t __elems_per_workitem = _ONEDPL_SCAN_ITER_SIZE; -#else - constexpr ::std::size_t __elems_per_workitem = 8; -#endif - // Next power of 2 greater than or equal to __n - auto __n_uniform = n; - if ((__n_uniform & (__n_uniform - 1)) != 0) - __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(n) << 1; - std::size_t num_workitems = __n_uniform / __elems_per_workitem; - std::size_t wgsize = num_workitems > 256 ? 
256 : num_workitems; - std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_workitems, wgsize); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; constexpr int status_flag_padding = SUBGROUP_SIZE; std::uint32_t status_flags_size = num_wgs + status_flag_padding + 1; @@ -151,8 +143,6 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); - std::uint32_t elems_in_tile = wgsize*__elems_per_workitem; - auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); hdl.depends_on(fill_event); @@ -214,10 +204,10 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } // The generic structure for configuring a kernel -template +template struct kernel_param { - static constexpr std::uint16_t data_per_workitem = DataPerWorkItem; + static constexpr std::uint16_t elems_per_workitem = ElemsPerWorkItem; static constexpr std::uint16_t workgroup_size = WorkGroupSize; using kernel_name = KernelName; }; diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index 38a82b026d7..b3407581f37 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -33,7 +33,7 @@ main() int* out_ptr = sycl::malloc_device(n, q); q.copy(v.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; + using KernelParams = oneapi::dpl::experimental::kt::kernel_param<8, 128, class ScanKernel>; oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); std::vector tmp(n, 0); From 3de596ea2ba5a889b841a0ab96c8aa6055ff6fba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Wed, 8 Nov 2023 16:47:52 +0000 Subject: [PATCH 013/134] Scan_kt: Single memory allocation for device_memory (#17) and async free of the device memory (#18) * Single memory allocation for device_memory * async free of device memory --------- Co-authored-by: Joe Todd Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index e7a0ca345e6..5773b80e1be 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -127,13 +127,24 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou ::std::size_t num_workitems = num_wgs * wgsize; constexpr int status_flag_padding = SUBGROUP_SIZE; - std::uint32_t status_flags_size = num_wgs + status_flag_padding + 1; - std::uint32_t tile_sums_size = num_wgs + status_flag_padding; + std::size_t status_flags_elems = num_wgs + status_flag_padding + 1; + std::size_t status_flags_size = status_flags_elems * sizeof(std::uint32_t); - uint32_t* status_flags = 
sycl::malloc_device(status_flags_size, __queue); - // First status_flags_size elements: partial scanned values (using _BinaryOp) of each workgroup - // Second status_flags_size elements: full scanned values, i.e. sum of the previous and current workgroup's partial sums - _Type* tile_sums = sycl::malloc_device<_Type>(tile_sums_size * 2, __queue); + std::size_t tile_sums_elems = num_wgs + status_flag_padding; + std::size_t tile_sums_size = status_flags_elems * sizeof(_Type); + + std::size_t extra_mem_for_aligment = alignof(_Type) - (status_flags_size % alignof(_Type)); + // status_flags_size for the status_flags + // extra_mem_for_aligment of the datatype _Type + // First tile_sums_size partial scanned values + // Second tile_sums_size full scanned values (current partial plus all previous workgroups partial) + char* mem_pool = + sycl::malloc_device(status_flags_size + extra_mem_for_aligment + 2 * tile_sums_size, __queue); + + std::size_t tile_sums_offset = status_flags_size + extra_mem_for_aligment; + + std::uint32_t* status_flags = reinterpret_cast(mem_pool); + _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); auto fill_event = __queue.submit([&](sycl::handler& hdl) { hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { @@ -180,7 +191,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) { - __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, status_flags_size); + __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, tile_sums_elems); if (group.leader()) flag.set_partial(local_sum); @@ -197,10 +208,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); - event.wait(); + auto free_event = __queue.submit( + [=](sycl::handler& hdl) + { + hdl.depends_on(event); + hdl.host_task([=](){ sycl::free(mem_pool, __queue); }); + }); - sycl::free(status_flags, __queue); - sycl::free(tile_sums, __queue); + event.wait(); } // The generic structure for configuring a kernel From 2d6ff78f3d2facb0b384f4f71ac7465198b1e6b4 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 8 Nov 2023 16:07:30 +0000 Subject: [PATCH 014/134] Replace sycl::range with sycl::nd_range for fill --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 5773b80e1be..53d925a14c8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -146,13 +146,22 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou std::uint32_t* status_flags = reinterpret_cast(mem_pool); _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); - auto fill_event = __queue.submit([&](sycl::handler& hdl) { - hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { - int id = item.get_linear_id(); - status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::OUT_OF_BOUNDS - : __scan_status_flag<_Type>::NOT_READY; + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_size, wgsize); + + auto fill_event = __queue.submit( + [&](sycl::handler& hdl) + { + hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + [=](const sycl::nd_item<1>& item) + { + int id = item.get_global_linear_id(); + if (id < status_flags_size) + status_flags[id] = + id < status_flag_padding + ? __scan_status_flag<_Type>::OUT_OF_BOUNDS + : __scan_status_flag<_Type>::NOT_READY; + }); }); - }); auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); From 124a912c6852f89a5e3e74041cd0a60e6351e4a2 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 8 Nov 2023 19:14:32 +0000 Subject: [PATCH 015/134] Bug fix --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 53d925a14c8..038018a13ac 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -146,7 +146,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou std::uint32_t* status_flags = reinterpret_cast(mem_pool); _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); - ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_size, wgsize); + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_elems, wgsize); auto fill_event = __queue.submit( [&](sycl::handler& hdl) @@ -155,7 +155,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou [=](const sycl::nd_item<1>& item) { int id = item.get_global_linear_id(); - if (id < status_flags_size) + if (id < status_flags_elems) status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::OUT_OF_BOUNDS @@ -177,7 +177,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou { sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> - idx_atomic(status_flags[status_flags_size - 1]); + idx_atomic(status_flags[status_flags_elems - 1]); tile_id_lacc[0] = idx_atomic.fetch_add(1); } sycl::group_barrier(group); From d716bbd21a451b0b1424de131f00eb48b1a7a0e8 Mon Sep 17 00:00:00 2001 From: Aidan Date: Wed, 8 Nov 2023 13:21:32 +0000 Subject: [PATCH 016/134] Global to local then perform op --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 038018a13ac..846208007da 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -165,11 +165,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); + auto local_id = item.get_local_id(0); + auto stride = item.get_local_range(0); auto subgroup = item.get_sub_group(); // Obtain unique ID for this work-group that will be used in decoupled lookback @@ -183,16 +186,33 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; - auto current_offset = (tile_id*elems_in_tile); - auto next_offset = ((tile_id+1)*elems_in_tile); - if (next_offset > n) - next_offset = n; - auto in_begin = __in_rng.begin() + current_offset; - auto in_end = __in_rng.begin() + next_offset; - auto out_begin = __out_rng.begin() + current_offset; - - if (current_offset >= n) + // Global load into local + auto wg_current_offset = (tile_id*elems_in_tile); + auto wg_next_offset = ((tile_id+1)*elems_in_tile); + size_t wg_local_memory_size = elems_in_tile; + if (wg_current_offset >= n) return; + if (wg_next_offset >= n) { + wg_local_memory_size = n - wg_current_offset; + wg_next_offset = n; // Not needed + } + + // TODO: vectorize loads, where possible + if (wg_next_offset <= n) { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; + } else { + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { + if (wg_current_offset + stride * i < n) + tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + stride * i]; + } + } + sycl::group_barrier(group); + + auto in_begin = tile_vals.get_pointer(); + auto in_end = in_begin + wg_local_memory_size; + auto out_begin = __out_rng.begin() + wg_current_offset; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); _Type prev_sum = 0; From 6a474c7dc2aeee9082d8183db8fefbc8355a6bd0 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 10 Nov 2023 13:51:35 +0000 Subject: [PATCH 
017/134] Update based on feedback --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 846208007da..1bd10595413 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -192,20 +192,17 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou size_t wg_local_memory_size = elems_in_tile; if (wg_current_offset >= n) return; - if (wg_next_offset >= n) { + if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; - wg_next_offset = n; // Not needed - } - // TODO: vectorize loads, where possible if (wg_next_offset <= n) { _ONEDPL_PRAGMA_UNROLL for (std::uint32_t i = 0; i < elems_per_workitem; ++i) tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; } else { for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { - if (wg_current_offset + stride * i < n) - tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + stride * i]; + if (wg_current_offset + local_id + stride * i < n) + tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; } } sycl::group_barrier(group); From ba7be34eb82634cb9c81757050ad51b30210bec4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 21 Nov 2023 11:48:48 +0000 Subject: [PATCH 018/134] Refactored cooperative_loopback and memory implementation (#24) * Refactored cooperative_loopback and memory implementation detail * renamed load_counter to fetch_add_counter * Removed dynamic tile counter from the scan memory struct * scratch memory Reordering * Fixed wrong values returned in LoopbackScanMemory.get_value * Improved Class and variable naming --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 334 +++++++++++++----- 1 file changed, 253 insertions(+), 81 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 1bd10595413..314ace11410 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -16,51 +16,244 @@ #ifndef _ONEDPL_parallel_backend_sycl_scan_H #define _ONEDPL_parallel_backend_sycl_scan_H +#include +#include + namespace oneapi::dpl::experimental::kt { inline namespace igpu { -constexpr size_t SUBGROUP_SIZE = 32; +constexpr ::std::size_t SUBGROUP_SIZE = 32; + +template typename LoopbackScanMemory, typename TileId> +struct ScanMemoryManager +{ + using _TileIdT = typename TileId::_TileIdT; + using _FlagT = typename LoopbackScanMemory::_FlagT; + + ScanMemoryManager(sycl::queue q) : q{q} {}; + + ::std::uint8_t* + scan_memory_ptr() noexcept + { + return scan_memory_begin; + }; + + _TileIdT* + tile_id_ptr() noexcept + { + return tile_id_begin; + }; + + void + allocate(::std::size_t num_wgs) + { + ::std::size_t scan_memory_size = LoopbackScanMemory::get_memory_size(num_wgs); + constexpr ::std::size_t padded_tileid_size = TileId::get_padded_memory_size(); + constexpr ::std::size_t tileid_size = TileId::get_memory_size(); + + auto mem_size_bytes = scan_memory_size + padded_tileid_size; + + scratch = sycl::malloc_device<::std::uint8_t>(mem_size_bytes, q); + + scan_memory_begin = scratch; + + void* base_tileid_ptr = 
reinterpret_cast(scan_memory_begin + scan_memory_size); + size_t remainder = mem_size_bytes - scan_memory_size; + + tile_id_begin = reinterpret_cast<_TileIdT*>( + ::std::align(::std::alignment_of_v<_TileIdT>, tileid_size, base_tileid_ptr, remainder)); + } + + sycl::event + async_free(sycl::event dependency) + { + return q.submit( + [e = dependency, ptr = scratch, q_ = q](sycl::handler& hdl) + { + hdl.depends_on(e); + hdl.host_task([=]() { sycl::free(ptr, q_); }); + }); + } + + private: + ::std::uint8_t* scratch = nullptr; + ::std::uint8_t* scan_memory_begin = nullptr; + _TileIdT* tile_id_begin = nullptr; + + sycl::queue q; +}; -template -struct __scan_status_flag +template +struct LoopbackScanMemory { - using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::acq_rel, sycl::memory_scope::device, - sycl::access::address_space::global_space>; - static constexpr std::uint32_t NOT_READY = 0; - static constexpr std::uint32_t PARTIAL_MASK = 1; - static constexpr std::uint32_t FULL_MASK = 2; - static constexpr std::uint32_t OUT_OF_BOUNDS = 4; - - static constexpr int padding = SUBGROUP_SIZE; - - __scan_status_flag(const std::uint32_t tile_id, std::uint32_t* flags_begin, _T* tile_sums, - size_t num_elements) - : atomic_flag(*(flags_begin + tile_id + padding)), scanned_partial_value(tile_sums + tile_id + padding), - scanned_full_value(tile_sums + tile_id + padding + num_elements), num_elements{num_elements} + using _FlagT = ::std::uint32_t; + using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::acq_rel, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + + static constexpr _FlagT NOT_READY = 0; + static constexpr _FlagT PARTIAL_MASK = 1; + static constexpr _FlagT FULL_MASK = 2; + static constexpr _FlagT OUT_OF_BOUNDS = 4; + + static constexpr ::std::size_t padding = SUBGROUP_SIZE; + + LoopbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + : num_elements(get_num_elements(num_wgs)) { + // LoopbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] 
+ // Each section has num_wgs + padding elements + tile_values_begin = reinterpret_cast<_T*>(scan_memory_begin); + flags_begin = get_flags_begin(scan_memory_begin, num_wgs); } void - set_partial(_T val) + set_partial(::std::size_t tile_id, _T val) { - (*scanned_partial_value) = val; + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + tile_values_begin[tile_id + padding] = val; atomic_flag.store(PARTIAL_MASK); } void - set_full(_T val) + set_full(::std::size_t tile_id, _T val) { - (*scanned_full_value) = val; + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + tile_values_begin[tile_id + padding + num_elements] = val; atomic_flag.store(FULL_MASK); } - template + _FlagT + load_flag(::std::size_t tile_id) const + { + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + return atomic_flag.load(); + } + + _T + get_value(::std::size_t tile_id, _FlagT flag) const + { + ::std::size_t offset = tile_id + padding + num_elements * is_full(flag); + return tile_values_begin[offset]; + } + + static ::std::size_t + get_tile_values_bytes(::std::size_t num_elements) + { + return (2 * num_elements) * sizeof(_T); + } + + static ::std::size_t + get_flag_bytes(::std::size_t num_elements) + { + return num_elements * sizeof(_FlagT); + } + + static ::std::size_t + get_padded_flag_bytes(::std::size_t num_elements) + { + // sizeof(_FlagT) extra bytes for possible intenal alignment + return get_flag_bytes(num_elements) + sizeof(_FlagT); + } + + static _FlagT* + get_flags_begin(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + { + // Aligned flags + ::std::size_t num_elements = get_num_elements(num_wgs); + ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); + void* base_flags = reinterpret_cast(scan_memory_begin + tile_values_bytes); + auto remainder = get_padded_flag_bytes(num_elements); // scan_memory_bytes - tile_values_bytes + return reinterpret_cast<_FlagT*>( + ::std::align(::std::alignment_of_v<_FlagT>, get_flag_bytes(num_elements), base_flags, remainder)); + } + + static ::std::size_t + get_memory_size(::std::size_t num_wgs) + { + ::std::size_t num_elements = get_num_elements(num_wgs); + // sizeof(_T) extra bytes are not needed because LoopbackScanMemory is going at the beginning of the scratch + ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); + // Padding to provide room for aligment + ::std::size_t flag_bytes = get_padded_flag_bytes(num_elements); + + return tile_values_bytes + flag_bytes; + } + + static ::std::size_t + get_num_elements(::std::size_t num_wgs) + { + return padding + num_wgs; + } + + static bool + is_ready(_FlagT flag) + { + return flag != NOT_READY; + } + + static bool + is_full(_FlagT flag) + { + return flag == FULL_MASK; + } + + static bool + is_out_of_bounds(_FlagT flag) + { + return flag == OUT_OF_BOUNDS; + } + + private: + ::std::size_t num_elements; + _FlagT* flags_begin; + _T* tile_values_begin; +}; + +struct TileId +{ + using _TileIdT = ::std::uint32_t; + using _AtomicTileRefT = sycl::atomic_ref<_TileIdT, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + + TileId(_TileIdT* tileid_memory) : tile_counter(*(tileid_memory)) {} + + constexpr static ::std::size_t + get_padded_memory_size() + { + // extra sizeof(_TileIdT) for possible aligment issues + return sizeof(_TileIdT) + sizeof(_TileIdT); + } + + constexpr static ::std::size_t + get_memory_size() + { + // extra sizeof(_TileIdT) for possible aligment issues + return 
sizeof(_TileIdT); + } + + _TileIdT + fetch_inc() + { + return tile_counter.fetch_add(1); + } + + _AtomicTileRefT tile_counter; +}; + +struct cooperative_lookback +{ + + template typename LoopbackScanMemory> _T - cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin, - _T* tile_sums) + operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, LoopbackScanMemory<_T> memory) { + using FlagT = typename LoopbackScanMemory<_T>::_FlagT; + _T sum = 0; int offset = -1; int i = 0; @@ -68,24 +261,20 @@ struct __scan_status_flag for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { - _AtomicRefT tile_atomic(*(flags_begin + tile + padding - local_id)); - std::uint32_t flag; + FlagT flag; do { - flag = tile_atomic.load(); - } while (!sycl::all_of_group(subgroup, flag != NOT_READY)); // Loop till all ready + flag = memory.load_flag(tile - local_id); + } while (!sycl::all_of_group(subgroup, LoopbackScanMemory<_T>::is_ready(flag))); // Loop till all ready - bool is_full = flag == FULL_MASK; + bool is_full = LoopbackScanMemory<_T>::is_full(flag); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); auto lowest_item_with_full = is_full_ballot.find_low(); - // The partial scan results and the full scan sum values are in contiguous memory. - // Each section of the memory is of size num_elements. - // The partial sum for a tile is at [i] and the full sum is at [i + num_elements] - // is_full * num_elements allows to select between the two values without branching the code. - size_t contrib_offset = tile + padding - local_id + is_full * num_elements; - _T val = *(tile_sums + contrib_offset); - _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) ? val : _T{0}; + // TODO: Use identity_fn for out of bounds values + _T contribution = local_id <= lowest_item_with_full && !LoopbackScanMemory<_T>::is_out_of_bounds(flag) + ? 
memory.get_value(tile - local_id, flag) + : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) sum += sycl::reduce_over_group(subgroup, contribution, bin_op); @@ -100,12 +289,6 @@ struct __scan_status_flag return sum; } - - _AtomicRefT atomic_flag; - _T* scanned_partial_value; - _T* scanned_full_value; - - size_t num_elements; }; template @@ -113,6 +296,8 @@ void single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _TileIdT = TileId::_TileIdT; + using _FlagT = typename LoopbackScanMemory<_Type>::_FlagT; static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); @@ -122,31 +307,22 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; // Avoid non_uniform n by padding up to a multiple of wgsize - std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; - constexpr int status_flag_padding = SUBGROUP_SIZE; - std::size_t status_flags_elems = num_wgs + status_flag_padding + 1; - std::size_t status_flags_size = status_flags_elems * sizeof(std::uint32_t); - - std::size_t tile_sums_elems = num_wgs + status_flag_padding; - std::size_t tile_sums_size = status_flags_elems * sizeof(_Type); + ScanMemoryManager<_Type, LoopbackScanMemory, TileId> scratch(__queue); + scratch.allocate(num_wgs); - std::size_t extra_mem_for_aligment = alignof(_Type) - (status_flags_size % alignof(_Type)); - // status_flags_size for the status_flags - // extra_mem_for_aligment of the datatype _Type - // First tile_sums_size partial scanned values - // Second tile_sums_size full scanned values (current partial plus all previous workgroups partial) - char* mem_pool = - sycl::malloc_device(status_flags_size + extra_mem_for_aligment + 2 * tile_sums_size, __queue); + // Memory Structure: + // [Loopback Scan Memory, Tile Id Counter] + auto scan_memory_begin = scratch.scan_memory_ptr(); + auto status_flags_begin = LoopbackScanMemory<_Type>::get_flags_begin(scan_memory_begin, num_wgs); + auto tile_id_begin = scratch.tile_id_ptr(); - std::size_t tile_sums_offset = status_flags_size + extra_mem_for_aligment; - - std::uint32_t* status_flags = reinterpret_cast(mem_pool); - _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); - - ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_elems, wgsize); + ::std::size_t num_elements = LoopbackScanMemory<_Type>::get_num_elements(num_wgs); + // fill_num_wgs num_elements + 1 to also initialize tile_id_counter + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); auto fill_event = __queue.submit( [&](sycl::handler& hdl) @@ -155,14 +331,17 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou [=](const sycl::nd_item<1>& item) { int id = item.get_global_linear_id(); - if (id < status_flags_elems) - status_flags[id] = - id < status_flag_padding - ? __scan_status_flag<_Type>::OUT_OF_BOUNDS - : __scan_status_flag<_Type>::NOT_READY; + if (id < num_elements) + status_flags_begin[id] = + id < LoopbackScanMemory<_Type>::padding + ? 
LoopbackScanMemory<_Type>::OUT_OF_BOUNDS + : LoopbackScanMemory<_Type>::NOT_READY; + if (id == num_elements) + tile_id_begin[0] = 0; }); }); + auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); @@ -176,12 +355,10 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto subgroup = item.get_sub_group(); // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); if (group.leader()) { - sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space> - idx_atomic(status_flags[status_flags_elems - 1]); - tile_id_lacc[0] = idx_atomic.fetch_add(1); + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; @@ -207,7 +384,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } sycl::group_barrier(group); - auto in_begin = tile_vals.get_pointer(); + auto in_begin = tile_vals.template get_multi_ptr().get(); auto in_end = in_begin + wg_local_memory_size; auto out_begin = __out_rng.begin() + wg_current_offset; @@ -217,16 +394,16 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) { - __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, tile_sums_elems); + LoopbackScanMemory<_Type> scan_mem(scan_memory_begin, num_wgs); if (group.leader()) - flag.set_partial(local_sum); + scan_mem.set_partial(tile_id, local_sum); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags, tile_sums); + prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); if (group.leader()) - flag.set_full(prev_sum + local_sum); + scan_mem.set_full(tile_id, prev_sum + local_sum); } prev_sum = sycl::group_broadcast(group, prev_sum, 0); @@ -234,12 +411,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); - auto free_event = __queue.submit( - [=](sycl::handler& hdl) - { - hdl.depends_on(event); - hdl.host_task([=](){ sycl::free(mem_pool, __queue); }); - }); + scratch.async_free(event); event.wait(); } From 69cc2fadc20a5ac89d4e2c2e76ab85b55f7521fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Thu, 23 Nov 2023 14:11:27 +0000 Subject: [PATCH 019/134] [Scan_kt] Atomic64 flags + value implementation (#25) * Implemented atomic64 version of the scan_kt pass * Removed repeated offset calculation for tile id atomic flag * Loopback -> Lookback. Removed unused var. 
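Illustrative sketch (an editor's example, not part of the patch): the atomic64 path packs the 32-bit tile value into the low half of a 64-bit word and the status into the top two bits, so a single relaxed 64-bit atomic store publishes value and status together. The masks mirror the ones introduced in the diff below; the pack/unpack helper names are assumptions for illustration only.

    #include <cassert>
    #include <cstdint>

    using FlagT = std::uint64_t;

    constexpr FlagT PARTIAL_MASK  = FlagT{1} << 62;           // partial tile sum published
    constexpr FlagT FULL_MASK     = FlagT{1} << 63;           // full (inclusive) prefix published
    constexpr FlagT OUT_OF_BOUNDS = PARTIAL_MASK | FULL_MASK; // padding slots before tile 0
    constexpr FlagT VALUE_MASK    = (FlagT{1} << 32) - 1;     // low 32 bits carry the value

    constexpr FlagT pack(FlagT status, std::uint32_t value) { return status | value; }
    constexpr std::uint32_t unpack_value(FlagT flag) { return static_cast<std::uint32_t>(flag & VALUE_MASK); }
    constexpr bool is_full(FlagT flag) { return (flag & OUT_OF_BOUNDS) == FULL_MASK; }

    int main()
    {
        // Example from the patch comment: a full flag carrying the value 15.
        FlagT f = pack(FULL_MASK, 15u);
        assert(is_full(f) && unpack_value(f) == 15u);
        return 0;
    }
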
--- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 351 ++++++++++++------ 1 file changed, 243 insertions(+), 108 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 314ace11410..6dfe1bb6ef1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -26,11 +26,13 @@ inline namespace igpu { constexpr ::std::size_t SUBGROUP_SIZE = 32; -template typename LoopbackScanMemory, typename TileId> +template typename LookbackScanMemory, + typename TileId> struct ScanMemoryManager { using _TileIdT = typename TileId::_TileIdT; - using _FlagT = typename LoopbackScanMemory::_FlagT; + using _LookbackScanMemory = LookbackScanMemory; + using _FlagT = typename _LookbackScanMemory::_FlagT; ScanMemoryManager(sycl::queue q) : q{q} {}; @@ -49,7 +51,7 @@ struct ScanMemoryManager void allocate(::std::size_t num_wgs) { - ::std::size_t scan_memory_size = LoopbackScanMemory::get_memory_size(num_wgs); + ::std::size_t scan_memory_size = _LookbackScanMemory::get_memory_size(num_wgs); constexpr ::std::size_t padded_tileid_size = TileId::get_padded_memory_size(); constexpr ::std::size_t tileid_size = TileId::get_memory_size(); @@ -85,8 +87,11 @@ struct ScanMemoryManager sycl::queue q; }; +template +struct LookbackScanMemory; + template -struct LoopbackScanMemory +struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::false_type> { using _FlagT = ::std::uint32_t; using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::acq_rel, sycl::memory_scope::device, @@ -99,13 +104,12 @@ struct LoopbackScanMemory static constexpr ::std::size_t padding = SUBGROUP_SIZE; - LoopbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) - : num_elements(get_num_elements(num_wgs)) + // LookbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] + // Each section has num_wgs + padding elements + LookbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + : num_elements(get_num_elements(num_wgs)), tile_values_begin(reinterpret_cast<_T*>(scan_memory_begin)), + flags_begin(get_flags_begin(scan_memory_begin, num_wgs)) { - // LoopbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] 
- // Each section has num_wgs + padding elements - tile_values_begin = reinterpret_cast<_T*>(scan_memory_begin); - flags_begin = get_flags_begin(scan_memory_begin, num_wgs); } void @@ -126,19 +130,17 @@ struct LoopbackScanMemory atomic_flag.store(FULL_MASK); } - _FlagT - load_flag(::std::size_t tile_id) const + _AtomicFlagRefT + get_flag(::std::size_t tile_id) const { - _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); - - return atomic_flag.load(); + return _AtomicFlagRefT(*(flags_begin + tile_id + padding)); } _T get_value(::std::size_t tile_id, _FlagT flag) const { - ::std::size_t offset = tile_id + padding + num_elements * is_full(flag); - return tile_values_begin[offset]; + // full_value and partial_value are num_elements apart + return *(tile_values_begin + tile_id + padding + num_elements * is_full(flag)); } static ::std::size_t @@ -176,7 +178,7 @@ struct LoopbackScanMemory get_memory_size(::std::size_t num_wgs) { ::std::size_t num_elements = get_num_elements(num_wgs); - // sizeof(_T) extra bytes are not needed because LoopbackScanMemory is going at the beginning of the scratch + // sizeof(_T) extra bytes are not needed because LookbackScanMemory is going at the beginning of the scratch ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); // Padding to provide room for aligment ::std::size_t flag_bytes = get_padded_flag_bytes(num_elements); @@ -214,6 +216,110 @@ struct LoopbackScanMemory _T* tile_values_begin; }; +template +struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::true_type> +{ + using _FlagT = ::std::uint64_t; + using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + + // Each flag is divided in 2 32bit values + // 32..63 status bits + // 00..31 value bits + // Example: status = full scanned value, int value = 15: + // 1000 0000 0000 0000 0000 0000 0000 0000 | 0000 0000 0000 0000 0000 0000 0000 1111 + + // Status values: + // 00xxxx - not computed + // 01xxxx - partial + // 10xxxx - full + // 110000 - out of bounds + + static constexpr _FlagT NOT_READY = 0; + static constexpr _FlagT PARTIAL_MASK = 1l << (sizeof(_FlagT) * 8 - 2); + static constexpr _FlagT FULL_MASK = 1l << (sizeof(_FlagT) * 8 - 1); + static constexpr _FlagT OUT_OF_BOUNDS = PARTIAL_MASK | FULL_MASK; + + static constexpr _FlagT VALUE_MASK = (1l << sizeof(::std::uint32_t) * 8) - 1; // 32 bit mask to store value + + static constexpr ::std::size_t padding = SUBGROUP_SIZE; + + LookbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + : num_elements(get_num_elements(num_wgs)), flags_begin(get_flags_begin(scan_memory_begin, num_wgs)) + { + } + + void + set_partial(::std::size_t tile_id, _T val) + { + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + atomic_flag.store(PARTIAL_MASK | static_cast<::std::uint32_t>(val)); + } + + void + set_full(::std::size_t tile_id, _T val) + { + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + atomic_flag.store(FULL_MASK | static_cast<::std::uint32_t>(val)); + } + + _AtomicFlagRefT + get_flag(::std::size_t tile_id) const + { + return _AtomicFlagRefT(*(flags_begin + tile_id + padding)); + } + + _T + get_value(::std::size_t, _FlagT flag) const + { + return static_cast<::std::uint32_t>(flag & VALUE_MASK); + } + + static _FlagT* + get_flags_begin(::std::uint8_t* scan_memory_begin, ::std::size_t) + { + return reinterpret_cast<_FlagT*>(scan_memory_begin); + } + + static ::std::size_t + 
get_memory_size(::std::size_t num_wgs) + { + ::std::size_t num_elements = get_num_elements(num_wgs); + return num_elements * sizeof(_FlagT); + } + + static ::std::size_t + get_num_elements(::std::size_t num_wgs) + { + return padding + num_wgs; + } + + static bool + is_ready(_FlagT flag) + { + // flag & OUT_OF_BOUNDS != NOT_READY means it has either partial or full value, or is out of bounds + return (flag & OUT_OF_BOUNDS) != NOT_READY; + } + + static bool + is_full(_FlagT flag) + { + return (flag & OUT_OF_BOUNDS) == FULL_MASK; + } + + static bool + is_out_of_bounds(_FlagT flag) + { + return (flag & OUT_OF_BOUNDS) == OUT_OF_BOUNDS; + } + + private: + ::std::size_t num_elements; + _FlagT* flags_begin; +}; + struct TileId { using _TileIdT = ::std::uint32_t; @@ -248,11 +354,14 @@ struct TileId struct cooperative_lookback { - template typename LoopbackScanMemory> + template typename LookbackScanMemory, typename UseAtomic64> _T - operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, LoopbackScanMemory<_T> memory) + operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, + LookbackScanMemory<_T, UseAtomic64> memory) { - using FlagT = typename LoopbackScanMemory<_T>::_FlagT; + using _LookbackScanMemory = LookbackScanMemory<_T, UseAtomic64>; + using FlagT = typename _LookbackScanMemory::_FlagT; _T sum = 0; int offset = -1; @@ -261,18 +370,19 @@ struct cooperative_lookback for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { + auto atomic_flag = memory.get_flag(tile - local_id); FlagT flag; do { - flag = memory.load_flag(tile - local_id); - } while (!sycl::all_of_group(subgroup, LoopbackScanMemory<_T>::is_ready(flag))); // Loop till all ready + flag = atomic_flag.load(); + } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag))); // Loop till all ready - bool is_full = LoopbackScanMemory<_T>::is_full(flag); + bool is_full = _LookbackScanMemory::is_full(flag); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); auto lowest_item_with_full = is_full_ballot.find_low(); // TODO: Use identity_fn for out of bounds values - _T contribution = local_id <= lowest_item_with_full && !LoopbackScanMemory<_T>::is_out_of_bounds(flag) + _T contribution = local_id <= lowest_item_with_full && !_LookbackScanMemory::is_out_of_bounds(flag) ? 
memory.get_value(tile - local_id, flag) : _T{0}; @@ -291,124 +401,131 @@ struct cooperative_lookback } }; -template +template void single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _TileIdT = TileId::_TileIdT; - using _FlagT = typename LoopbackScanMemory<_Type>::_FlagT; + using _LookbackScanMemory = LookbackScanMemory<_Type, _UseAtomic64>; + using _FlagT = typename _LookbackScanMemory::_FlagT; - static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); + static_assert(std::is_same_v<_Inclusive, ::std::true_type>, "Single-pass scan only available for inclusive scan"); const ::std::size_t n = __in_rng.size(); constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; - ScanMemoryManager<_Type, LoopbackScanMemory, TileId> scratch(__queue); + ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); scratch.allocate(num_wgs); // Memory Structure: - // [Loopback Scan Memory, Tile Id Counter] + // [Lookback Scan Memory, Tile Id Counter] auto scan_memory_begin = scratch.scan_memory_ptr(); - auto status_flags_begin = LoopbackScanMemory<_Type>::get_flags_begin(scan_memory_begin, num_wgs); + auto status_flags_begin = _LookbackScanMemory::get_flags_begin(scan_memory_begin, num_wgs); auto tile_id_begin = scratch.tile_id_ptr(); - ::std::size_t num_elements = LoopbackScanMemory<_Type>::get_num_elements(num_wgs); + ::std::size_t num_elements = _LookbackScanMemory::get_num_elements(num_wgs); // fill_num_wgs num_elements + 1 to also initialize tile_id_counter ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); auto fill_event = __queue.submit( [&](sycl::handler& hdl) { - hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - [=](const sycl::nd_item<1>& item) - { - int id = item.get_global_linear_id(); - if (id < num_elements) - status_flags_begin[id] = - id < LoopbackScanMemory<_Type>::padding - ? LoopbackScanMemory<_Type>::OUT_OF_BOUNDS - : LoopbackScanMemory<_Type>::NOT_READY; - if (id == num_elements) - tile_id_begin[0] = 0; - }); + hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + [=](const sycl::nd_item<1>& item) + { + int id = item.get_global_linear_id(); + if (id < num_elements) + status_flags_begin[id] = id < _LookbackScanMemory::padding + ? 
_LookbackScanMemory::OUT_OF_BOUNDS + : _LookbackScanMemory::NOT_READY; + if (id == num_elements) + tile_id_begin[0] = 0; + }); }); - auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto local_id = item.get_local_id(0); - auto stride = item.get_local_range(0); - auto subgroup = item.get_sub_group(); - - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - std::uint32_t tile_id = tile_id_lacc[0]; - - // Global load into local - auto wg_current_offset = (tile_id*elems_in_tile); - auto wg_next_offset = ((tile_id+1)*elems_in_tile); - size_t wg_local_memory_size = elems_in_tile; - if (wg_current_offset >= n) - return; - if (wg_next_offset > n) - wg_local_memory_size = n - wg_current_offset; - - if (wg_next_offset <= n) { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; - } else { - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { - if (wg_current_offset + local_id + stride * i < n) - tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; - } - } - sycl::group_barrier(group); - - auto in_begin = tile_vals.template get_multi_ptr().get(); - auto in_end = in_begin + wg_local_memory_size; - auto out_begin = __out_rng.begin() + wg_current_offset; - - auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); - _Type prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (subgroup.get_group_id() == 0) - { - LoopbackScanMemory<_Type> scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); - } - - prev_sum = sycl::group_broadcast(group, prev_sum, 0); - sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); - }); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] + { + auto group = item.get_group(); + auto local_id = item.get_local_id(0); + auto stride = item.get_local_range(0); + auto subgroup = item.get_sub_group(); + + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + std::uint32_t tile_id = tile_id_lacc[0]; + + // Global load into local + auto wg_current_offset = (tile_id * elems_in_tile); + auto wg_next_offset = ((tile_id + 1) * elems_in_tile); + size_t wg_local_memory_size = elems_in_tile; + if (wg_current_offset >= n) + return; + if (wg_next_offset > n) + 
wg_local_memory_size = n - wg_current_offset; + + if (wg_next_offset <= n) + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + tile_vals[local_id + stride * i] = + __in_rng[wg_current_offset + local_id + stride * i]; + } + else + { + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + { + if (wg_current_offset + local_id + stride * i < n) + tile_vals[local_id + stride * i] = + __in_rng[wg_current_offset + local_id + stride * i]; + } + } + sycl::group_barrier(group); + + auto in_begin = tile_vals.template get_multi_ptr().get(); + auto in_end = in_begin + wg_local_memory_size; + auto out_begin = __out_rng.begin() + wg_current_offset; + + auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); + _Type prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (subgroup.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + local_sum); + } + + prev_sum = sycl::group_broadcast(group, prev_sum, 0); + sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); + }); }); scratch.async_free(event); @@ -438,7 +555,25 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + // Avoid aspect query overhead for sizeof(Types) > 32 bits + if constexpr (sizeof(typename std::iterator_traits<_InIterator>::value_type) <= sizeof(std::uint32_t)) + { + if (__queue.get_device().has(sycl::aspect::atomic64)) + { + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + } + else + { + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + } + } + else + { + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + } } } // inline namespace igpu From b5851cea3b280d5a1f0739b25a06e995d7ddce27 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:39:49 +0000 Subject: [PATCH 020/134] constexpr, types and remove an unneeded check --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 6dfe1bb6ef1..266d4b18657 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -364,8 +364,7 @@ struct cooperative_lookback using FlagT = typename _LookbackScanMemory::_FlagT; _T sum = 0; - int offset = -1; - int i = 0; + constexpr int offset = -1; int local_id = subgroup.get_local_id(); for (int tile = 
static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) @@ -418,7 +417,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; // Avoid non_uniform n by padding up to a multiple of wgsize - ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; @@ -461,8 +460,8 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); - auto local_id = item.get_local_id(0); - auto stride = item.get_local_range(0); + ::std::uint32_t local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; auto subgroup = item.get_sub_group(); // Obtain unique ID for this work-group that will be used in decoupled lookback @@ -477,9 +476,8 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); auto wg_next_offset = ((tile_id + 1) * elems_in_tile); - size_t wg_local_memory_size = elems_in_tile; - if (wg_current_offset >= n) - return; + auto wg_local_memory_size = elems_in_tile; + if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; @@ -502,7 +500,6 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou sycl::group_barrier(group); auto in_begin = tile_vals.template get_multi_ptr().get(); - auto in_end = in_begin + wg_local_memory_size; auto out_begin = __out_rng.begin() + wg_current_offset; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); From c9736c197d85d55703754a4e1791ec03545524e5 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:40:04 +0000 Subject: [PATCH 021/134] Correct static_cast ? 
--- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 266d4b18657..0655b60deb1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -274,7 +274,7 @@ struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::true_type> _T get_value(::std::size_t, _FlagT flag) const { - return static_cast<::std::uint32_t>(flag & VALUE_MASK); + return static_cast<_T>(flag & VALUE_MASK); } static _FlagT* From 0e450f79ffdbbb47c6aa71052d394e0fc57a73e8 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:40:19 +0000 Subject: [PATCH 022/134] Defer group comms in lookback --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 0655b60deb1..ce186b4ffa4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -386,8 +386,7 @@ struct cooperative_lookback : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) - sum += sycl::reduce_over_group(subgroup, contribution, bin_op); - + sum = bin_op(sum, contribution); // If we found a full value, we can stop looking at previous tiles. Otherwise, // keep going through tiles until we either find a full tile or we've completely // recomputed the prefix using partial values @@ -395,6 +394,7 @@ struct cooperative_lookback break; } + sum = sycl::reduce_over_group(subgroup, sum, bin_op); return sum; } From 95b55528d6730b245290d686ef53c97554d5e34b Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:40:52 +0000 Subject: [PATCH 023/134] Disable dynamic tile ID by default TODO: we still allocate & initialize the memory for the counter --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index ce186b4ffa4..007186a2f9a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -400,8 +400,8 @@ struct cooperative_lookback } }; -template +template void single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { @@ -464,14 +464,23 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr ::std::uint32_t stride = wgsize; auto subgroup = item.get_sub_group(); - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - std::uint32_t tile_id = tile_id_lacc[0]; + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + 
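                        // Only the group leader bumps the shared tile counter; the
                        // barrier below then publishes the id to the whole work-group
                        // through local memory. Numbering tiles with an atomic counter
                        // orders them by the time their work-group actually starts,
                        // which is the usual way a decoupled-lookback scan avoids a
                        // tile waiting on a predecessor that has not been scheduled
                        // yet; the static path below effectively assumes work-groups
                        // are launched in linear-id order.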
tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); @@ -557,18 +566,18 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera { if (__queue.get_device().has(sycl::aspect::atomic64)) { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type>( + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::false_type>( __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } } From 3f30ec8e4eb0ee8fb9e9589a3ecb32cdda85370b Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:41:32 +0000 Subject: [PATCH 024/134] Reduce from register sums instead of local mem Also use #pragma unroll for now --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 007186a2f9a..e43cfee6aa6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -489,29 +489,36 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; - + _Type my_reducer{}; if (wg_next_offset <= n) { - _ONEDPL_PRAGMA_UNROLL + #pragma unroll for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - tile_vals[local_id + stride * i] = - __in_rng[wg_current_offset + local_id + stride * i]; + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } } else { + #pragma unroll for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { if (wg_current_offset + local_id + stride * i < n) - tile_vals[local_id + stride * i] = - __in_rng[wg_current_offset + local_id + stride * i]; + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } } } - sycl::group_barrier(group); + + auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); auto in_begin = tile_vals.template get_multi_ptr().get(); auto out_begin = __out_rng.begin() + wg_current_offset; - auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); _Type prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix From c147f053768c4d38fb71b89be1416f58919b1d3a Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:42:06 +0000 
Subject: [PATCH 025/134] Unrolled version of joint_inclusive_scan --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index e43cfee6aa6..68921c08c3c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -536,8 +536,23 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou scan_mem.set_full(tile_id, prev_sum + local_sum); } - prev_sum = sycl::group_broadcast(group, prev_sum, 0); - sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); + _Type carry = sycl::group_broadcast(group, prev_sum, 0); + #pragma unroll + for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type x; + if (i + local_id < wg_local_memory_size) + { + x = in_begin[i + local_id]; + } + _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); + if (i + local_id < wg_local_memory_size) + { + out_begin[i + local_id] = out; + } + carry = group_broadcast(group, out, stride - 1); + } }); }); From ab69568d21c5b6839cb10a7a69778519ff0ff430 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 15:25:30 +0000 Subject: [PATCH 026/134] Update include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Alberto Cabrera Pérez --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 68921c08c3c..dae5cd7a48e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -537,6 +537,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } _Type carry = sycl::group_broadcast(group, prev_sum, 0); + // TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL #pragma unroll for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) { From b992b847972a1f2fc83ace6443b280029bee1b20 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 15:27:43 +0000 Subject: [PATCH 027/134] Add TODO --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index dae5cd7a48e..a85d86aeb31 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -489,6 +489,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; + //TODO: assumes default ctor produces identity w.r.t. 
__binary_op _Type my_reducer{}; if (wg_next_offset <= n) { From 37726be95a53dc76791b0fbbcc02dc39b8acda9c Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Tue, 28 Nov 2023 15:55:38 +0000 Subject: [PATCH 028/134] Changing fill kernel for a memset --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index a85d86aeb31..c1e1d2c0cbd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -369,19 +369,20 @@ struct cooperative_lookback for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { - auto atomic_flag = memory.get_flag(tile - local_id); + auto atomic_flag = memory.get_flag(tile - local_id); // FlagT flag; do { flag = atomic_flag.load(); - } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag))); // Loop till all ready + } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag) || + (tile - local_id < 0))); // Loop till all ready bool is_full = _LookbackScanMemory::is_full(flag); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); auto lowest_item_with_full = is_full_ballot.find_low(); // TODO: Use identity_fn for out of bounds values - _T contribution = local_id <= lowest_item_with_full && !_LookbackScanMemory::is_out_of_bounds(flag) + _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) ? memory.get_value(tile - local_id, flag) : _T{0}; @@ -434,21 +435,23 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // fill_num_wgs num_elements + 1 to also initialize tile_id_counter ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); - auto fill_event = __queue.submit( - [&](sycl::handler& hdl) - { - hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - [=](const sycl::nd_item<1>& item) - { - int id = item.get_global_linear_id(); - if (id < num_elements) - status_flags_begin[id] = id < _LookbackScanMemory::padding - ? _LookbackScanMemory::OUT_OF_BOUNDS - : _LookbackScanMemory::NOT_READY; - if (id == num_elements) - tile_id_begin[0] = 0; - }); - }); + auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); + + // auto fill_event = __queue.submit( + // [&](sycl::handler& hdl) + // { + // hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + // [=](const sycl::nd_item<1>& item) + // { + // int id = item.get_global_linear_id(); + // if (id < num_elements) + // status_flags_begin[id] = id < _LookbackScanMemory::padding + // ? 
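        // Note on the memset above: it zero-fills the status flags and the tile-id
        // counter in one call, which is only equivalent to the old fill kernel if
        // NOT_READY is an all-zero bit pattern and the padding entries no longer
        // need a distinct OUT_OF_BOUNDS tag, consistent with the lookback spin now
        // also treating (tile - local_id < 0) as ready. The previous fill kernel is
        // kept commented out below for reference.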
_LookbackScanMemory::OUT_OF_BOUNDS + // : _LookbackScanMemory::NOT_READY; + // if (id == num_elements) + // tile_id_begin[0] = 0; + // }); + // }); auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); From d7c3c7860ffa0de52b9fd92941fc78be9be7955e Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Wed, 29 Nov 2023 15:19:30 +0000 Subject: [PATCH 029/134] Single wg implementation --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 358 ++++++++++++------ 1 file changed, 234 insertions(+), 124 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index c1e1d2c0cbd..345da745608 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -401,6 +401,89 @@ struct cooperative_lookback } }; +template +void +single_pass_scan_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + + static_assert(std::is_same_v<_Inclusive, ::std::true_type>, "Single-pass scan only available for inclusive scan"); + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + constexpr ::std::size_t num_workitems = wgsize; + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); + hdl.parallel_for( + sycl::nd_range<1>(num_workitems, wgsize), [= + ](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + ::std::uint32_t local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + auto subgroup = item.get_sub_group(); + + constexpr std::uint32_t tile_id = 0; + constexpr std::uint32_t wg_begin = 0; + constexpr std::uint32_t wg_end = elems_in_tile; + + std::uint32_t wg_local_memory_size = elems_in_tile; + + auto out_begin = __out_rng.begin(); + _Type carry = 0; + + // Global load into local + if (wg_end > n) + wg_local_memory_size = n; + + //TODO: assumes default ctor produces identity w.r.t. 
__binary_op + // _Type my_reducer{}; + if (wg_end <= n) + { +#pragma unroll + for (std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type in_val = __in_rng[i + local_id]; + // my_reducer = __binary_op(my_reducer, in_val); + _Type out = sycl::inclusive_scan_over_group(group, in_val, __binary_op, carry); + out_begin[i + local_id] = out; + carry = group_broadcast(group, out, stride - 1); + } + } + else + { +#pragma unroll + for (std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type in_val; + + if (i + local_id < n) + { + in_val = __in_rng[i + local_id]; + // my_reducer = __binary_op(my_reducer, in_val); + } + _Type out = sycl::inclusive_scan_over_group(group, in_val, __binary_op, carry); + if (i + local_id < n) + { + out_begin[i + local_id] = out; + } + carry = group_broadcast(group, out, stride - 1); + } + } + }); + }); + + event.wait(); +} + template void @@ -437,128 +520,111 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); - // auto fill_event = __queue.submit( - // [&](sycl::handler& hdl) - // { - // hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - // [=](const sycl::nd_item<1>& item) - // { - // int id = item.get_global_linear_id(); - // if (id < num_elements) - // status_flags_begin[id] = id < _LookbackScanMemory::padding - // ? _LookbackScanMemory::OUT_OF_BOUNDS - // : _LookbackScanMemory::NOT_READY; - // if (id == num_elements) - // tile_id_begin[0] = 0; - // }); - // }); - auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), - [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] - { - auto group = item.get_group(); - ::std::uint32_t local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - auto subgroup = item.get_sub_group(); - - std::uint32_t tile_id; - if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) - { - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - tile_id = tile_id_lacc[0]; - } - else - { - tile_id = group.get_group_linear_id(); - } - - - // Global load into local - auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_next_offset = ((tile_id + 1) * elems_in_tile); - auto wg_local_memory_size = elems_in_tile; - - if (wg_next_offset > n) - wg_local_memory_size = n - wg_current_offset; - //TODO: assumes default ctor produces identity w.r.t. 
__binary_op - _Type my_reducer{}; - if (wg_next_offset <= n) - { - #pragma unroll - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - { - _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; - my_reducer = __binary_op(my_reducer, in_val); - tile_vals[local_id + stride * i] = in_val; - } - } - else - { - #pragma unroll - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - { - if (wg_current_offset + local_id + stride * i < n) - { - _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; - my_reducer = __binary_op(my_reducer, in_val); - tile_vals[local_id + stride * i] = in_val; - } - } - } - - auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); - - auto in_begin = tile_vals.template get_multi_ptr().get(); - auto out_begin = __out_rng.begin() + wg_current_offset; - - _Type prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (subgroup.get_group_id() == 0) - { - _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); - } - - _Type carry = sycl::group_broadcast(group, prev_sum, 0); - // TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL - #pragma unroll - for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) - { - ::std::uint32_t i = stride * step; - _Type x; - if (i + local_id < wg_local_memory_size) - { - x = in_begin[i + local_id]; - } - _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); - if (i + local_id < wg_local_memory_size) - { - out_begin[i + local_id] = out; - } - carry = group_broadcast(group, out, stride - 1); - } - }); + hdl.parallel_for( + sycl::nd_range<1>(num_workitems, wgsize), [= + ](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + ::std::uint32_t local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + auto subgroup = item.get_sub_group(); + + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + + // Global load into local + auto wg_current_offset = (tile_id * elems_in_tile); + auto wg_next_offset = ((tile_id + 1) * elems_in_tile); + auto wg_local_memory_size = elems_in_tile; + + if (wg_next_offset > n) + wg_local_memory_size = n - wg_current_offset; + //TODO: assumes default ctor produces identity w.r.t. 
__binary_op + _Type my_reducer{}; + if (wg_next_offset <= n) + { +#pragma unroll + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } + } + else + { +#pragma unroll + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + { + if (wg_current_offset + local_id + stride * i < n) + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } + } + } + + auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); + + auto in_begin = tile_vals.template get_multi_ptr().get(); + auto out_begin = __out_rng.begin() + wg_current_offset; + + _Type prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (subgroup.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + local_sum); + } + + _Type carry = sycl::group_broadcast(group, prev_sum, 0); +// TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL +#pragma unroll + for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type x; + if (i + local_id < wg_local_memory_size) + { + x = in_begin[i + local_id]; + } + _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); + if (i + local_id < wg_local_memory_size) + { + out_begin[i + local_id] = out; + } + carry = group_broadcast(group, out, stride - 1); + } + }); }); scratch.async_free(event); @@ -575,9 +641,10 @@ struct kernel_param using kernel_name = KernelName; }; -template +template void -single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) +single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _BinaryOp __binary_op) { auto __n = __in_end - __in_begin; @@ -593,19 +660,62 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera { if (__queue.get_device().has(sycl::aspect::atomic64)) { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::false_type>( - __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::true_type, + /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), + __binary_op); } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( - __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::false_type, + /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), + __binary_op); } } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( - __queue, __buf1.all_view(), 
__buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::false_type, + /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), + __binary_op); + } +} + +template +void +single_pass_single_wg_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, + _OutIterator __out_begin, _BinaryOp __binary_op) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + // Avoid aspect query overhead for sizeof(Types) > 32 bits + single_pass_scan_impl_single_wg<_KernelParam, /* Inclusive */ std::true_type>(__queue, __buf1.all_view(), + __buf2.all_view(), __binary_op); +} + +template +void +single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _BinaryOp __binary_op) +{ + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + auto __n = __in_end - __in_begin; + + if (__n <= elems_in_tile) + { + single_pass_single_wg_inclusive_scan<_KernelParam, /* Inclusive */ std::true_type>( + __queue, __in_begin, __in_end, __out_begin, __binary_op); + } + else + { + single_pass_inclusive_scan<_KernelParam, /* Inclusive */ std::true_type>(__queue, __in_begin, __in_end, + __out_begin, __binary_op); } } From e42e68dfa0f3c722df13fce98f802222893df94b Mon Sep 17 00:00:00 2001 From: "aidan.belton" Date: Tue, 21 Nov 2023 10:38:29 +0000 Subject: [PATCH 030/134] Add phase 1 --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 123 ++++++++++++++++++ .../numeric/numeric.ops/copy_if_kt.pass.cpp | 77 +++++++++++ 2 files changed, 200 insertions(+) create mode 100644 test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 345da745608..c6da15a17b0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -719,6 +719,129 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } +// Load function to try and get some PVC perf w/ coalesced +template +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id) { + // if constexpr (std::is_arithmetic_v) { + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); + // } + return src[i + wg_stride * wg_group_id]; +} + +// Load with checking for the subgroup case +template +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id, size_t input_size) { + // if constexpr (std::is_arithmetic_v) { + // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * wg_group_id <= input_size) + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); + // return src[i + wg_stride * wg_group_id]; + // } + return src[i + wg_stride * wg_group_id]; +} + +template +void +single_pass_copy_if_impl(sycl::queue __queue, 
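// Overview (descriptive note): the copy_if kernel is organised as three phases
// per tile (work-group).
//   Phase 1: stream the tile through the predicate and compact the selected
//            values into local memory, using an exclusive scan over the group
//            to give every selected element a stable slot.
//   Phase 2: combine the per-tile selected counts with a decoupled-lookback
//            scan (the same machinery as the inclusive scan above) to obtain
//            each tile's starting offset in the output.
//   Phase 3: copy the compacted values from local memory to that offset, with
//            the last tile recording the total number of selected elements.
// This commit implements Phase 1; the following commits add Phases 2 and 3.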
_InRange&& __in_rng, _OutRange&& __out_rng, _UnaryPredicate pred) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_per_workitem*wgsize}, hdl); + auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_group_id = item.get_group(0); + auto wg_local_id = item.get_local_id(0); + auto sg = item.get_sub_group(); + + // Must be a better way to init atomics + l_wg_count[0] = 0; + sycl::group_barrier(group); + sycl::atomic_ref wg_count(l_wg_count[0]); + + constexpr size_t elems_per_workgroup = elems_per_workitem*wgsize; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if ((wg_group_id + 1) * elems_per_workgroup <= n) { + #pragma unroll + for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { + _Type val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id); + + size_t satisfies_pred = pred(val); + //size_t satisfies_pred = 0; + size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + + if (satisfies_pred) + wg_copy_if_values[count + wg_count.load()] = val; + + if (wg_local_id == (wgsize - 1)) + wg_count += (count + satisfies_pred); + sycl::group_barrier(group); + } + } + else { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls + #pragma unroll + for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { + size_t satisfies_pred = 0; + _Type val; // TODO: alloca + if (i + elems_per_workgroup * wg_group_id < n) { + val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id, n); + + satisfies_pred = pred(val); + } + size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + + if (satisfies_pred) + wg_copy_if_values[count + wg_count.load()] = val; + + if (wg_local_id == (wgsize - 1)) + wg_count += (count + satisfies_pred); + sycl::group_barrier(group); + } + } + // Check behaviour + if (group.leader()) { + __out_rng[wg_group_id] = wg_count.load(); + } + + // Phase 2: Global scan across wg_count + + // Phase 3: copy values to global memory + }); + }); + event.wait(); +} + +template +void +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _UnaryPredicate pred) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + single_pass_copy_if_impl<_KernelParam>(__queue, __buf1.all_view(), 
__buf2.all_view(), pred); +} + } // inline namespace igpu } // namespace oneapi::dpl::experimental::kt diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp new file mode 100644 index 00000000000..459449d933d --- /dev/null +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -0,0 +1,77 @@ +// -*- C++ -*- +//===-- scan.pass.cpp -----------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#include "support/test_config.h" + +#include _PSTL_TEST_HEADER(execution) +#include _PSTL_TEST_HEADER(numeric) + +int +main() +{ + bool all_passed = true; + sycl::queue q; + + for (int logn : {4, 8, 10, 12, 14}) + { + std::cout << "Testing 2^" << logn << std::endl; + int n = 1 << logn; + std::cout << "n:" << n << std::endl; + std::vector v(n, 0); + for (size_t i = 0; i < v.size(); ++i) + std::cout << v[i] << ","; + std::cout << std::endl; + + int* in_ptr = sycl::malloc_device(n, q); + int* out_ptr = sycl::malloc_device(n, q); + + constexpr int n_elements_per_workitem = 8; + + q.copy(v.data(), in_ptr, n).wait(); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param; + oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, [](int x) { return x == 0; }); + + std::vector tmp(n, 0); + q.copy(out_ptr, tmp.data(), n); + q.wait(); + + std::copy_if(v.begin(), v.end(), v.begin(), [](int x) { return x == 0; }); + + bool passed = true; + // for (size_t i = 0; i < n; ++i) + // { + // if (tmp[i] != v[i]) + // { + // passed = false; + // std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + // } + // } + + // if (passed) + // std::cout << " passed" << std::endl; + // else + // std::cout << " failed" << std::endl; + + for (size_t i = 0; i < n/(n_elements_per_workitem*128) + 1; ++i) { + std::cout << "i:" << i << " count:" << tmp[i] << std::endl; + } + + all_passed &= passed; + sycl::free(in_ptr, q); + sycl::free(out_ptr, q); + } + + return !all_passed; +} From 54c0ae9a66a049d62dcac83e891e225ab3e50e1a Mon Sep 17 00:00:00 2001 From: "aidan.belton" Date: Mon, 27 Nov 2023 13:26:38 +0000 Subject: [PATCH 031/134] Add phase 2 --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 112 +++++++++++++++--- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 6 +- 2 files changed, 99 insertions(+), 19 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index c6da15a17b0..5a9d3241574 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -739,11 +739,14 @@ inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, siz return src[i + wg_stride * wg_group_id]; } -template +template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _TileIdT = TileId::_TileIdT; + using _LookbackScanMemory = LookbackScanMemory<_Type, 
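// For copy_if the decoupled lookback scans element counts rather than data
// values: each tile publishes how many elements it selected, and the scanned
// prefix becomes that tile's starting offset into the output range.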
_UseAtomic64>; + using _FlagT = typename _LookbackScanMemory::_FlagT; const ::std::size_t n = __in_rng.size(); @@ -751,33 +754,87 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; // Avoid non_uniform n by padding up to a multiple of wgsize - std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; + ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); + scratch.allocate(num_wgs); + + // Memory Structure: + // [Lookback Scan Memory, Tile Id Counter] + auto scan_memory_begin = scratch.scan_memory_ptr(); + auto status_flags_begin = _LookbackScanMemory::get_flags_begin(scan_memory_begin, num_wgs); + auto tile_id_begin = scratch.tile_id_ptr(); + + ::std::size_t num_elements = _LookbackScanMemory::get_num_elements(num_wgs); + // fill_num_wgs num_elements + 1 to also initialize tile_id_counter + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); + + auto fill_event = __queue.submit( + [&](sycl::handler& hdl) + { + hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + [=](const sycl::nd_item<1>& item) + { + int id = item.get_global_linear_id(); + if (id < num_elements) + status_flags_begin[id] = id < _LookbackScanMemory::padding + ? _LookbackScanMemory::OUT_OF_BOUNDS + : _LookbackScanMemory::NOT_READY; + if (id == num_elements) + tile_id_begin[0] = 0; + }); + }); + auto event = __queue.submit([&](sycl::handler& hdl) { auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_per_workitem*wgsize}, hdl); auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); auto wg_group_id = item.get_group(0); auto wg_local_id = item.get_local_id(0); auto sg = item.get_sub_group(); + constexpr ::std::uint32_t stride = wgsize; + + // Init tile_id + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + + // Global load into local + auto wg_current_offset = (tile_id * elems_in_tile); + auto wg_next_offset = ((tile_id + 1) * elems_in_tile); + auto wg_local_memory_size = elems_in_tile; // Must be a better way to init atomics l_wg_count[0] = 0; sycl::group_barrier(group); sycl::atomic_ref wg_count(l_wg_count[0]); - constexpr size_t elems_per_workgroup = elems_per_workitem*wgsize; - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((wg_group_id + 1) * elems_per_workgroup 
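            // Fast path: the whole tile is in range, so no per-element bounds checks
            // are needed. On each pass over the group: evaluate the predicate,
            // exclusive-scan the 0/1 results to get a stable slot for every selected
            // value, append into local memory at (running count + slot), and have the
            // last work-item advance the running count for the next pass.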
<= n) { + if ((wg_group_id + 1) * elems_in_tile <= n) { #pragma unroll - for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { - _Type val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id); + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id); size_t satisfies_pred = pred(val); //size_t satisfies_pred = 0; @@ -794,12 +851,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - #pragma unroll - for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { + //#pragma unroll + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { size_t satisfies_pred = 0; _Type val; // TODO: alloca - if (i + elems_per_workgroup * wg_group_id < n) { - val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id, n); + if (i + elems_in_tile * wg_group_id < n) { + val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id, n); satisfies_pred = pred(val); } @@ -813,13 +870,36 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ sycl::group_barrier(group); } } + + // Phase 2: Global scan across wg_count + auto local_sum = wg_count.load(); + + auto in_begin = tile_vals.get_pointer(); + + _Type prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (sg.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_Type>(), scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + local_sum); + } + + _Type carry = sycl::group_broadcast(group, prev_sum, 0); + // Check behaviour if (group.leader()) { - __out_rng[wg_group_id] = wg_count.load(); + __out_rng[wg_group_id] = carry; } - // Phase 2: Global scan across wg_count - // Phase 3: copy values to global memory }); }); @@ -839,7 +919,7 @@ single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __i oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_copy_if_impl<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), pred); + single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), pred); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 459449d933d..917e88a7707 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -30,9 +30,9 @@ main() int n = 1 << logn; std::cout << "n:" << n << std::endl; std::vector v(n, 0); - for (size_t i = 0; i < v.size(); ++i) - std::cout << v[i] << ","; - std::cout << std::endl; + //for (size_t i = 0; i < v.size(); ++i) + // std::cout << v[i] << ","; + //std::cout << std::endl; int* in_ptr = sycl::malloc_device(n, q); int* out_ptr = sycl::malloc_device(n, q); From ba543ed4040c5faa4daca25cc77bb3679d15d4c6 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 28 Nov 2023 15:19:56 
+0000 Subject: [PATCH 032/134] Add phase 3 --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 67 ++++++++------- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 86 ++++++++++++------- 2 files changed, 87 insertions(+), 66 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 5a9d3241574..63a59476234 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -721,27 +721,27 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera // Load function to try and get some PVC perf w/ coalesced template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id) { +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id) { // if constexpr (std::is_arithmetic_v) { - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); // } - return src[i + wg_stride * wg_group_id]; + return src[i + wg_stride * tile_id]; } // Load with checking for the subgroup case template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id, size_t input_size) { +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id, size_t input_size) { // if constexpr (std::is_arithmetic_v) { - // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * wg_group_id <= input_size) - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); - // return src[i + wg_stride * wg_group_id]; + // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * tile_id <= input_size) + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); + // return src[i + wg_stride * tile_id]; // } - return src[i + wg_stride * wg_group_id]; + return src[i + wg_stride * tile_id]; } -template +template void -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _UnaryPredicate pred) +single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _TileIdT = TileId::_TileIdT; @@ -793,11 +793,11 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + hdl.depends_on(fill_event); - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); - auto wg_group_id = item.get_group(0); auto wg_local_id = item.get_local_id(0); auto sg = item.get_sub_group(); constexpr ::std::uint32_t stride = wgsize; @@ -822,7 +822,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_next_offset = ((tile_id + 1) * 
elems_in_tile); auto wg_local_memory_size = elems_in_tile; // Must be a better way to init atomics @@ -831,10 +830,10 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ sycl::atomic_ref wg_count(l_wg_count[0]); // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((wg_group_id + 1) * elems_in_tile <= n) { + if ((tile_id + 1) * elems_in_tile <= n) { #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id); + _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); size_t satisfies_pred = pred(val); //size_t satisfies_pred = 0; @@ -847,16 +846,15 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ wg_count += (count + satisfies_pred); sycl::group_barrier(group); } - } - else { + } else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - //#pragma unroll + #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { size_t satisfies_pred = 0; _Type val; // TODO: alloca - if (i + elems_in_tile * wg_group_id < n) { - val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id, n); + if (i + elems_in_tile * tile_id < n) { + val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id, n); satisfies_pred = pred(val); } @@ -873,10 +871,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Phase 2: Global scan across wg_count auto local_sum = wg_count.load(); - auto in_begin = tile_vals.get_pointer(); - - _Type prev_sum = 0; + size_t prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (sg.get_group_id() == 0) @@ -893,22 +889,23 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ scan_mem.set_full(tile_id, prev_sum + local_sum); } - _Type carry = sycl::group_broadcast(group, prev_sum, 0); + size_t start_idx = sycl::group_broadcast(group, prev_sum, 0); - // Check behaviour - if (group.leader()) { - __out_rng[wg_group_id] = carry; - } - // Phase 3: copy values to global memory + for (int i = wg_local_id; i < local_sum; i += wgsize) { + // Probably adjust method to try and get some perf on PVC for arithmetic types using sg.store + __out_rng[start_idx + i] = wg_copy_if_values[i]; + } + if (tile_id == (num_wgs - 1) && group.leader()) + __num_rng[0] = start_idx + local_sum; }); }); event.wait(); } -template +template void -single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _UnaryPredicate pred) +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) { auto __n = __in_end - __in_begin; @@ -919,7 +916,11 @@ single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __i oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), pred); + auto __keep_num = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + auto __buf_num = __keep2(__num_begin, __num_begin + 1); + + single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ 
std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 917e88a7707..202f28fbaad 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -14,64 +14,84 @@ //===----------------------------------------------------------------------===// #include "support/test_config.h" +#include "support/utils.h" #include _PSTL_TEST_HEADER(execution) #include _PSTL_TEST_HEADER(numeric) -int -main() +using namespace TestUtils; + +template +class CopyIfKernel; + +template +bool test(Predicate pred, Generator gen) { bool all_passed = true; sycl::queue q; - for (int logn : {4, 8, 10, 12, 14}) + for (int logn : {4, 8, 10, 12, 14, 15, 18}) { - std::cout << "Testing 2^" << logn << std::endl; int n = 1 << logn; - std::cout << "n:" << n << std::endl; - std::vector v(n, 0); - //for (size_t i = 0; i < v.size(); ++i) - // std::cout << v[i] << ","; - //std::cout << std::endl; - int* in_ptr = sycl::malloc_device(n, q); - int* out_ptr = sycl::malloc_device(n, q); + Sequence in(n, [&](size_t k) -> T { + return gen(n ^ k); + }); + + Sequence std_out(n); + + T* in_ptr = sycl::malloc_device(n, q); + T* out_ptr = sycl::malloc_device(n, q); + size_t* out_num = sycl::malloc_device(1, q); constexpr int n_elements_per_workitem = 8; - q.copy(v.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param; - oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, [](int x) { return x == 0; }); + q.copy(in.data(), in_ptr, n).wait(); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param>; + oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, out_num, pred); - std::vector tmp(n, 0); - q.copy(out_ptr, tmp.data(), n); + Sequence kt_out(n); + size_t num_selected = 0; + q.copy(out_ptr, kt_out.data(), n); + q.copy(out_num, &num_selected, 1); q.wait(); - std::copy_if(v.begin(), v.end(), v.begin(), [](int x) { return x == 0; }); + auto std_out_end = std::copy_if(in.begin(), in.end(), std_out.begin(), pred); bool passed = true; - // for (size_t i = 0; i < n; ++i) - // { - // if (tmp[i] != v[i]) - // { - // passed = false; - // std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; - // } - // } - - // if (passed) - // std::cout << " passed" << std::endl; - // else - // std::cout << " failed" << std::endl; - - for (size_t i = 0; i < n/(n_elements_per_workitem*128) + 1; ++i) { - std::cout << "i:" << i << " count:" << tmp[i] << std::endl; + if (num_selected != (std_out_end - std_out.begin())) { + passed = false; + std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected << "\n"; + } + + for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) + { + if (kt_out[i] != std_out[i]) + { + passed = false; + std::cout << "expected " << i << ' ' << std_out[i] << ' ' << kt_out[i] << '\n'; + } } + if (passed) + std::cout << " passed" << std::endl; + else + std::cout << " failed" << std::endl; + all_passed &= passed; sycl::free(in_ptr, q); sycl::free(out_ptr, q); + sycl::free(out_num, q); } return !all_passed; } + +int main() { + bool all_passed; + all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? 
float64_t(j % 32) : float64_t(j % 33 + 34); }); + all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); + all_passed &= test([](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); + + return all_passed; +} From cdf74d0aed9c8f64bb5fc10e3ed96ecf1626732d Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 11:28:40 +0000 Subject: [PATCH 033/134] Add count datatype _SizeT --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 63a59476234..3d6289642bc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -744,8 +744,9 @@ void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _SizeT = uint64_t; using _TileIdT = TileId::_TileIdT; - using _LookbackScanMemory = LookbackScanMemory<_Type, _UseAtomic64>; + using _LookbackScanMemory = LookbackScanMemory<_SizeT, _UseAtomic64>; using _FlagT = typename _LookbackScanMemory::_FlagT; const ::std::size_t n = __in_rng.size(); @@ -758,7 +759,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; - ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); + ScanMemoryManager<_SizeT, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); scratch.allocate(num_wgs); // Memory Structure: @@ -792,7 +793,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + auto tile_vals = sycl::local_accessor<_SizeT, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); @@ -827,7 +828,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Must be a better way to init atomics l_wg_count[0] = 0; sycl::group_barrier(group); - sycl::atomic_ref wg_count(l_wg_count[0]); + sycl::atomic_ref<_SizeT, sycl::memory_order::acq_rel, sycl::memory_scope::work_group, sycl::access::address_space::local_space> wg_count(l_wg_count[0]); + sycl::group_barrier(group); // Phase 1: Create wg_count and construct in-order wg_copy_if_values if ((tile_id + 1) * elems_in_tile <= n) { @@ -835,9 +837,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); - size_t satisfies_pred = pred(val); - //size_t satisfies_pred = 0; - size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); if (satisfies_pred) wg_copy_if_values[count + wg_count.load()] = val; @@ -851,14 
+852,14 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Might have unneccessary group_barrier calls #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - size_t satisfies_pred = 0; + _SizeT satisfies_pred = 0; _Type val; // TODO: alloca if (i + elems_in_tile * tile_id < n) { val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id, n); satisfies_pred = pred(val); } - size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); if (satisfies_pred) wg_copy_if_values[count + wg_count.load()] = val; @@ -870,9 +871,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } // Phase 2: Global scan across wg_count - auto local_sum = wg_count.load(); - auto in_begin = tile_vals.get_pointer(); - size_t prev_sum = 0; + _SizeT local_sum = wg_count.load(); + _SizeT* in_begin = tile_vals.get_pointer(); + _SizeT prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (sg.get_group_id() == 0) @@ -883,13 +884,13 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ scan_mem.set_partial(tile_id, local_sum); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_Type>(), scan_mem); + prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); if (group.leader()) scan_mem.set_full(tile_id, prev_sum + local_sum); } - size_t start_idx = sycl::group_broadcast(group, prev_sum, 0); + _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); // Phase 3: copy values to global memory for (int i = wg_local_id; i < local_sum; i += wgsize) { From c5670d813d8a1d9b77f9ad93e86e940a859b9a3b Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 12:24:51 +0000 Subject: [PATCH 034/134] Move away from atomics --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 36 +++++++------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 3d6289642bc..60c2db24b78 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -789,11 +789,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ }); auto event = __queue.submit([&](sycl::handler& hdl) { - auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_per_workitem*wgsize}, hdl); - auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - auto tile_vals = sycl::local_accessor<_SizeT, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); @@ -825,11 +823,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto wg_current_offset = (tile_id * elems_in_tile); auto wg_local_memory_size = elems_in_tile; - // Must be a better way to init atomics - l_wg_count[0] = 0; - sycl::group_barrier(group); - sycl::atomic_ref<_SizeT, sycl::memory_order::acq_rel, 
sycl::memory_scope::work_group, sycl::access::address_space::local_space> wg_count(l_wg_count[0]); - sycl::group_barrier(group); + _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values if ((tile_id + 1) * elems_in_tile <= n) { @@ -838,14 +832,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); if (satisfies_pred) - wg_copy_if_values[count + wg_count.load()] = val; + wg_copy_if_values[count] = val; - if (wg_local_id == (wgsize - 1)) - wg_count += (count + satisfies_pred); - sycl::group_barrier(group); + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); } } else { // Edge of input, have to handle memory bounds @@ -859,20 +851,16 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ satisfies_pred = pred(val); } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); if (satisfies_pred) - wg_copy_if_values[count + wg_count.load()] = val; + wg_copy_if_values[count] = val; - if (wg_local_id == (wgsize - 1)) - wg_count += (count + satisfies_pred); - sycl::group_barrier(group); + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); } } // Phase 2: Global scan across wg_count - _SizeT local_sum = wg_count.load(); - _SizeT* in_begin = tile_vals.get_pointer(); _SizeT prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix @@ -881,24 +869,24 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); + scan_mem.set_partial(tile_id, wg_count); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); + scan_mem.set_full(tile_id, prev_sum + wg_count); } _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); // Phase 3: copy values to global memory - for (int i = wg_local_id; i < local_sum; i += wgsize) { + for (int i = wg_local_id; i < wg_count; i += wgsize) { // Probably adjust method to try and get some perf on PVC for arithmetic types using sg.store __out_rng[start_idx + i] = wg_copy_if_values[i]; } if (tile_id == (num_wgs - 1) && group.leader()) - __num_rng[0] = start_idx + local_sum; + __num_rng[0] = start_idx + wg_count; }); }); event.wait(); From 8918b42eeb91ffa6baac73f7fb5276d80e3d7758 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 19:42:54 +0000 Subject: [PATCH 035/134] Sort out test logic --- test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 202f28fbaad..75769131522 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp 
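[Editor's note — illustrative sketch, not part of the patch series.] The "Move away from atomics" change above replaces the local-memory atomic counter with a carried group scan: each strided pass over the tile feeds the running total (wg_count) into sycl::exclusive_scan_over_group as the init value, and the updated total is broadcast from the last work-item. A minimal host-side model of that carry logic in plain C++ (sequential; the names chunk, carry and the sample predicate are invented for illustration and do not come from the patch):

#include <cstddef>
#include <vector>

int main()
{
    const std::size_t chunk = 4;                    // stands in for wgsize
    std::vector<int> in{5, 42, 7, 42, 42, 1, 2, 42};
    std::vector<int> out(in.size());
    auto pred = [](int x) { return x != 42; };      // sample predicate

    std::size_t carry = 0;                          // plays the role of wg_count
    for (std::size_t base = 0; base < in.size(); base += chunk)
    {
        std::size_t scan = 0;                       // exclusive scan position within the chunk
        for (std::size_t i = base; i < base + chunk && i < in.size(); ++i)
        {
            const std::size_t flag = pred(in[i]) ? 1 : 0;
            if (flag)
                out[carry + scan] = in[i];          // kernel: wg_copy_if_values[count] = val
            scan += flag;
        }
        carry += scan;                              // kernel: broadcast of count + satisfies_pred
    }
    // carry now holds the number of selected elements: 4 of the 8 inputs are kept.
    return carry == 4 ? 0 : 1;
}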
@@ -84,11 +84,11 @@ bool test(Predicate pred, Generator gen) sycl::free(out_num, q); } - return !all_passed; + return all_passed; } int main() { - bool all_passed; + bool all_passed = true; all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); all_passed &= test([](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); From 625f3156d814ecb4d57666e35515098744fcaf15 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 19:50:48 +0000 Subject: [PATCH 036/134] Remove unnecessary load and store functions --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 25 ++----------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 60c2db24b78..68d11740df0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -719,26 +719,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } -// Load function to try and get some PVC perf w/ coalesced -template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id) { - // if constexpr (std::is_arithmetic_v) { - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); - // } - return src[i + wg_stride * tile_id]; -} - -// Load with checking for the subgroup case -template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id, size_t input_size) { - // if constexpr (std::is_arithmetic_v) { - // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * tile_id <= input_size) - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); - // return src[i + wg_stride * tile_id]; - // } - return src[i + wg_stride * tile_id]; -} - template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) @@ -829,7 +809,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ if ((tile_id + 1) * elems_in_tile <= n) { #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); + _Type val = __in_rng[i + elems_in_tile * tile_id]; _SizeT satisfies_pred = pred(val); _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); @@ -847,7 +827,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ _SizeT satisfies_pred = 0; _Type val; // TODO: alloca if (i + elems_in_tile * tile_id < n) { - val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id, n); + val = __in_rng[i + elems_in_tile * tile_id]; satisfies_pred = pred(val); } @@ -882,7 +862,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Phase 3: copy values to global memory for (int i = wg_local_id; i < wg_count; i += wgsize) { - // Probably adjust method to try and get some perf on PVC for arithmetic types using sg.store __out_rng[start_idx + i] = wg_copy_if_values[i]; } if (tile_id == (num_wgs - 1) && group.leader()) From ca7a8306df3c0ed75d3b3d9d958c8ec81d33a3b5 Mon Sep 17 00:00:00 
2001 From: Aidan Date: Wed, 6 Dec 2023 11:03:59 +0000 Subject: [PATCH 037/134] Release scratch mem --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 68d11740df0..db642fc7177 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -868,6 +868,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ __num_rng[0] = start_idx + wg_count; }); }); + + scratch.async_free(event); + event.wait(); } From 25238ebee912c4b1419b15c3aa0310cbf182ae9c Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 11:15:29 +0000 Subject: [PATCH 038/134] Add single wg copy if --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 108 +++++++++++++++++- 1 file changed, 106 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index db642fc7177..36e395b7285 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -79,6 +79,11 @@ struct ScanMemoryManager }); } + void free() + { + sycl::free(scratch, q); + } + private: ::std::uint8_t* scratch = nullptr; ::std::uint8_t* scan_memory_begin = nullptr; @@ -719,6 +724,86 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } +template +void +single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _SizeT = uint64_t; + using _TileIdT = TileId::_TileIdT; + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + + // Global load into local + auto wg_current_offset = 0; + auto wg_local_memory_size = elems_in_tile; + + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if (elems_in_tile <= n) { + #pragma unroll + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + _Type val = __in_rng[i]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } else { + // 
Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls + #pragma unroll + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + _SizeT satisfies_pred = 0; + _Type val; // TODO: alloca + if (i < n) { + val = __in_rng[i]; + + satisfies_pred = pred(val); + } + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) { + __out_rng[i] = wg_copy_if_values[i]; + } + if (group.leader()) + __num_rng[0] = wg_count; + }); + }); + + event.wait(); +} + template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) @@ -869,9 +954,28 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ }); }); - scratch.async_free(event); - event.wait(); + scratch.free(); +} + +template +void +single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + auto __keep_num = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + auto __buf_num = __keep2(__num_begin, __num_begin + 1); + + single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); } template From 2f2ccb2ba8d379fcc1569e149784c5566f6407e8 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 11:38:45 +0000 Subject: [PATCH 039/134] Fix unrolls and use memset --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 36 ++++++------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 36e395b7285..fcfb3ad1b84 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -753,13 +753,12 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Global load into local auto wg_current_offset = 0; - auto wg_local_memory_size = elems_in_tile; _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values if (elems_in_tile <= n) { - #pragma unroll +#pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { _Type val = __in_rng[i]; @@ -774,7 +773,7 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou } else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - #pragma unroll +#pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; _Type val; // TODO: alloca @@ -837,21 +836,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // fill_num_wgs 
num_elements + 1 to also initialize tile_id_counter ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); - auto fill_event = __queue.submit( - [&](sycl::handler& hdl) - { - hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - [=](const sycl::nd_item<1>& item) - { - int id = item.get_global_linear_id(); - if (id < num_elements) - status_flags_begin[id] = id < _LookbackScanMemory::padding - ? _LookbackScanMemory::OUT_OF_BOUNDS - : _LookbackScanMemory::NOT_READY; - if (id == num_elements) - tile_id_begin[0] = 0; - }); - }); + auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); auto event = __queue.submit([&](sycl::handler& hdl) { auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); @@ -886,15 +871,14 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_local_memory_size = elems_in_tile; _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values if ((tile_id + 1) * elems_in_tile <= n) { - #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i + elems_in_tile * tile_id]; +#pragma unroll + for (size_t i = 0; i < elems_in_tile; i += wgsize) { + _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; _SizeT satisfies_pred = pred(val); _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); @@ -907,12 +891,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { +#pragma unroll + for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; _Type val; // TODO: alloca - if (i + elems_in_tile * tile_id < n) { - val = __in_rng[i + elems_in_tile * tile_id]; + if (i + wg_local_id + elems_in_tile * tile_id < n) { + val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; satisfies_pred = pred(val); } From 021fb9a1ef3e8f0ac5ca4fb2c1cacb15cdb1d0b4 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 11:48:50 +0000 Subject: [PATCH 040/134] apply changes to single wg --- .../pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index fcfb3ad1b84..60007e4566c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -759,8 +759,8 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Phase 1: Create wg_count and construct in-order wg_copy_if_values if (elems_in_tile <= n) { #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i]; + for (size_t i = 0; i < elems_in_tile; i += wgsize) { + _Type val = __in_rng[i + wg_local_id]; _SizeT satisfies_pred = pred(val); _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); @@ -774,11 +774,11 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Edge of input, have to 
handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; - _Type val; // TODO: alloca - if (i < n) { - val = __in_rng[i]; + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id < n) { + val = __in_rng[i + wg_local_id]; satisfies_pred = pred(val); } @@ -894,7 +894,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ #pragma unroll for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; - _Type val; // TODO: alloca + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); if (i + wg_local_id + elems_in_tile * tile_id < n) { val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; From c4b05a4cf175bdb24c38bd6fa85c0f862b2352b7 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 15:31:31 +0000 Subject: [PATCH 041/134] Remove unused variables --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 60007e4566c..fcb539cab2b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -741,6 +741,7 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; + assert(num_wgs == 1); auto event = __queue.submit([&](sycl::handler& hdl) { auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); @@ -752,8 +753,6 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr ::std::uint32_t stride = wgsize; // Global load into local - auto wg_current_offset = 0; - _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values @@ -869,9 +868,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ tile_id = group.get_group_linear_id(); } - // Global load into local - auto wg_current_offset = (tile_id * elems_in_tile); - _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values From 5d1ed8ec303544cdadcc7938b1b555231a05cb34 Mon Sep 17 00:00:00 2001 From: "aidan.belton" Date: Fri, 8 Dec 2023 15:35:41 +0000 Subject: [PATCH 042/134] Clang-format copy_if_kt commits --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 333 ++++++++++-------- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 33 +- 2 files changed, 200 insertions(+), 166 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index fcb539cab2b..0838817fd4f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -79,7 +79,8 @@ struct ScanMemoryManager }); } - void free() + void + free() { sycl::free(scratch, q); } @@ -724,9 +725,11 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } -template +template void 
-single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) +single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, + _NumSelectedRange __num_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; @@ -747,64 +750,76 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - - // Global load into local - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if (elems_in_tile <= n) { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + + // Global load into local + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if (elems_in_tile <= n) + { #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i + wg_local_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } else { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _Type val = __in_rng[i + wg_local_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _SizeT satisfies_pred = 0; - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id < n) { - val = __in_rng[i + wg_local_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) { - __out_rng[i] = wg_copy_if_values[i]; - } - if (group.leader()) - __num_rng[0] = wg_count; - }); + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id < n) + { + val = __in_rng[i + wg_local_id]; + + satisfies_pred = pred(val); + } + _SizeT count = 
sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[i] = wg_copy_if_values[i]; + } + if (group.leader()) + __num_rng[0] = wg_count; + }); }); event.wait(); } -template +template void -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) +single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, + _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; @@ -844,138 +859,150 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - auto sg = item.get_sub_group(); - constexpr ::std::uint32_t stride = wgsize; - - // Init tile_id - std::uint32_t tile_id; - if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) - { - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - tile_id = tile_id_lacc[0]; - } - else - { - tile_id = group.get_group_linear_id(); - } - - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((tile_id + 1) * elems_in_tile <= n) { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + auto sg = item.get_sub_group(); + constexpr ::std::uint32_t stride = wgsize; + + // Init tile_id + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if ((tile_id + 1) * elems_in_tile <= n) + { #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } else { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = 
sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _SizeT satisfies_pred = 0; - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id + elems_in_tile * tile_id < n) { - val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 2: Global scan across wg_count - _SizeT prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (sg.get_group_id() == 0) - { - _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, wg_count); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + wg_count); - } - - _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) { - __out_rng[start_idx + i] = wg_copy_if_values[i]; - } - if (tile_id == (num_wgs - 1) && group.leader()) - __num_rng[0] = start_idx + wg_count; - }); + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id + elems_in_tile * tile_id < n) + { + val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; + + satisfies_pred = pred(val); + } + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 2: Global scan across wg_count + _SizeT prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (sg.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, wg_count); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + wg_count); + } + + _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[start_idx + i] = wg_copy_if_values[i]; + } + if (tile_id == (num_wgs - 1) && group.leader()) + __num_rng[0] = start_idx + wg_count; + }); }); event.wait(); scratch.free(); } -template +template void -single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, 
_UnaryPredicate pred) +single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, + _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) { auto __n = __in_end - __in_begin; - auto __keep1 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); auto __keep_num = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); + single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), + __buf_num.all_view(), pred); } -template +template void -single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _NumSelectedRange __num_begin, _UnaryPredicate pred) { auto __n = __in_end - __in_begin; - auto __keep1 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); auto __keep_num = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); + single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 75769131522..a77b76491e7 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -24,8 +24,9 @@ using namespace TestUtils; template class CopyIfKernel; -template -bool test(Predicate pred, Generator gen) +template +bool +test(Predicate pred, Generator gen) { bool all_passed = true; sycl::queue q; @@ -34,9 +35,7 @@ bool test(Predicate pred, Generator gen) { int n = 1 << logn; - Sequence in(n, [&](size_t k) -> T { - return gen(n ^ k); - }); + Sequence in(n, [&](size_t k) -> T { return gen(n ^ k); }); Sequence std_out(n); @@ -47,8 +46,9 @@ bool 
test(Predicate pred, Generator gen) constexpr int n_elements_per_workitem = 8; q.copy(in.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param>; - oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, out_num, pred); + using KernelParams = + oneapi::dpl::experimental::kt::kernel_param>; + oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred); Sequence kt_out(n); size_t num_selected = 0; @@ -59,12 +59,14 @@ bool test(Predicate pred, Generator gen) auto std_out_end = std::copy_if(in.begin(), in.end(), std_out.begin(), pred); bool passed = true; - if (num_selected != (std_out_end - std_out.begin())) { + if (num_selected != (std_out_end - std_out.begin())) + { passed = false; - std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected << "\n"; + std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected + << "\n"; } - for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) + for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) { if (kt_out[i] != std_out[i]) { @@ -87,11 +89,16 @@ bool test(Predicate pred, Generator gen) return all_passed; } -int main() { +int +main() +{ bool all_passed = true; - all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); + all_passed &= + test([](const float64_t& x) { return x * x <= 1024; }, + [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); - all_passed &= test([](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); + all_passed &= test([](const std::int32_t& x) { return x != 42; }, + [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 
std::int32_t(j + 1) : 42; }); return all_passed; } From 70e751a99ad1f3d9b2d681713a5392f1bb9adfa0 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 29 Nov 2023 12:28:43 +0000 Subject: [PATCH 043/134] Enable pragma unroll for open-source DPC++ --- include/oneapi/dpl/pstl/onedpl_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h index 450cae9a347..fff5f2405b5 100644 --- a/include/oneapi/dpl/pstl/onedpl_config.h +++ b/include/oneapi/dpl/pstl/onedpl_config.h @@ -123,7 +123,7 @@ // Enable loop unrolling pragmas where supported #if (__INTEL_LLVM_COMPILER || __INTEL_COMPILER || \ (!defined(__INTEL_LLVM_COMPILER) && !defined(__INTEL_COMPILER) && \ - ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 30700)))) + ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 40000)))) # define _ONEDPL_PRAGMA_UNROLL _ONEDPL_PRAGMA(unroll) #else //no pragma unroll # define _ONEDPL_PRAGMA_UNROLL From a9fdaa365ec80fc2cc2d640fb946cce227c1373d Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Fri, 18 Aug 2023 14:52:41 -0500 Subject: [PATCH 044/134] Start of single-pass scan kernel template --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 1 + .../hetero/dpcpp/parallel_backend_sycl_scan.h | 150 ++++++++++++++++++ .../numeric/numeric.ops/scan_kt.pass.cpp | 30 ++++ 3 files changed, 181 insertions(+) create mode 100644 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h create mode 100644 test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index b006eae051b..162fcf2c282 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -46,6 +46,7 @@ #endif #include "sycl_traits.h" //SYCL traits specialization for some oneDPL types. +#include "parallel_backend_sycl_scan.h" namespace oneapi { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h new file mode 100644 index 00000000000..4fc2dbe4d44 --- /dev/null +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -0,0 +1,150 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_parallel_backend_sycl_scan_H +#define _ONEDPL_parallel_backend_sycl_scan_H + +namespace oneapi::dpl::experimental::igpu +{ + +template +struct __scan_status_flag +{ + using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>; + static constexpr std::uint32_t partial_mask = 1 << (sizeof(std::uint32_t)*8 - 2); + static constexpr std::uint32_t full_mask = 1 << (sizeof(std::uint32_t)*8 - 1); + static constexpr std::uint32_t value_mask = ~(partial_mask | full_mask); + + __scan_status_flag(std::uint32_t* flags_begin, const std::uint32_t tile_id) + : atomic_flag(*(flags_begin + tile_id)) + { + + } + + void set_partial(std::uint32_t val) + { + atomic_flag.store(val | partial_mask); + } + + void set_full(std::uint32_t val) + { + atomic_flag.store(val | full_mask); + } + + _T lookback(const std::uint32_t tile_id, std::uint32_t* flags_begin) + { + _T sum = 0; + int i = 0; + for (std::int32_t tile = static_cast(tile_id) - 1; tile >= 0; --tile) + { + _AtomicRefT tile_atomic(*(flags_begin + tile)); + std::uint32_t tile_val = 0; + do { + tile_val = tile_atomic.load(); + } while (tile_val == 0); + + sum += tile_val & value_mask; + + // If this was a full value, we can stop looking at previous tiles. Otherwise, + // keep going through tiles until we either find a full tile or we've completely + // recomputed the prefix using partial values + if (tile_val & full_mask) + break; + } + return sum; + } + + _AtomicRefT atomic_flag; +}; + +template +void +single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + + const ::std::size_t n = __in_rng.size(); + auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + std::size_t num_wgs = __max_cu; + + std::size_t wgsize = n/__max_cu; + + std::uint32_t status_flags_buf_size = num_wgs+1; + sycl::buffer status_flags_buf(status_flags_buf_size); + + // TODO: this probably isn't the best way to do this + sycl::host_accessor status_flags(status_flags_buf); + for (std::size_t i = 0; i < status_flags_buf_size; ++i) + status_flags[i] = 0; + + + auto event = __exec.queue().submit([&](sycl::handler& hdl) { + auto status_flags = sycl::accessor(status_flags_buf, hdl); + auto tile_id_lacc = sycl::accessor(sycl::range<1>{1}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); + hdl.parallel_for(sycl::nd_range<1>(n, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { + auto item_id = item.get_local_linear_id(); + auto group = item.get_group(); + + //std::uint32_t elems_in_tile = elems_per_item*wgsize; + std::uint32_t elems_in_tile = wgsize; + + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (group.leader()) + { + sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_buf_size-1]); + tile_id_lacc[0] = idx_atomic.fetch_add(1); + } + sycl::group_barrier(group); + std::uint32_t tile_id = tile_id_lacc[0]; + + auto in_begin = __in_rng.begin() + (tile_id*elems_in_tile); + auto in_end = __in_rng.begin() + ((tile_id+1)*elems_in_tile); + auto out_begin = __out_rng.begin() + (tile_id*elems_in_tile); + + auto local_sum = sycl::joint_reduce(group, in_begin, in_end, 
__binary_op); + + __scan_status_flag<_Type> flag(status_flags.get_pointer(), tile_id); + flag.set_partial(local_sum); + + auto prev_sum = flag.lookback(tile_id, status_flags.get_pointer()); + flag.set_full(prev_sum + local_sum); + + sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); + }); + }); + + event.wait(); +} + +template +void +single_pass_inclusive_scan(_Policy&& __exec, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) +{ + auto __n = __in_end - __in_begin; + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + single_pass_scan_impl(__exec, __buf1.all_view(), __buf2.all_view(), __binary_op); +} + +} // namespace oneapi::dpl::experimental::igpu + +#endif /* _ONEDPL_parallel_backend_sycl_scan_H */ diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp new file mode 100644 index 00000000000..71a725563d4 --- /dev/null +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -0,0 +1,30 @@ +// -*- C++ -*- +//===-- scan.pass.cpp -----------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#include "support/test_config.h" + +#include _PSTL_TEST_HEADER(execution) +#include _PSTL_TEST_HEADER(numeric) + +int +main() +{ + int n = 1 << 16; + sycl::queue q; + int* in_ptr = sycl::malloc_device(n, q); + int* out_ptr = sycl::malloc_device(n, q); + oneapi::dpl::experimental::igpu::single_pass_inclusive_scan(oneapi::dpl::execution::dpcpp_default, in_ptr, in_ptr+n, out_ptr, ::std::plus()); + return 0; +} From dfef06f7db2bc09249df4a65769084ab26b7d2e1 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 24 Aug 2023 08:48:42 -0500 Subject: [PATCH 045/134] Fix hang in inclusive scan --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 44 ++++++++++++++----- .../numeric/numeric.ops/scan_kt.pass.cpp | 30 ++++++++++++- 2 files changed, 61 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 4fc2dbe4d44..e71398a44b7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -16,9 +16,11 @@ #ifndef _ONEDPL_parallel_backend_sycl_scan_H #define _ONEDPL_parallel_backend_sycl_scan_H -namespace oneapi::dpl::experimental::igpu +namespace oneapi::dpl::experimental::kt { +inline namespace igpu { + template struct __scan_status_flag { @@ -69,28 +71,36 @@ struct __scan_status_flag _AtomicRefT atomic_flag; }; -template +template void -single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) +single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); + const ::std::size_t n = __in_rng.size(); - auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); - std::size_t num_wgs = __max_cu; + auto __max_cu = __queue.get_device().template get_info(); + //std::size_t num_wgs = __max_cu; + std::size_t num_wgs = 64; - std::size_t wgsize = n/__max_cu; + // TODO: use wgsize and iters per item from _KernelParam + std::size_t wgsize = n/num_wgs; std::uint32_t status_flags_buf_size = num_wgs+1; sycl::buffer status_flags_buf(status_flags_buf_size); // TODO: this probably isn't the best way to do this + { sycl::host_accessor status_flags(status_flags_buf); for (std::size_t i = 0; i < status_flags_buf_size; ++i) status_flags[i] = 0; + } + +// printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%lu\n", n, num_wgs, wgsize, __max_cu); - auto event = __exec.queue().submit([&](sycl::handler& hdl) { + auto event = __queue.submit([&](sycl::handler& hdl) { auto status_flags = sycl::accessor(status_flags_buf, hdl); auto tile_id_lacc = sycl::accessor(sycl::range<1>{1}, hdl); @@ -121,6 +131,7 @@ single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_r flag.set_partial(local_sum); auto prev_sum = flag.lookback(tile_id, status_flags.get_pointer()); + //auto prev_sum = 0; flag.set_full(prev_sum + local_sum); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); @@ -130,9 +141,18 @@ single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_r event.wait(); } -template +// The generic structure for configuring a kernel +template +struct kernel_param +{ + static constexpr std::uint16_t 
data_per_workitem = DataPerWorkItem; + static constexpr std::uint16_t workgroup_size = WorkGroupSize; + using kernel_name = KernelName; +}; + +template void -single_pass_inclusive_scan(_Policy&& __exec, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) +single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) { auto __n = __in_end - __in_begin; auto __keep1 = @@ -142,9 +162,11 @@ single_pass_inclusive_scan(_Policy&& __exec, _InIterator __in_begin, _InIterator oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_scan_impl(__exec, __buf1.all_view(), __buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } -} // namespace oneapi::dpl::experimental::igpu +} // inline namespace igpu + +} // namespace oneapi::dpl::experimental::kt #endif /* _ONEDPL_parallel_backend_sycl_scan_H */ diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index 71a725563d4..4ae83a92041 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -22,9 +22,35 @@ int main() { int n = 1 << 16; + std::vector v(n, 1); sycl::queue q; int* in_ptr = sycl::malloc_device(n, q); int* out_ptr = sycl::malloc_device(n, q); - oneapi::dpl::experimental::igpu::single_pass_inclusive_scan(oneapi::dpl::execution::dpcpp_default, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - return 0; + + + q.copy(v.data(), in_ptr, n); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; + oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); + + std::vector tmp(n, 0); + q.copy(out_ptr, tmp.data(), n); + + std::inclusive_scan(v.begin(), v.end(), v.begin()); + + bool passed = true; + for (size_t i = 0; i < n; ++i) + { + if (tmp[i] != v[i]) + { + passed = false; + std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + } + } + + if (passed) + std::cout << "passed" << std::endl; + else + std::cout << "failed" << std::endl; + + return !passed; } From 555f6f9b714c825ffd17577e75c2e2ad0f1c5786 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 31 Aug 2023 06:18:55 -0700 Subject: [PATCH 046/134] Debug statements for scan kernel template --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 81 +++++++++++++------ 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index e71398a44b7..c70bbabb82b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -82,56 +82,67 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou const ::std::size_t n = __in_rng.size(); auto __max_cu = __queue.get_device().template get_info(); //std::size_t num_wgs = __max_cu; - std::size_t num_wgs = 64; + std::size_t num_wgs = 256; // TODO: use wgsize and iters per item from _KernelParam - std::size_t wgsize = n/num_wgs; + //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; + constexpr ::std::size_t __elems_per_item = 2; + std::size_t 
wgsize = n/num_wgs/__elems_per_item; + std::size_t num_items = n/__elems_per_item; - std::uint32_t status_flags_buf_size = num_wgs+1; - sycl::buffer status_flags_buf(status_flags_buf_size); - // TODO: this probably isn't the best way to do this - { - sycl::host_accessor status_flags(status_flags_buf); - for (std::size_t i = 0; i < status_flags_buf_size; ++i) - status_flags[i] = 0; - } + std::uint32_t status_flags_size = num_wgs+1; + + uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); + __queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); -// printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%lu\n", n, num_wgs, wgsize, __max_cu); + //printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + /*printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug4 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue);*/ auto event = __queue.submit([&](sycl::handler& hdl) { - auto status_flags = sycl::accessor(status_flags_buf, hdl); - auto tile_id_lacc = sycl::accessor(sycl::range<1>{1}, hdl); + auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(n, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { - auto item_id = item.get_local_linear_id(); + hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { auto group = item.get_group(); - //std::uint32_t elems_in_tile = elems_per_item*wgsize; - std::uint32_t elems_in_tile = wgsize; + std::uint32_t elems_in_tile = wgsize*__elems_per_item; // Obtain unique ID for this work-group that will be used in decoupled lookback if (group.leader()) { - sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_buf_size-1]); + sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_size-1]); tile_id_lacc[0] = idx_atomic.fetch_add(1); } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; + //debug5[group.get_local_id()] = tile_id; - auto in_begin = __in_rng.begin() + (tile_id*elems_in_tile); - auto in_end = __in_rng.begin() + ((tile_id+1)*elems_in_tile); - auto out_begin = __out_rng.begin() + (tile_id*elems_in_tile); + auto current_offset = (tile_id*elems_in_tile); + auto next_offset = ((tile_id+1)*elems_in_tile); + auto in_begin = __in_rng.begin() + current_offset; + auto in_end = __in_rng.begin() + next_offset; + auto out_begin = __out_rng.begin() + current_offset; + + //debug3[tile_id] = current_offset; + //debug4[tile_id] = next_offset; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); + //auto local_sum = 0; + ///debug1[tile_id] = local_sum; - __scan_status_flag<_Type> flag(status_flags.get_pointer(), tile_id); + __scan_status_flag<_Type> flag(status_flags, tile_id); flag.set_partial(local_sum); - auto prev_sum = flag.lookback(tile_id, 
status_flags.get_pointer()); + auto prev_sum = flag.lookback(tile_id, status_flags); //auto prev_sum = 0; + //debug2[tile_id] = prev_sum; flag.set_full(prev_sum + local_sum); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); @@ -139,6 +150,30 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); event.wait(); + +#if 0 + std::vector debug1v(status_flags_size); + std::vector debug2v(status_flags_size); + std::vector debug3v(status_flags_size); + std::vector debug4v(status_flags_size); + std::vector debug5v(status_flags_size); + __queue.memcpy(debug1v.data(), debug1, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug2v.data(), debug2, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug3v.data(), debug3, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug4v.data(), debug4, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug5v.data(), debug5, status_flags_size * sizeof(uint32_t)); + + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "local_sum " << i << " " << debug1v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "lookback " << i << " " << debug2v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "offset " << i << " " << debug3v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "end " << i << " " << debug4v[i] << std::endl; +#endif + + sycl::free(status_flags, __queue); } // The generic structure for configuring a kernel From 10cfc687cd7c1feb9c8315fb30ffab034289bee1 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Wed, 6 Sep 2023 08:46:10 -0500 Subject: [PATCH 047/134] Update scan kernel template test --- .../numeric/numeric.ops/scan_kt.pass.cpp | 54 +++++++++++-------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index 4ae83a92041..de5ecafc25b 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -21,36 +21,44 @@ int main() { - int n = 1 << 16; - std::vector v(n, 1); - sycl::queue q; - int* in_ptr = sycl::malloc_device(n, q); - int* out_ptr = sycl::malloc_device(n, q); + bool all_passed = true; + for (int logn : {4, 8, 11, 16, 19, 21}) + { + std::cout << "Testing 2^" << logn << '\n'; + int n = 1 << logn; + std::vector v(n, 1); + sycl::queue q; + int* in_ptr = sycl::malloc_device(n, q); + int* out_ptr = sycl::malloc_device(n, q); - q.copy(v.data(), in_ptr, n); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; - oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - std::vector tmp(n, 0); - q.copy(out_ptr, tmp.data(), n); + q.copy(v.data(), in_ptr, n); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; + oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - std::inclusive_scan(v.begin(), v.end(), v.begin()); + std::vector tmp(n, 0); + q.copy(out_ptr, tmp.data(), n); - bool passed = true; - for (size_t i = 0; i < n; ++i) - { - if (tmp[i] != v[i]) + std::inclusive_scan(v.begin(), v.end(), v.begin()); + + bool passed = true; + for (size_t i = 0; i < n; ++i) { - passed = false; - std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + if (tmp[i] != v[i]) + { + passed = false; + 
std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + } } - } - if (passed) - std::cout << "passed" << std::endl; - else - std::cout << "failed" << std::endl; + if (passed) + std::cout << "passed" << std::endl; + else + std::cout << "failed" << std::endl; + + all_passed &= passed; + } - return !passed; + return !all_passed; } From 53faf10fe9e585654e77fe87a82826b370a7b2f6 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 14 Sep 2023 09:08:55 -0700 Subject: [PATCH 048/134] Only have a single work-item per group query for previous tile status --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index c70bbabb82b..b01f56ac539 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -86,7 +86,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // TODO: use wgsize and iters per item from _KernelParam //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; - constexpr ::std::size_t __elems_per_item = 2; + constexpr ::std::size_t __elems_per_item = 16; std::size_t wgsize = n/num_wgs/__elems_per_item; std::size_t num_items = n/__elems_per_item; @@ -96,14 +96,16 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); __queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); - //printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); - /*printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); +#if SCAN_KT_DEBUG + printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug4 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue);*/ + uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue); +#endif auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); @@ -138,12 +140,21 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou ///debug1[tile_id] = local_sum; __scan_status_flag<_Type> flag(status_flags, tile_id); - flag.set_partial(local_sum); - auto prev_sum = flag.lookback(tile_id, status_flags); - //auto prev_sum = 0; + if (group.leader()) + flag.set_partial(local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + //sycl::reduce_over_group(item.get_subgroup()) + + auto prev_sum = 0; + + if (group.leader()) + prev_sum = flag.lookback(tile_id, status_flags); //debug2[tile_id] = prev_sum; - flag.set_full(prev_sum + local_sum); + + if (group.leader()) + flag.set_full(prev_sum + local_sum); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); }); From 
dc63d16f8832f9c10a1ec6d76508d4c1b1f2c455 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Mon, 18 Sep 2023 08:06:43 -0700 Subject: [PATCH 049/134] First attempt at parallel lookback --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 106 +++++++++++++++--- 1 file changed, 89 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index b01f56ac539..27fdc1d09b4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -24,13 +24,21 @@ inline namespace igpu { template struct __scan_status_flag { + // 00xxxx - not computed + // 01xxxx - partial + // 10xxxx - full + // 110000 - out of bounds + using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>; static constexpr std::uint32_t partial_mask = 1 << (sizeof(std::uint32_t)*8 - 2); static constexpr std::uint32_t full_mask = 1 << (sizeof(std::uint32_t)*8 - 1); static constexpr std::uint32_t value_mask = ~(partial_mask | full_mask); + static constexpr std::uint32_t oob_value = partial_mask | full_mask; + + static constexpr int padding = 32; __scan_status_flag(std::uint32_t* flags_begin, const std::uint32_t tile_id) - : atomic_flag(*(flags_begin + tile_id)) + : atomic_flag(*(flags_begin + tile_id + padding)) { } @@ -42,16 +50,57 @@ struct __scan_status_flag void set_full(std::uint32_t val) { - atomic_flag.store(val | full_mask); + atomic_flag.store((val ^ partial_mask) | full_mask); + } + + template + _T cooperative_lookback(const std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) + { + _T sum = 0; + int offset = -1; + int i = 0; + int local_id = subgroup.get_local_id(); + + for (int tile = static_cast(tile_id) + offset; tile >= 0; offset -= 32) + { + _AtomicRefT tile_atomic(*(flags_begin + tile + padding - local_id)); + std::uint32_t tile_val = 0; + do { + tile_val = tile_atomic.load(); + + //} while (!sycl::all_of_group(subgroup, tile_val != 0)); + } while (0); + + bool is_full = (tile_val & full_mask) && ((tile_val & partial_mask) == 0); + auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); + ::std::uint32_t is_full_ballot_bits{}; + is_full_ballot.extract_bits(is_full_ballot_bits); + + auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); + _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{}; + + // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) + sum += sycl::reduce_over_group(subgroup, contribution, bin_op); + + // If we found a full value, we can stop looking at previous tiles. 
Otherwise, + // keep going through tiles until we either find a full tile or we've completely + // recomputed the prefix using partial values + if (is_full_ballot_bits) + break; + + //if (i++ > 10) break; + } + return sum; } +#if 0 _T lookback(const std::uint32_t tile_id, std::uint32_t* flags_begin) { _T sum = 0; int i = 0; for (std::int32_t tile = static_cast(tile_id) - 1; tile >= 0; --tile) { - _AtomicRefT tile_atomic(*(flags_begin + tile)); + _AtomicRefT tile_atomic(*(flags_begin + tile + padding)); std::uint32_t tile_val = 0; do { tile_val = tile_atomic.load(); @@ -67,6 +116,7 @@ struct __scan_status_flag } return sum; } +#endif _AtomicRefT atomic_flag; }; @@ -86,15 +136,28 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // TODO: use wgsize and iters per item from _KernelParam //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; - constexpr ::std::size_t __elems_per_item = 16; +#ifdef _ONEDPL_SCAN_ITER_SIZE + constexpr ::std::size_t __elems_per_item = _ONEDPL_SCAN_ITER_SIZE; +#else + constexpr ::std::size_t __elems_per_item = 8; +#endif std::size_t wgsize = n/num_wgs/__elems_per_item; std::size_t num_items = n/__elems_per_item; - std::uint32_t status_flags_size = num_wgs+1; + constexpr int status_flag_padding = 32; + std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); - __queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); + //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); + + auto fill_event = __queue.submit([&](sycl::handler& hdl) { + + hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { + int id = item.get_linear_id(); + status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::oob_value : 0; + }); + }); #if SCAN_KT_DEBUG printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); @@ -109,10 +172,12 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { auto group = item.get_group(); + auto subgroup = item.get_sub_group(); std::uint32_t elems_in_tile = wgsize*__elems_per_item; @@ -139,23 +204,30 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou //auto local_sum = 0; ///debug1[tile_id] = local_sum; - __scan_status_flag<_Type> flag(status_flags, tile_id); + auto prev_sum = 0; - if (group.leader()) - flag.set_partial(local_sum); + // The first sub-group will query the previous tiles to find a prefix + if (subgroup.get_group_id() == 0) + { + __scan_status_flag<_Type> flag(status_flags, tile_id); - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - //sycl::reduce_over_group(item.get_subgroup()) + if (group.leader()) + flag.set_partial(local_sum); - auto prev_sum = 0; + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + //sycl::reduce_over_group(item.get_subgroup()) - if (group.leader()) - prev_sum = flag.lookback(tile_id, status_flags); - //debug2[tile_id] = prev_sum; - if (group.leader()) - flag.set_full(prev_sum + local_sum); + prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags); + //if (group.leader()) + // prev_sum = flag.lookback(tile_id, status_flags); + //debug2[tile_id] = prev_sum; + + if (group.leader()) + flag.set_full(prev_sum + local_sum); + } + prev_sum = sycl::group_broadcast(group, prev_sum, 0); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); }); }); From f8c3f2ba26c03476b9fb35716b764152a0addbe3 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Fri, 22 Sep 2023 11:42:33 -0700 Subject: [PATCH 050/134] Working cooperative lookback --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 61 +++++++++++++------ 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 27fdc1d09b4..963de2952e6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -50,26 +50,26 @@ struct __scan_status_flag void set_full(std::uint32_t val) { - atomic_flag.store((val ^ partial_mask) | full_mask); + atomic_flag.store(val | full_mask); } template - _T cooperative_lookback(const std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) + _T cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) { _T sum = 0; int offset = -1; int i = 0; int local_id = subgroup.get_local_id(); - for (int tile = static_cast(tile_id) + offset; tile >= 0; offset -= 32) + for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= 32) { _AtomicRefT tile_atomic(*(flags_begin 
+ tile + padding - local_id)); std::uint32_t tile_val = 0; do { tile_val = tile_atomic.load(); - //} while (!sycl::all_of_group(subgroup, tile_val != 0)); - } while (0); + } while (!sycl::all_of_group(subgroup, tile_val != 0)); + //} while (0); bool is_full = (tile_val & full_mask) && ((tile_val & partial_mask) == 0); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); @@ -77,7 +77,7 @@ struct __scan_status_flag is_full_ballot.extract_bits(is_full_ballot_bits); auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); - _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{}; + _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) sum += sycl::reduce_over_group(subgroup, contribution, bin_op); @@ -132,6 +132,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou const ::std::size_t n = __in_rng.size(); auto __max_cu = __queue.get_device().template get_info(); //std::size_t num_wgs = __max_cu; + //std::size_t num_wgs = 448; std::size_t num_wgs = 256; // TODO: use wgsize and iters per item from _KernelParam @@ -143,26 +144,31 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou #endif std::size_t wgsize = n/num_wgs/__elems_per_item; std::size_t num_items = n/__elems_per_item; + // + //std::size_t wgsize = 256; + //std::size_t num_items = 114688; constexpr int status_flag_padding = 32; std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; + printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); + uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); auto fill_event = __queue.submit([&](sycl::handler& hdl) { - hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { + hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { int id = item.get_linear_id(); status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::oob_value : 0; }); }); -#if SCAN_KT_DEBUG - printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); - printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + std::uint32_t elems_in_tile = wgsize*__elems_per_item; +#define SCAN_KT_DEBUG 1 +#if SCAN_KT_DEBUG uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); @@ -175,11 +181,10 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { + hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { auto group = item.get_group(); auto subgroup = item.get_sub_group(); - std::uint32_t elems_in_tile = wgsize*__elems_per_item; // Obtain unique ID for this work-group that will be used in decoupled lookback if (group.leader()) @@ -189,7 +194,9 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; - //debug5[group.get_local_id()] = tile_id; +#if SCAN_KT_DEBUG + debug5[group.get_group_linear_id()] = tile_id; +#endif auto current_offset = (tile_id*elems_in_tile); auto next_offset = ((tile_id+1)*elems_in_tile); @@ -197,12 +204,15 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto in_end = __in_rng.begin() + next_offset; auto out_begin = __out_rng.begin() + current_offset; - //debug3[tile_id] = current_offset; - //debug4[tile_id] = next_offset; +#if SCAN_KT_DEBUG + debug3[tile_id] = current_offset; + debug4[tile_id] = next_offset; +#endif auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); - //auto local_sum = 0; - ///debug1[tile_id] = local_sum; +#if SCAN_KT_DEBUG + debug1[tile_id] = local_sum; +#endif auto prev_sum = 0; @@ -221,7 +231,9 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags); //if (group.leader()) // prev_sum = flag.lookback(tile_id, status_flags); - //debug2[tile_id] = prev_sum; +#if SCAN_KT_DEBUG + debug2[tile_id] = prev_sum; +#endif if (group.leader()) flag.set_full(prev_sum + local_sum); @@ -234,20 +246,31 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou event.wait(); -#if 0 +#if SCAN_KT_DEBUG std::vector debug1v(status_flags_size); std::vector debug2v(status_flags_size); std::vector debug3v(status_flags_size); std::vector debug4v(status_flags_size); std::vector debug5v(status_flags_size); + std::vector debug6v(status_flags_size); __queue.memcpy(debug1v.data(), debug1, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug2v.data(), debug2, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug3v.data(), debug3, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug4v.data(), debug4, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug5v.data(), debug5, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug6v.data(), status_flags, status_flags_size * sizeof(uint32_t)); + for (int i = 0; i < status_flags_size-1; ++i) 
+ std::cout << "tile " << i << " " << debug5v[i] << std::endl; for (int i = 0; i < status_flags_size-1; ++i) std::cout << "local_sum " << i << " " << debug1v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + { + auto val = (debug6v[i] & __scan_status_flag<_Type>::value_mask); + int a = val / elems_in_tile; + int b = val % elems_in_tile; + std::cout << "flags " << i << " " << std::bitset<32>(debug6v[i]) << " (" << val<< " = " << a << "/" << elems_in_tile << "+" << b <<")" << std::endl; + } for (int i = 0; i < status_flags_size-1; ++i) std::cout << "lookback " << i << " " << debug2v[i] << std::endl; for (int i = 0; i < status_flags_size-1; ++i) From 1d72d3f0d4a962eca63eb2c16d6c9db04f8941da Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Wed, 25 Oct 2023 11:13:53 -0700 Subject: [PATCH 051/134] Fix correctness issue with non-power-of-2 sizes --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 52 ++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 963de2952e6..7aaf3f2a255 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -133,7 +133,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto __max_cu = __queue.get_device().template get_info(); //std::size_t num_wgs = __max_cu; //std::size_t num_wgs = 448; - std::size_t num_wgs = 256; + //std::size_t num_wgs = 256; // TODO: use wgsize and iters per item from _KernelParam //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; @@ -142,8 +142,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou #else constexpr ::std::size_t __elems_per_item = 8; #endif - std::size_t wgsize = n/num_wgs/__elems_per_item; - std::size_t num_items = n/__elems_per_item; + // Next power of 2 greater than or equal to __n + auto __n_uniform = n; + if ((__n_uniform & (__n_uniform - 1)) != 0) + __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(n) << 1; + //std::size_t wgsize = n/num_wgs/__elems_per_item; + std::size_t wgsize = 256; + std::size_t num_items = __n_uniform/__elems_per_item; + std::size_t num_wgs = num_items/wgsize; // //std::size_t wgsize = 256; //std::size_t num_items = 114688; @@ -152,7 +158,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr int status_flag_padding = 32; std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; - printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); + //printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); @@ -165,10 +171,17 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); + std::uint32_t elems_in_tile = wgsize*__elems_per_item; -#define SCAN_KT_DEBUG 1 +#define SCAN_KT_DEBUG 0 #if SCAN_KT_DEBUG + std::vector debug11v(status_flags_size); + __queue.memcpy(debug11v.data(), status_flags, status_flags_size * sizeof(uint32_t)); + + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "flag_before " << i << " " << debug11v[i] << 
std::endl; + uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); @@ -200,21 +213,27 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto current_offset = (tile_id*elems_in_tile); auto next_offset = ((tile_id+1)*elems_in_tile); + if (next_offset > n) + next_offset = n; auto in_begin = __in_rng.begin() + current_offset; auto in_end = __in_rng.begin() + next_offset; auto out_begin = __out_rng.begin() + current_offset; + #if SCAN_KT_DEBUG debug3[tile_id] = current_offset; debug4[tile_id] = next_offset; #endif + if (current_offset >= n) + return; + auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); #if SCAN_KT_DEBUG debug1[tile_id] = local_sum; #endif - auto prev_sum = 0; + _Type prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) @@ -296,6 +315,17 @@ void single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) { auto __n = __in_end - __in_begin; + +#if SCAN_KT_DEBUG + using _Type = std::remove_pointer_t<_InIterator>; + std::vector<_Type> in_debug(__n); + __queue.memcpy(in_debug.data(), __in_begin, __n * sizeof(_Type)); + + for (int i = 0; i < __n; ++i) + std::cout << "input_before " << i << " " << in_debug[i] << std::endl; +#endif + + //printf("KERNEL_TEMPLATE %lu\n", __n); auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); @@ -304,6 +334,16 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera auto __buf2 = __keep2(__out_begin, __out_begin + __n); single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + +#if SCAN_KT_DEBUG + std::vector<_Type> in_debug2(__n); + __queue.memcpy(in_debug2.data(), __in_begin, __n * sizeof(_Type)); + + for (int i = 0; i < __n; ++i) + std::cout << "input_after " << i << " " << in_debug2[i] << std::endl; +#endif + + //printf("KERNEL_TEMPLATE DONE %lu\n", __n); } } // inline namespace igpu From 567a50ebab7a53ae9fe4539d4640b7c46d8546e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 7 Nov 2023 10:51:21 +0000 Subject: [PATCH 052/134] Scan_kt Flags and Values separated (#15) Atomic flags and the values used in Scan_kt separated to avoid truncating the range to 30bit values, and prepare for a more general scan implementation. 
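
The practical effect is that a tile's status word no longer has to share its 32 bits with the scanned value: the flag stays a plain uint32_t while the partial and full sums live in separate arrays of the value type. A rough host-side sketch of that layout follows (illustrative only, with hypothetical names, and std::atomic standing in for sycl::atomic_ref):

    #include <atomic>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    template <typename T>
    struct tile_state
    {
        // Status values; kept apart from the sums so no value bits are reserved for status.
        static constexpr std::uint32_t not_ready = 0, partial = 1, full = 2, out_of_bounds = 4;

        std::vector<std::atomic<std::uint32_t>> flags; // one status word per tile
        std::vector<T> partial_sums;                   // scan of this tile only
        std::vector<T> full_sums;                      // this tile plus all preceding tiles

        explicit tile_state(std::size_t num_tiles)
            : flags(num_tiles), partial_sums(num_tiles), full_sums(num_tiles) {}

        void set_partial(std::size_t tile, T value)
        {
            partial_sums[tile] = value;                            // publish the value first...
            flags[tile].store(partial, std::memory_order_release); // ...then flip the status
        }

        void set_full(std::size_t tile, T value)
        {
            full_sums[tile] = value;
            flags[tile].store(full, std::memory_order_release);
        }
    };

A reader polls a tile's flag until it is no longer not_ready and then picks partial_sums[tile] or full_sums[tile] depending on the status it saw, which is the same selection the cooperative lookback in the diff below performs via the is_full * num_elements offset.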
--- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 242 +++++------------- .../numeric/numeric.ops/scan_kt.pass.cpp | 14 +- 2 files changed, 75 insertions(+), 181 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 7aaf3f2a255..f52e4ef532f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -21,63 +21,75 @@ namespace oneapi::dpl::experimental::kt inline namespace igpu { +constexpr size_t SUBGROUP_SIZE = 32; + template struct __scan_status_flag { - // 00xxxx - not computed - // 01xxxx - partial - // 10xxxx - full - // 110000 - out of bounds - - using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>; - static constexpr std::uint32_t partial_mask = 1 << (sizeof(std::uint32_t)*8 - 2); - static constexpr std::uint32_t full_mask = 1 << (sizeof(std::uint32_t)*8 - 1); - static constexpr std::uint32_t value_mask = ~(partial_mask | full_mask); - static constexpr std::uint32_t oob_value = partial_mask | full_mask; - - static constexpr int padding = 32; - - __scan_status_flag(std::uint32_t* flags_begin, const std::uint32_t tile_id) - : atomic_flag(*(flags_begin + tile_id + padding)) + using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::acq_rel, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + static constexpr std::uint32_t NOT_READY = 0; + static constexpr std::uint32_t PARTIAL_MASK = 1; + static constexpr std::uint32_t FULL_MASK = 2; + static constexpr std::uint32_t OUT_OF_BOUNDS = 4; + + static constexpr int padding = SUBGROUP_SIZE; + + __scan_status_flag(const std::uint32_t tile_id, std::uint32_t* flags_begin, _T* tile_sums, + size_t num_elements) + : atomic_flag(*(flags_begin + tile_id + padding)), scanned_partial_value(tile_sums + tile_id + padding), + scanned_full_value(tile_sums + tile_id + padding + num_elements), num_elements{num_elements} { - } - void set_partial(std::uint32_t val) + void + set_partial(_T val) { - atomic_flag.store(val | partial_mask); + (*scanned_partial_value) = val; + atomic_flag.store(PARTIAL_MASK); } - void set_full(std::uint32_t val) + void + set_full(_T val) { - atomic_flag.store(val | full_mask); + (*scanned_full_value) = val; + atomic_flag.store(FULL_MASK); } - template - _T cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) + template + _T + cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin, + _T* tile_sums) { _T sum = 0; int offset = -1; int i = 0; int local_id = subgroup.get_local_id(); - for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= 32) + for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { _AtomicRefT tile_atomic(*(flags_begin + tile + padding - local_id)); - std::uint32_t tile_val = 0; - do { - tile_val = tile_atomic.load(); + std::uint32_t flag; + do + { + flag = tile_atomic.load(); + } while (!sycl::all_of_group(subgroup, flag != NOT_READY)); // Loop till all ready - } while (!sycl::all_of_group(subgroup, tile_val != 0)); - //} while (0); + bool is_full = flag == FULL_MASK; - bool is_full = (tile_val & full_mask) && ((tile_val & partial_mask) == 0); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); 
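            // Each lane of the sub-group has just inspected one predecessor tile (lane 0 the
            // closest). The ballot packs every lane's is_full predicate into a single mask, so
            // the lowest set bit identifies the nearest tile that has already published a FULL
            // prefix; only lanes at or before that position need to feed the reduction below.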
::std::uint32_t is_full_ballot_bits{}; is_full_ballot.extract_bits(is_full_ballot_bits); auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); - _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{0}; + + // The partial scan results and the full scan sum values are in contiguous memory. + // Each section of the memory is of size num_elements. + // The partial sum for a tile is at [i] and the full sum is at [i + num_elements] + // is_full * num_elements allows to select between the two values without branching the code. + size_t contrib_offset = tile + padding - local_id + is_full * num_elements; + _T val = *(tile_sums + contrib_offset); + _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) ? val : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) sum += sycl::reduce_over_group(subgroup, contribution, bin_op); @@ -88,37 +100,16 @@ struct __scan_status_flag if (is_full_ballot_bits) break; - //if (i++ > 10) break; } - return sum; - } - -#if 0 - _T lookback(const std::uint32_t tile_id, std::uint32_t* flags_begin) - { - _T sum = 0; - int i = 0; - for (std::int32_t tile = static_cast(tile_id) - 1; tile >= 0; --tile) - { - _AtomicRefT tile_atomic(*(flags_begin + tile + padding)); - std::uint32_t tile_val = 0; - do { - tile_val = tile_atomic.load(); - } while (tile_val == 0); - - sum += tile_val & value_mask; - // If this was a full value, we can stop looking at previous tiles. Otherwise, - // keep going through tiles until we either find a full tile or we've completely - // recomputed the prefix using partial values - if (tile_val & full_mask) - break; - } return sum; } -#endif _AtomicRefT atomic_flag; + _T* scanned_partial_value; + _T* scanned_full_value; + + size_t num_elements; }; template @@ -130,86 +121,57 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); const ::std::size_t n = __in_rng.size(); - auto __max_cu = __queue.get_device().template get_info(); - //std::size_t num_wgs = __max_cu; - //std::size_t num_wgs = 448; - //std::size_t num_wgs = 256; - - // TODO: use wgsize and iters per item from _KernelParam - //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; #ifdef _ONEDPL_SCAN_ITER_SIZE - constexpr ::std::size_t __elems_per_item = _ONEDPL_SCAN_ITER_SIZE; + constexpr ::std::size_t __elems_per_workitem = _ONEDPL_SCAN_ITER_SIZE; #else - constexpr ::std::size_t __elems_per_item = 8; + constexpr ::std::size_t __elems_per_workitem = 8; #endif // Next power of 2 greater than or equal to __n auto __n_uniform = n; if ((__n_uniform & (__n_uniform - 1)) != 0) __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(n) << 1; - //std::size_t wgsize = n/num_wgs/__elems_per_item; - std::size_t wgsize = 256; - std::size_t num_items = __n_uniform/__elems_per_item; - std::size_t num_wgs = num_items/wgsize; - // - //std::size_t wgsize = 256; - //std::size_t num_items = 114688; - + std::size_t num_workitems = __n_uniform / __elems_per_workitem; + std::size_t wgsize = num_workitems > 256 ? 
256 : num_workitems; + std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_workitems, wgsize); - constexpr int status_flag_padding = 32; - std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; - - //printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); + constexpr int status_flag_padding = SUBGROUP_SIZE; + std::uint32_t status_flags_size = num_wgs + status_flag_padding + 1; + std::uint32_t tile_sums_size = num_wgs + status_flag_padding; uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); - //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); + // First status_flags_size elements: partial scanned values (using _BinaryOp) of each workgroup + // Second status_flags_size elements: full scanned values, i.e. sum of the previous and current workgroup's partial sums + _Type* tile_sums = sycl::malloc_device<_Type>(tile_sums_size * 2, __queue); auto fill_event = __queue.submit([&](sycl::handler& hdl) { - hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { int id = item.get_linear_id(); - status_flags[id] = id < status_flag_padding ? __scan_status_flag<_Type>::oob_value : 0; + status_flags[id] = id < status_flag_padding ? __scan_status_flag<_Type>::OUT_OF_BOUNDS + : __scan_status_flag<_Type>::NOT_READY; }); }); - - std::uint32_t elems_in_tile = wgsize*__elems_per_item; - -#define SCAN_KT_DEBUG 0 -#if SCAN_KT_DEBUG - std::vector debug11v(status_flags_size); - __queue.memcpy(debug11v.data(), status_flags, status_flags_size * sizeof(uint32_t)); - - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "flag_before " << i << " " << debug11v[i] << std::endl; - - uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug4 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue); -#endif + std::uint32_t elems_in_tile = wgsize*__elems_per_workitem; auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); auto subgroup = item.get_sub_group(); - // Obtain unique ID for this work-group that will be used in decoupled lookback if (group.leader()) { - sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_size-1]); + sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + idx_atomic(status_flags[status_flags_size - 1]); tile_id_lacc[0] = idx_atomic.fetch_add(1); } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; -#if SCAN_KT_DEBUG - debug5[group.get_group_linear_id()] = tile_id; -#endif auto current_offset = (tile_id*elems_in_tile); auto next_offset = ((tile_id+1)*elems_in_tile); @@ -219,40 +181,22 @@ 
single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto in_end = __in_rng.begin() + next_offset; auto out_begin = __out_rng.begin() + current_offset; - -#if SCAN_KT_DEBUG - debug3[tile_id] = current_offset; - debug4[tile_id] = next_offset; -#endif - if (current_offset >= n) return; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); -#if SCAN_KT_DEBUG - debug1[tile_id] = local_sum; -#endif - _Type prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) { - __scan_status_flag<_Type> flag(status_flags, tile_id); + __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, status_flags_size); if (group.leader()) flag.set_partial(local_sum); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - //sycl::reduce_over_group(item.get_subgroup()) - - - prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags); - //if (group.leader()) - // prev_sum = flag.lookback(tile_id, status_flags); -#if SCAN_KT_DEBUG - debug2[tile_id] = prev_sum; -#endif + prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags, tile_sums); if (group.leader()) flag.set_full(prev_sum + local_sum); @@ -265,40 +209,8 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou event.wait(); -#if SCAN_KT_DEBUG - std::vector debug1v(status_flags_size); - std::vector debug2v(status_flags_size); - std::vector debug3v(status_flags_size); - std::vector debug4v(status_flags_size); - std::vector debug5v(status_flags_size); - std::vector debug6v(status_flags_size); - __queue.memcpy(debug1v.data(), debug1, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug2v.data(), debug2, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug3v.data(), debug3, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug4v.data(), debug4, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug5v.data(), debug5, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug6v.data(), status_flags, status_flags_size * sizeof(uint32_t)); - - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "tile " << i << " " << debug5v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "local_sum " << i << " " << debug1v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - { - auto val = (debug6v[i] & __scan_status_flag<_Type>::value_mask); - int a = val / elems_in_tile; - int b = val % elems_in_tile; - std::cout << "flags " << i << " " << std::bitset<32>(debug6v[i]) << " (" << val<< " = " << a << "/" << elems_in_tile << "+" << b <<")" << std::endl; - } - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "lookback " << i << " " << debug2v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "offset " << i << " " << debug3v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "end " << i << " " << debug4v[i] << std::endl; -#endif - sycl::free(status_flags, __queue); + sycl::free(tile_sums, __queue); } // The generic structure for configuring a kernel @@ -316,16 +228,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera { auto __n = __in_end - __in_begin; -#if SCAN_KT_DEBUG - using _Type = std::remove_pointer_t<_InIterator>; - std::vector<_Type> in_debug(__n); - __queue.memcpy(in_debug.data(), __in_begin, __n * sizeof(_Type)); - - for 
(int i = 0; i < __n; ++i) - std::cout << "input_before " << i << " " << in_debug[i] << std::endl; -#endif - - //printf("KERNEL_TEMPLATE %lu\n", __n); auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); @@ -334,16 +236,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera auto __buf2 = __keep2(__out_begin, __out_begin + __n); single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); - -#if SCAN_KT_DEBUG - std::vector<_Type> in_debug2(__n); - __queue.memcpy(in_debug2.data(), __in_begin, __n * sizeof(_Type)); - - for (int i = 0; i < __n; ++i) - std::cout << "input_after " << i << " " << in_debug2[i] << std::endl; -#endif - - //printf("KERNEL_TEMPLATE DONE %lu\n", __n); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index de5ecafc25b..38a82b026d7 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -22,23 +22,23 @@ int main() { bool all_passed = true; + sycl::queue q; for (int logn : {4, 8, 11, 16, 19, 21}) { - std::cout << "Testing 2^" << logn << '\n'; + std::cout << "Testing 2^" << logn << std::endl; int n = 1 << logn; std::vector v(n, 1); - sycl::queue q; int* in_ptr = sycl::malloc_device(n, q); int* out_ptr = sycl::malloc_device(n, q); - - q.copy(v.data(), in_ptr, n); + q.copy(v.data(), in_ptr, n).wait(); using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); std::vector tmp(n, 0); q.copy(out_ptr, tmp.data(), n); + q.wait(); std::inclusive_scan(v.begin(), v.end(), v.begin()); @@ -53,11 +53,13 @@ main() } if (passed) - std::cout << "passed" << std::endl; + std::cout << " passed" << std::endl; else - std::cout << "failed" << std::endl; + std::cout << " failed" << std::endl; all_passed &= passed; + sycl::free(in_ptr, q); + sycl::free(out_ptr, q); } return !all_passed; From 0c91640f4e128a1f8a574dba72563c03ce1f88e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 7 Nov 2023 13:07:04 +0000 Subject: [PATCH 053/134] Refactored Scan_kt code (#16) * Improved Scan_kt: templated parameters, ballot, wgsize calculation. - Changed number of workgroups calculation from next power of two to next multiple of wgsize - Improved group_ballot by using the class member functions - Using kernel_param struct to determine wgsize and elems per work item. 
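
With this change the launch geometry is derived directly from kernel_param: the input is rounded up to a whole number of tiles of wgsize * elems_per_workitem elements instead of to the next power of two. A standalone sketch of the arithmetic (values mirror the kernel_param<8, 128> used by the test, not a library default):

    #include <cstddef>
    #include <cstdio>

    constexpr std::size_t ceil_div(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

    int main()
    {
        constexpr std::size_t wgsize = 128;           // kernel_param::workgroup_size
        constexpr std::size_t elems_per_workitem = 8; // kernel_param::elems_per_workitem
        constexpr std::size_t elems_in_tile = wgsize * elems_per_workitem;

        std::size_t n = (1u << 16) + 5;                   // deliberately not a power of two
        std::size_t num_wgs = ceil_div(n, elems_in_tile); // whole tiles, rounded up
        std::size_t num_workitems = num_wgs * wgsize;     // size of the padded nd_range

        std::printf("tiles=%zu work-items=%zu\n", num_wgs, num_workitems);
        return 0;
    }

The final, partially filled tile needs no separate padding pass because the kernel clamps that tile's range to n.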
--- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 34 +++++++------------ .../numeric/numeric.ops/scan_kt.pass.cpp | 2 +- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index f52e4ef532f..e7a0ca345e6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -76,12 +76,8 @@ struct __scan_status_flag } while (!sycl::all_of_group(subgroup, flag != NOT_READY)); // Loop till all ready bool is_full = flag == FULL_MASK; - auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); - ::std::uint32_t is_full_ballot_bits{}; - is_full_ballot.extract_bits(is_full_ballot_bits); - - auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); + auto lowest_item_with_full = is_full_ballot.find_low(); // The partial scan results and the full scan sum values are in contiguous memory. // Each section of the memory is of size num_elements. @@ -97,7 +93,7 @@ struct __scan_status_flag // If we found a full value, we can stop looking at previous tiles. Otherwise, // keep going through tiles until we either find a full tile or we've completely // recomputed the prefix using partial values - if (is_full_ballot_bits) + if (is_full_ballot.any()) break; } @@ -121,18 +117,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); const ::std::size_t n = __in_rng.size(); -#ifdef _ONEDPL_SCAN_ITER_SIZE - constexpr ::std::size_t __elems_per_workitem = _ONEDPL_SCAN_ITER_SIZE; -#else - constexpr ::std::size_t __elems_per_workitem = 8; -#endif - // Next power of 2 greater than or equal to __n - auto __n_uniform = n; - if ((__n_uniform & (__n_uniform - 1)) != 0) - __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(n) << 1; - std::size_t num_workitems = __n_uniform / __elems_per_workitem; - std::size_t wgsize = num_workitems > 256 ? 
256 : num_workitems; - std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_workitems, wgsize); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; constexpr int status_flag_padding = SUBGROUP_SIZE; std::uint32_t status_flags_size = num_wgs + status_flag_padding + 1; @@ -151,8 +143,6 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); - std::uint32_t elems_in_tile = wgsize*__elems_per_workitem; - auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); hdl.depends_on(fill_event); @@ -214,10 +204,10 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } // The generic structure for configuring a kernel -template +template struct kernel_param { - static constexpr std::uint16_t data_per_workitem = DataPerWorkItem; + static constexpr std::uint16_t elems_per_workitem = ElemsPerWorkItem; static constexpr std::uint16_t workgroup_size = WorkGroupSize; using kernel_name = KernelName; }; diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index 38a82b026d7..b3407581f37 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -33,7 +33,7 @@ main() int* out_ptr = sycl::malloc_device(n, q); q.copy(v.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; + using KernelParams = oneapi::dpl::experimental::kt::kernel_param<8, 128, class ScanKernel>; oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); std::vector tmp(n, 0); From 78d2d7d1d4cd264f673e3c2eb587f08a58552f70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Wed, 8 Nov 2023 16:47:52 +0000 Subject: [PATCH 054/134] Scan_kt: Single memory allocation for device_memory (#17) and async free of the device memory (#18) * Single memory allocation for device_memory * async free of device memory --------- Co-authored-by: Joe Todd Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index e7a0ca345e6..5773b80e1be 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -127,13 +127,24 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou ::std::size_t num_workitems = num_wgs * wgsize; constexpr int status_flag_padding = SUBGROUP_SIZE; - std::uint32_t status_flags_size = num_wgs + status_flag_padding + 1; - std::uint32_t tile_sums_size = num_wgs + status_flag_padding; + std::size_t status_flags_elems = num_wgs + status_flag_padding + 1; + std::size_t status_flags_size = status_flags_elems * sizeof(std::uint32_t); - uint32_t* status_flags = 
sycl::malloc_device(status_flags_size, __queue); - // First status_flags_size elements: partial scanned values (using _BinaryOp) of each workgroup - // Second status_flags_size elements: full scanned values, i.e. sum of the previous and current workgroup's partial sums - _Type* tile_sums = sycl::malloc_device<_Type>(tile_sums_size * 2, __queue); + std::size_t tile_sums_elems = num_wgs + status_flag_padding; + std::size_t tile_sums_size = status_flags_elems * sizeof(_Type); + + std::size_t extra_mem_for_aligment = alignof(_Type) - (status_flags_size % alignof(_Type)); + // status_flags_size for the status_flags + // extra_mem_for_aligment of the datatype _Type + // First tile_sums_size partial scanned values + // Second tile_sums_size full scanned values (current partial plus all previous workgroups partial) + char* mem_pool = + sycl::malloc_device(status_flags_size + extra_mem_for_aligment + 2 * tile_sums_size, __queue); + + std::size_t tile_sums_offset = status_flags_size + extra_mem_for_aligment; + + std::uint32_t* status_flags = reinterpret_cast(mem_pool); + _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); auto fill_event = __queue.submit([&](sycl::handler& hdl) { hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { @@ -180,7 +191,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) { - __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, status_flags_size); + __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, tile_sums_elems); if (group.leader()) flag.set_partial(local_sum); @@ -197,10 +208,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); - event.wait(); + auto free_event = __queue.submit( + [=](sycl::handler& hdl) + { + hdl.depends_on(event); + hdl.host_task([=](){ sycl::free(mem_pool, __queue); }); + }); - sycl::free(status_flags, __queue); - sycl::free(tile_sums, __queue); + event.wait(); } // The generic structure for configuring a kernel From 55dc287d7b2c17188275a0acfd3a186c98ffad4f Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 8 Nov 2023 16:07:30 +0000 Subject: [PATCH 055/134] Replace sycl::range with sycl::nd_range for fill --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 5773b80e1be..53d925a14c8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -146,13 +146,22 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou std::uint32_t* status_flags = reinterpret_cast(mem_pool); _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); - auto fill_event = __queue.submit([&](sycl::handler& hdl) { - hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { - int id = item.get_linear_id(); - status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::OUT_OF_BOUNDS - : __scan_status_flag<_Type>::NOT_READY; + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_size, wgsize); + + auto fill_event = __queue.submit( + [&](sycl::handler& hdl) + { + hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + [=](const sycl::nd_item<1>& item) + { + int id = item.get_global_linear_id(); + if (id < status_flags_size) + status_flags[id] = + id < status_flag_padding + ? __scan_status_flag<_Type>::OUT_OF_BOUNDS + : __scan_status_flag<_Type>::NOT_READY; + }); }); - }); auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); From 37bfd1de142f82e91aa9e4fbeac857e38d467702 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 8 Nov 2023 19:14:32 +0000 Subject: [PATCH 056/134] Bug fix --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 53d925a14c8..038018a13ac 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -146,7 +146,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou std::uint32_t* status_flags = reinterpret_cast(mem_pool); _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); - ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_size, wgsize); + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_elems, wgsize); auto fill_event = __queue.submit( [&](sycl::handler& hdl) @@ -155,7 +155,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou [=](const sycl::nd_item<1>& item) { int id = item.get_global_linear_id(); - if (id < status_flags_size) + if (id < status_flags_elems) status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::OUT_OF_BOUNDS @@ -177,7 +177,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou { sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> - idx_atomic(status_flags[status_flags_size - 1]); + idx_atomic(status_flags[status_flags_elems - 1]); tile_id_lacc[0] = idx_atomic.fetch_add(1); } sycl::group_barrier(group); From 21038df158b2196188f675827cba0b5e2bd47f97 Mon Sep 17 00:00:00 2001 From: Aidan Date: Wed, 8 Nov 2023 13:21:32 +0000 Subject: [PATCH 057/134] Global to local then perform op --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 038018a13ac..846208007da 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -165,11 +165,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); + auto local_id = item.get_local_id(0); + auto stride = item.get_local_range(0); auto subgroup = item.get_sub_group(); // Obtain unique ID for this work-group that will be used in decoupled lookback @@ -183,16 +186,33 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; - auto current_offset = (tile_id*elems_in_tile); - auto next_offset = ((tile_id+1)*elems_in_tile); - if (next_offset > n) - next_offset = n; - auto in_begin = __in_rng.begin() + current_offset; - auto in_end = __in_rng.begin() + next_offset; - auto out_begin = __out_rng.begin() + current_offset; - - if (current_offset >= n) + // Global load into local + auto wg_current_offset = (tile_id*elems_in_tile); + auto wg_next_offset = ((tile_id+1)*elems_in_tile); + size_t wg_local_memory_size = elems_in_tile; + if (wg_current_offset >= n) return; + if (wg_next_offset >= n) { + wg_local_memory_size = n - wg_current_offset; + wg_next_offset = n; // Not needed + } + + // TODO: vectorize loads, where possible + if (wg_next_offset <= n) { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; + } else { + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { + if (wg_current_offset + stride * i < n) + tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + stride * i]; + } + } + sycl::group_barrier(group); + + auto in_begin = tile_vals.get_pointer(); + auto in_end = in_begin + wg_local_memory_size; + auto out_begin = __out_rng.begin() + wg_current_offset; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); _Type prev_sum = 0; From bdcc9c9da190cbcc7499a7ba4ff50b824c4f9447 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 10 Nov 2023 13:51:35 +0000 Subject: [PATCH 
058/134] Update based on feedback --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 846208007da..1bd10595413 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -192,20 +192,17 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou size_t wg_local_memory_size = elems_in_tile; if (wg_current_offset >= n) return; - if (wg_next_offset >= n) { + if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; - wg_next_offset = n; // Not needed - } - // TODO: vectorize loads, where possible if (wg_next_offset <= n) { _ONEDPL_PRAGMA_UNROLL for (std::uint32_t i = 0; i < elems_per_workitem; ++i) tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; } else { for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { - if (wg_current_offset + stride * i < n) - tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + stride * i]; + if (wg_current_offset + local_id + stride * i < n) + tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; } } sycl::group_barrier(group); From 9717e095cd3155db9f7175b8a580e114f7a178c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 21 Nov 2023 11:48:48 +0000 Subject: [PATCH 059/134] Refactored cooperative_loopback and memory implementation (#24) * Refactored cooperative_loopback and memory implementation detail * renamed load_counter to fetch_add_counter * Removed dynamic tile counter from the scan memory struct * scratch memory Reordering * Fixed wrong values returned in LoopbackScanMemory.get_value * Improved Class and variable naming --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 334 +++++++++++++----- 1 file changed, 253 insertions(+), 81 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 1bd10595413..314ace11410 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -16,51 +16,244 @@ #ifndef _ONEDPL_parallel_backend_sycl_scan_H #define _ONEDPL_parallel_backend_sycl_scan_H +#include +#include + namespace oneapi::dpl::experimental::kt { inline namespace igpu { -constexpr size_t SUBGROUP_SIZE = 32; +constexpr ::std::size_t SUBGROUP_SIZE = 32; + +template typename LoopbackScanMemory, typename TileId> +struct ScanMemoryManager +{ + using _TileIdT = typename TileId::_TileIdT; + using _FlagT = typename LoopbackScanMemory::_FlagT; + + ScanMemoryManager(sycl::queue q) : q{q} {}; + + ::std::uint8_t* + scan_memory_ptr() noexcept + { + return scan_memory_begin; + }; + + _TileIdT* + tile_id_ptr() noexcept + { + return tile_id_begin; + }; + + void + allocate(::std::size_t num_wgs) + { + ::std::size_t scan_memory_size = LoopbackScanMemory::get_memory_size(num_wgs); + constexpr ::std::size_t padded_tileid_size = TileId::get_padded_memory_size(); + constexpr ::std::size_t tileid_size = TileId::get_memory_size(); + + auto mem_size_bytes = scan_memory_size + padded_tileid_size; + + scratch = sycl::malloc_device<::std::uint8_t>(mem_size_bytes, q); + + scan_memory_begin = scratch; + + void* base_tileid_ptr = 
reinterpret_cast(scan_memory_begin + scan_memory_size); + size_t remainder = mem_size_bytes - scan_memory_size; + + tile_id_begin = reinterpret_cast<_TileIdT*>( + ::std::align(::std::alignment_of_v<_TileIdT>, tileid_size, base_tileid_ptr, remainder)); + } + + sycl::event + async_free(sycl::event dependency) + { + return q.submit( + [e = dependency, ptr = scratch, q_ = q](sycl::handler& hdl) + { + hdl.depends_on(e); + hdl.host_task([=]() { sycl::free(ptr, q_); }); + }); + } + + private: + ::std::uint8_t* scratch = nullptr; + ::std::uint8_t* scan_memory_begin = nullptr; + _TileIdT* tile_id_begin = nullptr; + + sycl::queue q; +}; -template -struct __scan_status_flag +template +struct LoopbackScanMemory { - using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::acq_rel, sycl::memory_scope::device, - sycl::access::address_space::global_space>; - static constexpr std::uint32_t NOT_READY = 0; - static constexpr std::uint32_t PARTIAL_MASK = 1; - static constexpr std::uint32_t FULL_MASK = 2; - static constexpr std::uint32_t OUT_OF_BOUNDS = 4; - - static constexpr int padding = SUBGROUP_SIZE; - - __scan_status_flag(const std::uint32_t tile_id, std::uint32_t* flags_begin, _T* tile_sums, - size_t num_elements) - : atomic_flag(*(flags_begin + tile_id + padding)), scanned_partial_value(tile_sums + tile_id + padding), - scanned_full_value(tile_sums + tile_id + padding + num_elements), num_elements{num_elements} + using _FlagT = ::std::uint32_t; + using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::acq_rel, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + + static constexpr _FlagT NOT_READY = 0; + static constexpr _FlagT PARTIAL_MASK = 1; + static constexpr _FlagT FULL_MASK = 2; + static constexpr _FlagT OUT_OF_BOUNDS = 4; + + static constexpr ::std::size_t padding = SUBGROUP_SIZE; + + LoopbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + : num_elements(get_num_elements(num_wgs)) { + // LoopbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] 
+ // Each section has num_wgs + padding elements + tile_values_begin = reinterpret_cast<_T*>(scan_memory_begin); + flags_begin = get_flags_begin(scan_memory_begin, num_wgs); } void - set_partial(_T val) + set_partial(::std::size_t tile_id, _T val) { - (*scanned_partial_value) = val; + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + tile_values_begin[tile_id + padding] = val; atomic_flag.store(PARTIAL_MASK); } void - set_full(_T val) + set_full(::std::size_t tile_id, _T val) { - (*scanned_full_value) = val; + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + tile_values_begin[tile_id + padding + num_elements] = val; atomic_flag.store(FULL_MASK); } - template + _FlagT + load_flag(::std::size_t tile_id) const + { + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + return atomic_flag.load(); + } + + _T + get_value(::std::size_t tile_id, _FlagT flag) const + { + ::std::size_t offset = tile_id + padding + num_elements * is_full(flag); + return tile_values_begin[offset]; + } + + static ::std::size_t + get_tile_values_bytes(::std::size_t num_elements) + { + return (2 * num_elements) * sizeof(_T); + } + + static ::std::size_t + get_flag_bytes(::std::size_t num_elements) + { + return num_elements * sizeof(_FlagT); + } + + static ::std::size_t + get_padded_flag_bytes(::std::size_t num_elements) + { + // sizeof(_FlagT) extra bytes for possible intenal alignment + return get_flag_bytes(num_elements) + sizeof(_FlagT); + } + + static _FlagT* + get_flags_begin(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + { + // Aligned flags + ::std::size_t num_elements = get_num_elements(num_wgs); + ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); + void* base_flags = reinterpret_cast(scan_memory_begin + tile_values_bytes); + auto remainder = get_padded_flag_bytes(num_elements); // scan_memory_bytes - tile_values_bytes + return reinterpret_cast<_FlagT*>( + ::std::align(::std::alignment_of_v<_FlagT>, get_flag_bytes(num_elements), base_flags, remainder)); + } + + static ::std::size_t + get_memory_size(::std::size_t num_wgs) + { + ::std::size_t num_elements = get_num_elements(num_wgs); + // sizeof(_T) extra bytes are not needed because LoopbackScanMemory is going at the beginning of the scratch + ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); + // Padding to provide room for aligment + ::std::size_t flag_bytes = get_padded_flag_bytes(num_elements); + + return tile_values_bytes + flag_bytes; + } + + static ::std::size_t + get_num_elements(::std::size_t num_wgs) + { + return padding + num_wgs; + } + + static bool + is_ready(_FlagT flag) + { + return flag != NOT_READY; + } + + static bool + is_full(_FlagT flag) + { + return flag == FULL_MASK; + } + + static bool + is_out_of_bounds(_FlagT flag) + { + return flag == OUT_OF_BOUNDS; + } + + private: + ::std::size_t num_elements; + _FlagT* flags_begin; + _T* tile_values_begin; +}; + +struct TileId +{ + using _TileIdT = ::std::uint32_t; + using _AtomicTileRefT = sycl::atomic_ref<_TileIdT, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + + TileId(_TileIdT* tileid_memory) : tile_counter(*(tileid_memory)) {} + + constexpr static ::std::size_t + get_padded_memory_size() + { + // extra sizeof(_TileIdT) for possible aligment issues + return sizeof(_TileIdT) + sizeof(_TileIdT); + } + + constexpr static ::std::size_t + get_memory_size() + { + // extra sizeof(_TileIdT) for possible aligment issues + return 
sizeof(_TileIdT); + } + + _TileIdT + fetch_inc() + { + return tile_counter.fetch_add(1); + } + + _AtomicTileRefT tile_counter; +}; + +struct cooperative_lookback +{ + + template typename LoopbackScanMemory> _T - cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin, - _T* tile_sums) + operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, LoopbackScanMemory<_T> memory) { + using FlagT = typename LoopbackScanMemory<_T>::_FlagT; + _T sum = 0; int offset = -1; int i = 0; @@ -68,24 +261,20 @@ struct __scan_status_flag for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { - _AtomicRefT tile_atomic(*(flags_begin + tile + padding - local_id)); - std::uint32_t flag; + FlagT flag; do { - flag = tile_atomic.load(); - } while (!sycl::all_of_group(subgroup, flag != NOT_READY)); // Loop till all ready + flag = memory.load_flag(tile - local_id); + } while (!sycl::all_of_group(subgroup, LoopbackScanMemory<_T>::is_ready(flag))); // Loop till all ready - bool is_full = flag == FULL_MASK; + bool is_full = LoopbackScanMemory<_T>::is_full(flag); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); auto lowest_item_with_full = is_full_ballot.find_low(); - // The partial scan results and the full scan sum values are in contiguous memory. - // Each section of the memory is of size num_elements. - // The partial sum for a tile is at [i] and the full sum is at [i + num_elements] - // is_full * num_elements allows to select between the two values without branching the code. - size_t contrib_offset = tile + padding - local_id + is_full * num_elements; - _T val = *(tile_sums + contrib_offset); - _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) ? val : _T{0}; + // TODO: Use identity_fn for out of bounds values + _T contribution = local_id <= lowest_item_with_full && !LoopbackScanMemory<_T>::is_out_of_bounds(flag) + ? 
memory.get_value(tile - local_id, flag) + : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) sum += sycl::reduce_over_group(subgroup, contribution, bin_op); @@ -100,12 +289,6 @@ struct __scan_status_flag return sum; } - - _AtomicRefT atomic_flag; - _T* scanned_partial_value; - _T* scanned_full_value; - - size_t num_elements; }; template @@ -113,6 +296,8 @@ void single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _TileIdT = TileId::_TileIdT; + using _FlagT = typename LoopbackScanMemory<_Type>::_FlagT; static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); @@ -122,31 +307,22 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; // Avoid non_uniform n by padding up to a multiple of wgsize - std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; - constexpr int status_flag_padding = SUBGROUP_SIZE; - std::size_t status_flags_elems = num_wgs + status_flag_padding + 1; - std::size_t status_flags_size = status_flags_elems * sizeof(std::uint32_t); - - std::size_t tile_sums_elems = num_wgs + status_flag_padding; - std::size_t tile_sums_size = status_flags_elems * sizeof(_Type); + ScanMemoryManager<_Type, LoopbackScanMemory, TileId> scratch(__queue); + scratch.allocate(num_wgs); - std::size_t extra_mem_for_aligment = alignof(_Type) - (status_flags_size % alignof(_Type)); - // status_flags_size for the status_flags - // extra_mem_for_aligment of the datatype _Type - // First tile_sums_size partial scanned values - // Second tile_sums_size full scanned values (current partial plus all previous workgroups partial) - char* mem_pool = - sycl::malloc_device(status_flags_size + extra_mem_for_aligment + 2 * tile_sums_size, __queue); + // Memory Structure: + // [Loopback Scan Memory, Tile Id Counter] + auto scan_memory_begin = scratch.scan_memory_ptr(); + auto status_flags_begin = LoopbackScanMemory<_Type>::get_flags_begin(scan_memory_begin, num_wgs); + auto tile_id_begin = scratch.tile_id_ptr(); - std::size_t tile_sums_offset = status_flags_size + extra_mem_for_aligment; - - std::uint32_t* status_flags = reinterpret_cast(mem_pool); - _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); - - ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_elems, wgsize); + ::std::size_t num_elements = LoopbackScanMemory<_Type>::get_num_elements(num_wgs); + // fill_num_wgs num_elements + 1 to also initialize tile_id_counter + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); auto fill_event = __queue.submit( [&](sycl::handler& hdl) @@ -155,14 +331,17 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou [=](const sycl::nd_item<1>& item) { int id = item.get_global_linear_id(); - if (id < status_flags_elems) - status_flags[id] = - id < status_flag_padding - ? __scan_status_flag<_Type>::OUT_OF_BOUNDS - : __scan_status_flag<_Type>::NOT_READY; + if (id < num_elements) + status_flags_begin[id] = + id < LoopbackScanMemory<_Type>::padding + ? 
LoopbackScanMemory<_Type>::OUT_OF_BOUNDS + : LoopbackScanMemory<_Type>::NOT_READY; + if (id == num_elements) + tile_id_begin[0] = 0; }); }); + auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); @@ -176,12 +355,10 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto subgroup = item.get_sub_group(); // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); if (group.leader()) { - sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space> - idx_atomic(status_flags[status_flags_elems - 1]); - tile_id_lacc[0] = idx_atomic.fetch_add(1); + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; @@ -207,7 +384,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } sycl::group_barrier(group); - auto in_begin = tile_vals.get_pointer(); + auto in_begin = tile_vals.template get_multi_ptr().get(); auto in_end = in_begin + wg_local_memory_size; auto out_begin = __out_rng.begin() + wg_current_offset; @@ -217,16 +394,16 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) { - __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, tile_sums_elems); + LoopbackScanMemory<_Type> scan_mem(scan_memory_begin, num_wgs); if (group.leader()) - flag.set_partial(local_sum); + scan_mem.set_partial(tile_id, local_sum); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags, tile_sums); + prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); if (group.leader()) - flag.set_full(prev_sum + local_sum); + scan_mem.set_full(tile_id, prev_sum + local_sum); } prev_sum = sycl::group_broadcast(group, prev_sum, 0); @@ -234,12 +411,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); - auto free_event = __queue.submit( - [=](sycl::handler& hdl) - { - hdl.depends_on(event); - hdl.host_task([=](){ sycl::free(mem_pool, __queue); }); - }); + scratch.async_free(event); event.wait(); } From 8d23836ebb42aa9650cc0b3a865a4c716cb6e98b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Thu, 23 Nov 2023 14:11:27 +0000 Subject: [PATCH 060/134] [Scan_kt] Atomic64 flags + value implementation (#25) * Implemented atomic64 version of the scan_kt pass * Removed repeated offset calculation for tile id atomic flag * Loopback -> Lookback. Removed unused var. 
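Note on the encoding this patch introduces (an illustrative sketch only, not the code added below): each tile's status flag and its 32-bit scanned value are packed into one 64-bit word, so a single relaxed atomic store publishes both and the lookback can never observe a flag/value pair that is out of sync. The identifiers in this sketch are placeholders; the actual masks appear in the LookbackScanMemory<_T, std::true_type> specialization in the diff.

#include <cstdint>
#include <cassert>

using flag_t = std::uint64_t;

constexpr flag_t NOT_READY     = 0;
constexpr flag_t PARTIAL_MASK  = flag_t{1} << 62;           // 01... : partial tile sum published
constexpr flag_t FULL_MASK     = flag_t{1} << 63;           // 10... : full prefix published
constexpr flag_t OUT_OF_BOUNDS = PARTIAL_MASK | FULL_MASK;  // 11... : padding tile
constexpr flag_t VALUE_MASK    = (flag_t{1} << 32) - 1;     // low 32 bits carry the value

// One word carries both pieces, so a single 64-bit atomic store publishes them together.
constexpr flag_t pack(flag_t status, std::uint32_t value) { return status | value; }
constexpr std::uint32_t unpack_value(flag_t flag) { return static_cast<std::uint32_t>(flag & VALUE_MASK); }
constexpr bool is_full(flag_t flag) { return (flag & OUT_OF_BOUNDS) == FULL_MASK; }

int main()
{
    // Mirrors the worked example in the patch: a "full" status holding the value 15.
    flag_t f = pack(FULL_MASK, 15u);
    assert(is_full(f));
    assert(unpack_value(f) == 15u);
    return 0;
}

Because the value now travels inside the flag word, the atomic64 specialization's get_value() can ignore the tile index and the separate tile-values array disappears, which is why its get_memory_size() shrinks to num_elements * sizeof(_FlagT). This packing only works while the element type fits in 32 bits, matching the sizeof check and sycl::aspect::atomic64 dispatch added at the call site.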
--- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 351 ++++++++++++------ 1 file changed, 243 insertions(+), 108 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 314ace11410..6dfe1bb6ef1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -26,11 +26,13 @@ inline namespace igpu { constexpr ::std::size_t SUBGROUP_SIZE = 32; -template typename LoopbackScanMemory, typename TileId> +template typename LookbackScanMemory, + typename TileId> struct ScanMemoryManager { using _TileIdT = typename TileId::_TileIdT; - using _FlagT = typename LoopbackScanMemory::_FlagT; + using _LookbackScanMemory = LookbackScanMemory; + using _FlagT = typename _LookbackScanMemory::_FlagT; ScanMemoryManager(sycl::queue q) : q{q} {}; @@ -49,7 +51,7 @@ struct ScanMemoryManager void allocate(::std::size_t num_wgs) { - ::std::size_t scan_memory_size = LoopbackScanMemory::get_memory_size(num_wgs); + ::std::size_t scan_memory_size = _LookbackScanMemory::get_memory_size(num_wgs); constexpr ::std::size_t padded_tileid_size = TileId::get_padded_memory_size(); constexpr ::std::size_t tileid_size = TileId::get_memory_size(); @@ -85,8 +87,11 @@ struct ScanMemoryManager sycl::queue q; }; +template +struct LookbackScanMemory; + template -struct LoopbackScanMemory +struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::false_type> { using _FlagT = ::std::uint32_t; using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::acq_rel, sycl::memory_scope::device, @@ -99,13 +104,12 @@ struct LoopbackScanMemory static constexpr ::std::size_t padding = SUBGROUP_SIZE; - LoopbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) - : num_elements(get_num_elements(num_wgs)) + // LookbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] + // Each section has num_wgs + padding elements + LookbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + : num_elements(get_num_elements(num_wgs)), tile_values_begin(reinterpret_cast<_T*>(scan_memory_begin)), + flags_begin(get_flags_begin(scan_memory_begin, num_wgs)) { - // LoopbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] 
- // Each section has num_wgs + padding elements - tile_values_begin = reinterpret_cast<_T*>(scan_memory_begin); - flags_begin = get_flags_begin(scan_memory_begin, num_wgs); } void @@ -126,19 +130,17 @@ struct LoopbackScanMemory atomic_flag.store(FULL_MASK); } - _FlagT - load_flag(::std::size_t tile_id) const + _AtomicFlagRefT + get_flag(::std::size_t tile_id) const { - _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); - - return atomic_flag.load(); + return _AtomicFlagRefT(*(flags_begin + tile_id + padding)); } _T get_value(::std::size_t tile_id, _FlagT flag) const { - ::std::size_t offset = tile_id + padding + num_elements * is_full(flag); - return tile_values_begin[offset]; + // full_value and partial_value are num_elements apart + return *(tile_values_begin + tile_id + padding + num_elements * is_full(flag)); } static ::std::size_t @@ -176,7 +178,7 @@ struct LoopbackScanMemory get_memory_size(::std::size_t num_wgs) { ::std::size_t num_elements = get_num_elements(num_wgs); - // sizeof(_T) extra bytes are not needed because LoopbackScanMemory is going at the beginning of the scratch + // sizeof(_T) extra bytes are not needed because LookbackScanMemory is going at the beginning of the scratch ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); // Padding to provide room for aligment ::std::size_t flag_bytes = get_padded_flag_bytes(num_elements); @@ -214,6 +216,110 @@ struct LoopbackScanMemory _T* tile_values_begin; }; +template +struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::true_type> +{ + using _FlagT = ::std::uint64_t; + using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + + // Each flag is divided in 2 32bit values + // 32..63 status bits + // 00..31 value bits + // Example: status = full scanned value, int value = 15: + // 1000 0000 0000 0000 0000 0000 0000 0000 | 0000 0000 0000 0000 0000 0000 0000 1111 + + // Status values: + // 00xxxx - not computed + // 01xxxx - partial + // 10xxxx - full + // 110000 - out of bounds + + static constexpr _FlagT NOT_READY = 0; + static constexpr _FlagT PARTIAL_MASK = 1l << (sizeof(_FlagT) * 8 - 2); + static constexpr _FlagT FULL_MASK = 1l << (sizeof(_FlagT) * 8 - 1); + static constexpr _FlagT OUT_OF_BOUNDS = PARTIAL_MASK | FULL_MASK; + + static constexpr _FlagT VALUE_MASK = (1l << sizeof(::std::uint32_t) * 8) - 1; // 32 bit mask to store value + + static constexpr ::std::size_t padding = SUBGROUP_SIZE; + + LookbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + : num_elements(get_num_elements(num_wgs)), flags_begin(get_flags_begin(scan_memory_begin, num_wgs)) + { + } + + void + set_partial(::std::size_t tile_id, _T val) + { + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + atomic_flag.store(PARTIAL_MASK | static_cast<::std::uint32_t>(val)); + } + + void + set_full(::std::size_t tile_id, _T val) + { + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + atomic_flag.store(FULL_MASK | static_cast<::std::uint32_t>(val)); + } + + _AtomicFlagRefT + get_flag(::std::size_t tile_id) const + { + return _AtomicFlagRefT(*(flags_begin + tile_id + padding)); + } + + _T + get_value(::std::size_t, _FlagT flag) const + { + return static_cast<::std::uint32_t>(flag & VALUE_MASK); + } + + static _FlagT* + get_flags_begin(::std::uint8_t* scan_memory_begin, ::std::size_t) + { + return reinterpret_cast<_FlagT*>(scan_memory_begin); + } + + static ::std::size_t + 
get_memory_size(::std::size_t num_wgs) + { + ::std::size_t num_elements = get_num_elements(num_wgs); + return num_elements * sizeof(_FlagT); + } + + static ::std::size_t + get_num_elements(::std::size_t num_wgs) + { + return padding + num_wgs; + } + + static bool + is_ready(_FlagT flag) + { + // flag & OUT_OF_BOUNDS != NOT_READY means it has either partial or full value, or is out of bounds + return (flag & OUT_OF_BOUNDS) != NOT_READY; + } + + static bool + is_full(_FlagT flag) + { + return (flag & OUT_OF_BOUNDS) == FULL_MASK; + } + + static bool + is_out_of_bounds(_FlagT flag) + { + return (flag & OUT_OF_BOUNDS) == OUT_OF_BOUNDS; + } + + private: + ::std::size_t num_elements; + _FlagT* flags_begin; +}; + struct TileId { using _TileIdT = ::std::uint32_t; @@ -248,11 +354,14 @@ struct TileId struct cooperative_lookback { - template typename LoopbackScanMemory> + template typename LookbackScanMemory, typename UseAtomic64> _T - operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, LoopbackScanMemory<_T> memory) + operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, + LookbackScanMemory<_T, UseAtomic64> memory) { - using FlagT = typename LoopbackScanMemory<_T>::_FlagT; + using _LookbackScanMemory = LookbackScanMemory<_T, UseAtomic64>; + using FlagT = typename _LookbackScanMemory::_FlagT; _T sum = 0; int offset = -1; @@ -261,18 +370,19 @@ struct cooperative_lookback for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { + auto atomic_flag = memory.get_flag(tile - local_id); FlagT flag; do { - flag = memory.load_flag(tile - local_id); - } while (!sycl::all_of_group(subgroup, LoopbackScanMemory<_T>::is_ready(flag))); // Loop till all ready + flag = atomic_flag.load(); + } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag))); // Loop till all ready - bool is_full = LoopbackScanMemory<_T>::is_full(flag); + bool is_full = _LookbackScanMemory::is_full(flag); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); auto lowest_item_with_full = is_full_ballot.find_low(); // TODO: Use identity_fn for out of bounds values - _T contribution = local_id <= lowest_item_with_full && !LoopbackScanMemory<_T>::is_out_of_bounds(flag) + _T contribution = local_id <= lowest_item_with_full && !_LookbackScanMemory::is_out_of_bounds(flag) ? 
memory.get_value(tile - local_id, flag) : _T{0}; @@ -291,124 +401,131 @@ struct cooperative_lookback } }; -template +template void single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _TileIdT = TileId::_TileIdT; - using _FlagT = typename LoopbackScanMemory<_Type>::_FlagT; + using _LookbackScanMemory = LookbackScanMemory<_Type, _UseAtomic64>; + using _FlagT = typename _LookbackScanMemory::_FlagT; - static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); + static_assert(std::is_same_v<_Inclusive, ::std::true_type>, "Single-pass scan only available for inclusive scan"); const ::std::size_t n = __in_rng.size(); constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; - ScanMemoryManager<_Type, LoopbackScanMemory, TileId> scratch(__queue); + ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); scratch.allocate(num_wgs); // Memory Structure: - // [Loopback Scan Memory, Tile Id Counter] + // [Lookback Scan Memory, Tile Id Counter] auto scan_memory_begin = scratch.scan_memory_ptr(); - auto status_flags_begin = LoopbackScanMemory<_Type>::get_flags_begin(scan_memory_begin, num_wgs); + auto status_flags_begin = _LookbackScanMemory::get_flags_begin(scan_memory_begin, num_wgs); auto tile_id_begin = scratch.tile_id_ptr(); - ::std::size_t num_elements = LoopbackScanMemory<_Type>::get_num_elements(num_wgs); + ::std::size_t num_elements = _LookbackScanMemory::get_num_elements(num_wgs); // fill_num_wgs num_elements + 1 to also initialize tile_id_counter ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); auto fill_event = __queue.submit( [&](sycl::handler& hdl) { - hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - [=](const sycl::nd_item<1>& item) - { - int id = item.get_global_linear_id(); - if (id < num_elements) - status_flags_begin[id] = - id < LoopbackScanMemory<_Type>::padding - ? LoopbackScanMemory<_Type>::OUT_OF_BOUNDS - : LoopbackScanMemory<_Type>::NOT_READY; - if (id == num_elements) - tile_id_begin[0] = 0; - }); + hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + [=](const sycl::nd_item<1>& item) + { + int id = item.get_global_linear_id(); + if (id < num_elements) + status_flags_begin[id] = id < _LookbackScanMemory::padding + ? 
_LookbackScanMemory::OUT_OF_BOUNDS + : _LookbackScanMemory::NOT_READY; + if (id == num_elements) + tile_id_begin[0] = 0; + }); }); - auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto local_id = item.get_local_id(0); - auto stride = item.get_local_range(0); - auto subgroup = item.get_sub_group(); - - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - std::uint32_t tile_id = tile_id_lacc[0]; - - // Global load into local - auto wg_current_offset = (tile_id*elems_in_tile); - auto wg_next_offset = ((tile_id+1)*elems_in_tile); - size_t wg_local_memory_size = elems_in_tile; - if (wg_current_offset >= n) - return; - if (wg_next_offset > n) - wg_local_memory_size = n - wg_current_offset; - - if (wg_next_offset <= n) { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; - } else { - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { - if (wg_current_offset + local_id + stride * i < n) - tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; - } - } - sycl::group_barrier(group); - - auto in_begin = tile_vals.template get_multi_ptr().get(); - auto in_end = in_begin + wg_local_memory_size; - auto out_begin = __out_rng.begin() + wg_current_offset; - - auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); - _Type prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (subgroup.get_group_id() == 0) - { - LoopbackScanMemory<_Type> scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); - } - - prev_sum = sycl::group_broadcast(group, prev_sum, 0); - sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); - }); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] + { + auto group = item.get_group(); + auto local_id = item.get_local_id(0); + auto stride = item.get_local_range(0); + auto subgroup = item.get_sub_group(); + + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + std::uint32_t tile_id = tile_id_lacc[0]; + + // Global load into local + auto wg_current_offset = (tile_id * elems_in_tile); + auto wg_next_offset = ((tile_id + 1) * elems_in_tile); + size_t wg_local_memory_size = elems_in_tile; + if (wg_current_offset >= n) + return; + if (wg_next_offset > n) + 
wg_local_memory_size = n - wg_current_offset; + + if (wg_next_offset <= n) + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + tile_vals[local_id + stride * i] = + __in_rng[wg_current_offset + local_id + stride * i]; + } + else + { + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + { + if (wg_current_offset + local_id + stride * i < n) + tile_vals[local_id + stride * i] = + __in_rng[wg_current_offset + local_id + stride * i]; + } + } + sycl::group_barrier(group); + + auto in_begin = tile_vals.template get_multi_ptr().get(); + auto in_end = in_begin + wg_local_memory_size; + auto out_begin = __out_rng.begin() + wg_current_offset; + + auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); + _Type prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (subgroup.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + local_sum); + } + + prev_sum = sycl::group_broadcast(group, prev_sum, 0); + sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); + }); }); scratch.async_free(event); @@ -438,7 +555,25 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + // Avoid aspect query overhead for sizeof(Types) > 32 bits + if constexpr (sizeof(typename std::iterator_traits<_InIterator>::value_type) <= sizeof(std::uint32_t)) + { + if (__queue.get_device().has(sycl::aspect::atomic64)) + { + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + } + else + { + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + } + } + else + { + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + } } } // inline namespace igpu From c3c3218c6e2dedd844df5f3023d7a315ad12d0ac Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:39:49 +0000 Subject: [PATCH 061/134] constexpr, types and remove an unneeded check --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 6dfe1bb6ef1..266d4b18657 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -364,8 +364,7 @@ struct cooperative_lookback using FlagT = typename _LookbackScanMemory::_FlagT; _T sum = 0; - int offset = -1; - int i = 0; + constexpr int offset = -1; int local_id = subgroup.get_local_id(); for (int tile = 
static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) @@ -418,7 +417,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; // Avoid non_uniform n by padding up to a multiple of wgsize - ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; @@ -461,8 +460,8 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); - auto local_id = item.get_local_id(0); - auto stride = item.get_local_range(0); + ::std::uint32_t local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; auto subgroup = item.get_sub_group(); // Obtain unique ID for this work-group that will be used in decoupled lookback @@ -477,9 +476,8 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); auto wg_next_offset = ((tile_id + 1) * elems_in_tile); - size_t wg_local_memory_size = elems_in_tile; - if (wg_current_offset >= n) - return; + auto wg_local_memory_size = elems_in_tile; + if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; @@ -502,7 +500,6 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou sycl::group_barrier(group); auto in_begin = tile_vals.template get_multi_ptr().get(); - auto in_end = in_begin + wg_local_memory_size; auto out_begin = __out_rng.begin() + wg_current_offset; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); From d2577024a0755c1cf9c3993ff2de02e6060af71f Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:40:04 +0000 Subject: [PATCH 062/134] Correct static_cast ? 
--- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 266d4b18657..0655b60deb1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -274,7 +274,7 @@ struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::true_type> _T get_value(::std::size_t, _FlagT flag) const { - return static_cast<::std::uint32_t>(flag & VALUE_MASK); + return static_cast<_T>(flag & VALUE_MASK); } static _FlagT* From 43e17ba4c5aeba6a519ef8ba063210f8a08d73df Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:40:19 +0000 Subject: [PATCH 063/134] Defer group comms in lookback --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 0655b60deb1..ce186b4ffa4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -386,8 +386,7 @@ struct cooperative_lookback : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) - sum += sycl::reduce_over_group(subgroup, contribution, bin_op); - + sum = bin_op(sum, contribution); // If we found a full value, we can stop looking at previous tiles. Otherwise, // keep going through tiles until we either find a full tile or we've completely // recomputed the prefix using partial values @@ -395,6 +394,7 @@ struct cooperative_lookback break; } + sum = sycl::reduce_over_group(subgroup, sum, bin_op); return sum; } From e5b3ca4bb386bb1a665f178348e63904f9aac61b Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:40:52 +0000 Subject: [PATCH 064/134] Disable dynamic tile ID by default TODO: we still allocate & initialize the memory for the counter --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index ce186b4ffa4..007186a2f9a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -400,8 +400,8 @@ struct cooperative_lookback } }; -template +template void single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { @@ -464,14 +464,23 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr ::std::uint32_t stride = wgsize; auto subgroup = item.get_sub_group(); - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - std::uint32_t tile_id = tile_id_lacc[0]; + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + 
tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); @@ -557,18 +566,18 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera { if (__queue.get_device().has(sycl::aspect::atomic64)) { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type>( + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::false_type>( __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } } From ab346da026243d430f95b1f85ad86d20711f3939 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:41:32 +0000 Subject: [PATCH 065/134] Reduce from register sums instead of local mem Also use #pragma unroll for now --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 007186a2f9a..e43cfee6aa6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -489,29 +489,36 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; - + _Type my_reducer{}; if (wg_next_offset <= n) { - _ONEDPL_PRAGMA_UNROLL + #pragma unroll for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - tile_vals[local_id + stride * i] = - __in_rng[wg_current_offset + local_id + stride * i]; + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } } else { + #pragma unroll for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { if (wg_current_offset + local_id + stride * i < n) - tile_vals[local_id + stride * i] = - __in_rng[wg_current_offset + local_id + stride * i]; + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } } } - sycl::group_barrier(group); + + auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); auto in_begin = tile_vals.template get_multi_ptr().get(); auto out_begin = __out_rng.begin() + wg_current_offset; - auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); _Type prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix From f87573c3f7b835fb3fea91735d33036af6652931 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:42:06 +0000 
Subject: [PATCH 066/134] Unrolled version of joint_inclusive_scan --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index e43cfee6aa6..68921c08c3c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -536,8 +536,23 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou scan_mem.set_full(tile_id, prev_sum + local_sum); } - prev_sum = sycl::group_broadcast(group, prev_sum, 0); - sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); + _Type carry = sycl::group_broadcast(group, prev_sum, 0); + #pragma unroll + for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type x; + if (i + local_id < wg_local_memory_size) + { + x = in_begin[i + local_id]; + } + _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); + if (i + local_id < wg_local_memory_size) + { + out_begin[i + local_id] = out; + } + carry = group_broadcast(group, out, stride - 1); + } }); }); From 621adf7eb4d661a4d1a9ef0ee65b3990987d7f69 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 15:25:30 +0000 Subject: [PATCH 067/134] Update include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Alberto Cabrera Pérez --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 68921c08c3c..dae5cd7a48e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -537,6 +537,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } _Type carry = sycl::group_broadcast(group, prev_sum, 0); + // TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL #pragma unroll for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) { From b8c837f099a6961b5b3eef75489f8c39058f39e0 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 15:27:43 +0000 Subject: [PATCH 068/134] Add TODO --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index dae5cd7a48e..a85d86aeb31 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -489,6 +489,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; + //TODO: assumes default ctor produces identity w.r.t. 
__binary_op _Type my_reducer{}; if (wg_next_offset <= n) { From 8367be7eec618670aa809659994e85f97f7fe976 Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Tue, 28 Nov 2023 15:55:38 +0000 Subject: [PATCH 069/134] Changing fill kernel for a memset --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index a85d86aeb31..c1e1d2c0cbd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -369,19 +369,20 @@ struct cooperative_lookback for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { - auto atomic_flag = memory.get_flag(tile - local_id); + auto atomic_flag = memory.get_flag(tile - local_id); // FlagT flag; do { flag = atomic_flag.load(); - } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag))); // Loop till all ready + } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag) || + (tile - local_id < 0))); // Loop till all ready bool is_full = _LookbackScanMemory::is_full(flag); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); auto lowest_item_with_full = is_full_ballot.find_low(); // TODO: Use identity_fn for out of bounds values - _T contribution = local_id <= lowest_item_with_full && !_LookbackScanMemory::is_out_of_bounds(flag) + _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) ? memory.get_value(tile - local_id, flag) : _T{0}; @@ -434,21 +435,23 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // fill_num_wgs num_elements + 1 to also initialize tile_id_counter ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); - auto fill_event = __queue.submit( - [&](sycl::handler& hdl) - { - hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - [=](const sycl::nd_item<1>& item) - { - int id = item.get_global_linear_id(); - if (id < num_elements) - status_flags_begin[id] = id < _LookbackScanMemory::padding - ? _LookbackScanMemory::OUT_OF_BOUNDS - : _LookbackScanMemory::NOT_READY; - if (id == num_elements) - tile_id_begin[0] = 0; - }); - }); + auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); + + // auto fill_event = __queue.submit( + // [&](sycl::handler& hdl) + // { + // hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + // [=](const sycl::nd_item<1>& item) + // { + // int id = item.get_global_linear_id(); + // if (id < num_elements) + // status_flags_begin[id] = id < _LookbackScanMemory::padding + // ? 
_LookbackScanMemory::OUT_OF_BOUNDS + // : _LookbackScanMemory::NOT_READY; + // if (id == num_elements) + // tile_id_begin[0] = 0; + // }); + // }); auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); From 02ff9f338a9703367bf387e9042d5d8de4739411 Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Wed, 29 Nov 2023 15:19:30 +0000 Subject: [PATCH 070/134] Single wg implementation --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 358 ++++++++++++------ 1 file changed, 234 insertions(+), 124 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index c1e1d2c0cbd..345da745608 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -401,6 +401,89 @@ struct cooperative_lookback } }; +template +void +single_pass_scan_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + + static_assert(std::is_same_v<_Inclusive, ::std::true_type>, "Single-pass scan only available for inclusive scan"); + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + constexpr ::std::size_t num_workitems = wgsize; + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); + hdl.parallel_for( + sycl::nd_range<1>(num_workitems, wgsize), [= + ](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + ::std::uint32_t local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + auto subgroup = item.get_sub_group(); + + constexpr std::uint32_t tile_id = 0; + constexpr std::uint32_t wg_begin = 0; + constexpr std::uint32_t wg_end = elems_in_tile; + + std::uint32_t wg_local_memory_size = elems_in_tile; + + auto out_begin = __out_rng.begin(); + _Type carry = 0; + + // Global load into local + if (wg_end > n) + wg_local_memory_size = n; + + //TODO: assumes default ctor produces identity w.r.t. 
__binary_op + // _Type my_reducer{}; + if (wg_end <= n) + { +#pragma unroll + for (std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type in_val = __in_rng[i + local_id]; + // my_reducer = __binary_op(my_reducer, in_val); + _Type out = sycl::inclusive_scan_over_group(group, in_val, __binary_op, carry); + out_begin[i + local_id] = out; + carry = group_broadcast(group, out, stride - 1); + } + } + else + { +#pragma unroll + for (std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type in_val; + + if (i + local_id < n) + { + in_val = __in_rng[i + local_id]; + // my_reducer = __binary_op(my_reducer, in_val); + } + _Type out = sycl::inclusive_scan_over_group(group, in_val, __binary_op, carry); + if (i + local_id < n) + { + out_begin[i + local_id] = out; + } + carry = group_broadcast(group, out, stride - 1); + } + } + }); + }); + + event.wait(); +} + template void @@ -437,128 +520,111 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); - // auto fill_event = __queue.submit( - // [&](sycl::handler& hdl) - // { - // hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - // [=](const sycl::nd_item<1>& item) - // { - // int id = item.get_global_linear_id(); - // if (id < num_elements) - // status_flags_begin[id] = id < _LookbackScanMemory::padding - // ? _LookbackScanMemory::OUT_OF_BOUNDS - // : _LookbackScanMemory::NOT_READY; - // if (id == num_elements) - // tile_id_begin[0] = 0; - // }); - // }); - auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), - [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] - { - auto group = item.get_group(); - ::std::uint32_t local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - auto subgroup = item.get_sub_group(); - - std::uint32_t tile_id; - if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) - { - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - tile_id = tile_id_lacc[0]; - } - else - { - tile_id = group.get_group_linear_id(); - } - - - // Global load into local - auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_next_offset = ((tile_id + 1) * elems_in_tile); - auto wg_local_memory_size = elems_in_tile; - - if (wg_next_offset > n) - wg_local_memory_size = n - wg_current_offset; - //TODO: assumes default ctor produces identity w.r.t. 
__binary_op - _Type my_reducer{}; - if (wg_next_offset <= n) - { - #pragma unroll - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - { - _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; - my_reducer = __binary_op(my_reducer, in_val); - tile_vals[local_id + stride * i] = in_val; - } - } - else - { - #pragma unroll - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - { - if (wg_current_offset + local_id + stride * i < n) - { - _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; - my_reducer = __binary_op(my_reducer, in_val); - tile_vals[local_id + stride * i] = in_val; - } - } - } - - auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); - - auto in_begin = tile_vals.template get_multi_ptr().get(); - auto out_begin = __out_rng.begin() + wg_current_offset; - - _Type prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (subgroup.get_group_id() == 0) - { - _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); - } - - _Type carry = sycl::group_broadcast(group, prev_sum, 0); - // TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL - #pragma unroll - for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) - { - ::std::uint32_t i = stride * step; - _Type x; - if (i + local_id < wg_local_memory_size) - { - x = in_begin[i + local_id]; - } - _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); - if (i + local_id < wg_local_memory_size) - { - out_begin[i + local_id] = out; - } - carry = group_broadcast(group, out, stride - 1); - } - }); + hdl.parallel_for( + sycl::nd_range<1>(num_workitems, wgsize), [= + ](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + ::std::uint32_t local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + auto subgroup = item.get_sub_group(); + + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + + // Global load into local + auto wg_current_offset = (tile_id * elems_in_tile); + auto wg_next_offset = ((tile_id + 1) * elems_in_tile); + auto wg_local_memory_size = elems_in_tile; + + if (wg_next_offset > n) + wg_local_memory_size = n - wg_current_offset; + //TODO: assumes default ctor produces identity w.r.t. 
__binary_op + _Type my_reducer{}; + if (wg_next_offset <= n) + { +#pragma unroll + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } + } + else + { +#pragma unroll + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + { + if (wg_current_offset + local_id + stride * i < n) + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } + } + } + + auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); + + auto in_begin = tile_vals.template get_multi_ptr().get(); + auto out_begin = __out_rng.begin() + wg_current_offset; + + _Type prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (subgroup.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + local_sum); + } + + _Type carry = sycl::group_broadcast(group, prev_sum, 0); +// TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL +#pragma unroll + for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type x; + if (i + local_id < wg_local_memory_size) + { + x = in_begin[i + local_id]; + } + _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); + if (i + local_id < wg_local_memory_size) + { + out_begin[i + local_id] = out; + } + carry = group_broadcast(group, out, stride - 1); + } + }); }); scratch.async_free(event); @@ -575,9 +641,10 @@ struct kernel_param using kernel_name = KernelName; }; -template +template void -single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) +single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _BinaryOp __binary_op) { auto __n = __in_end - __in_begin; @@ -593,19 +660,62 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera { if (__queue.get_device().has(sycl::aspect::atomic64)) { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::false_type>( - __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::true_type, + /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), + __binary_op); } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( - __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::false_type, + /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), + __binary_op); } } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( - __queue, __buf1.all_view(), 
__buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::false_type, + /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), + __binary_op); + } +} + +template +void +single_pass_single_wg_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, + _OutIterator __out_begin, _BinaryOp __binary_op) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + // Avoid aspect query overhead for sizeof(Types) > 32 bits + single_pass_scan_impl_single_wg<_KernelParam, /* Inclusive */ std::true_type>(__queue, __buf1.all_view(), + __buf2.all_view(), __binary_op); +} + +template +void +single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _BinaryOp __binary_op) +{ + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + auto __n = __in_end - __in_begin; + + if (__n <= elems_in_tile) + { + single_pass_single_wg_inclusive_scan<_KernelParam, /* Inclusive */ std::true_type>( + __queue, __in_begin, __in_end, __out_begin, __binary_op); + } + else + { + single_pass_inclusive_scan<_KernelParam, /* Inclusive */ std::true_type>(__queue, __in_begin, __in_end, + __out_begin, __binary_op); } } From 25a93ff4640a6e33de6f66f0986f7ef05cadaa48 Mon Sep 17 00:00:00 2001 From: "aidan.belton" Date: Tue, 21 Nov 2023 10:38:29 +0000 Subject: [PATCH 071/134] Add phase 1 --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 123 ++++++++++++++++++ .../numeric/numeric.ops/copy_if_kt.pass.cpp | 77 +++++++++++ 2 files changed, 200 insertions(+) create mode 100644 test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 345da745608..c6da15a17b0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -719,6 +719,129 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } +// Load function to try and get some PVC perf w/ coalesced +template +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id) { + // if constexpr (std::is_arithmetic_v) { + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); + // } + return src[i + wg_stride * wg_group_id]; +} + +// Load with checking for the subgroup case +template +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id, size_t input_size) { + // if constexpr (std::is_arithmetic_v) { + // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * wg_group_id <= input_size) + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); + // return src[i + wg_stride * wg_group_id]; + // } + return src[i + wg_stride * wg_group_id]; +} + +template +void +single_pass_copy_if_impl(sycl::queue __queue, 
_InRange&& __in_rng, _OutRange&& __out_rng, _UnaryPredicate pred) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_per_workitem*wgsize}, hdl); + auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_group_id = item.get_group(0); + auto wg_local_id = item.get_local_id(0); + auto sg = item.get_sub_group(); + + // Must be a better way to init atomics + l_wg_count[0] = 0; + sycl::group_barrier(group); + sycl::atomic_ref wg_count(l_wg_count[0]); + + constexpr size_t elems_per_workgroup = elems_per_workitem*wgsize; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if ((wg_group_id + 1) * elems_per_workgroup <= n) { + #pragma unroll + for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { + _Type val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id); + + size_t satisfies_pred = pred(val); + //size_t satisfies_pred = 0; + size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + + if (satisfies_pred) + wg_copy_if_values[count + wg_count.load()] = val; + + if (wg_local_id == (wgsize - 1)) + wg_count += (count + satisfies_pred); + sycl::group_barrier(group); + } + } + else { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls + #pragma unroll + for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { + size_t satisfies_pred = 0; + _Type val; // TODO: alloca + if (i + elems_per_workgroup * wg_group_id < n) { + val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id, n); + + satisfies_pred = pred(val); + } + size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + + if (satisfies_pred) + wg_copy_if_values[count + wg_count.load()] = val; + + if (wg_local_id == (wgsize - 1)) + wg_count += (count + satisfies_pred); + sycl::group_barrier(group); + } + } + // Check behaviour + if (group.leader()) { + __out_rng[wg_group_id] = wg_count.load(); + } + + // Phase 2: Global scan across wg_count + + // Phase 3: copy values to global memory + }); + }); + event.wait(); +} + +template +void +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _UnaryPredicate pred) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + single_pass_copy_if_impl<_KernelParam>(__queue, __buf1.all_view(), 
__buf2.all_view(), pred); +} + } // inline namespace igpu } // namespace oneapi::dpl::experimental::kt diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp new file mode 100644 index 00000000000..459449d933d --- /dev/null +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -0,0 +1,77 @@ +// -*- C++ -*- +//===-- scan.pass.cpp -----------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#include "support/test_config.h" + +#include _PSTL_TEST_HEADER(execution) +#include _PSTL_TEST_HEADER(numeric) + +int +main() +{ + bool all_passed = true; + sycl::queue q; + + for (int logn : {4, 8, 10, 12, 14}) + { + std::cout << "Testing 2^" << logn << std::endl; + int n = 1 << logn; + std::cout << "n:" << n << std::endl; + std::vector v(n, 0); + for (size_t i = 0; i < v.size(); ++i) + std::cout << v[i] << ","; + std::cout << std::endl; + + int* in_ptr = sycl::malloc_device(n, q); + int* out_ptr = sycl::malloc_device(n, q); + + constexpr int n_elements_per_workitem = 8; + + q.copy(v.data(), in_ptr, n).wait(); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param; + oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, [](int x) { return x == 0; }); + + std::vector tmp(n, 0); + q.copy(out_ptr, tmp.data(), n); + q.wait(); + + std::copy_if(v.begin(), v.end(), v.begin(), [](int x) { return x == 0; }); + + bool passed = true; + // for (size_t i = 0; i < n; ++i) + // { + // if (tmp[i] != v[i]) + // { + // passed = false; + // std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + // } + // } + + // if (passed) + // std::cout << " passed" << std::endl; + // else + // std::cout << " failed" << std::endl; + + for (size_t i = 0; i < n/(n_elements_per_workitem*128) + 1; ++i) { + std::cout << "i:" << i << " count:" << tmp[i] << std::endl; + } + + all_passed &= passed; + sycl::free(in_ptr, q); + sycl::free(out_ptr, q); + } + + return !all_passed; +} From aea60093c080026093f671bdd843578b46f5ba0d Mon Sep 17 00:00:00 2001 From: "aidan.belton" Date: Mon, 27 Nov 2023 13:26:38 +0000 Subject: [PATCH 072/134] Add phase 2 --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 112 +++++++++++++++--- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 6 +- 2 files changed, 99 insertions(+), 19 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index c6da15a17b0..5a9d3241574 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -739,11 +739,14 @@ inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, siz return src[i + wg_stride * wg_group_id]; } -template +template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _TileIdT = TileId::_TileIdT; + using _LookbackScanMemory = LookbackScanMemory<_Type, 
_UseAtomic64>; + using _FlagT = typename _LookbackScanMemory::_FlagT; const ::std::size_t n = __in_rng.size(); @@ -751,33 +754,87 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; // Avoid non_uniform n by padding up to a multiple of wgsize - std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; + ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); + scratch.allocate(num_wgs); + + // Memory Structure: + // [Lookback Scan Memory, Tile Id Counter] + auto scan_memory_begin = scratch.scan_memory_ptr(); + auto status_flags_begin = _LookbackScanMemory::get_flags_begin(scan_memory_begin, num_wgs); + auto tile_id_begin = scratch.tile_id_ptr(); + + ::std::size_t num_elements = _LookbackScanMemory::get_num_elements(num_wgs); + // fill_num_wgs num_elements + 1 to also initialize tile_id_counter + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); + + auto fill_event = __queue.submit( + [&](sycl::handler& hdl) + { + hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + [=](const sycl::nd_item<1>& item) + { + int id = item.get_global_linear_id(); + if (id < num_elements) + status_flags_begin[id] = id < _LookbackScanMemory::padding + ? _LookbackScanMemory::OUT_OF_BOUNDS + : _LookbackScanMemory::NOT_READY; + if (id == num_elements) + tile_id_begin[0] = 0; + }); + }); + auto event = __queue.submit([&](sycl::handler& hdl) { auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_per_workitem*wgsize}, hdl); auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); auto wg_group_id = item.get_group(0); auto wg_local_id = item.get_local_id(0); auto sg = item.get_sub_group(); + constexpr ::std::uint32_t stride = wgsize; + + // Init tile_id + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + + // Global load into local + auto wg_current_offset = (tile_id * elems_in_tile); + auto wg_next_offset = ((tile_id + 1) * elems_in_tile); + auto wg_local_memory_size = elems_in_tile; // Must be a better way to init atomics l_wg_count[0] = 0; sycl::group_barrier(group); sycl::atomic_ref wg_count(l_wg_count[0]); - constexpr size_t elems_per_workgroup = elems_per_workitem*wgsize; - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((wg_group_id + 1) * elems_per_workgroup 
<= n) { + if ((wg_group_id + 1) * elems_in_tile <= n) { #pragma unroll - for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { - _Type val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id); + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id); size_t satisfies_pred = pred(val); //size_t satisfies_pred = 0; @@ -794,12 +851,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - #pragma unroll - for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { + //#pragma unroll + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { size_t satisfies_pred = 0; _Type val; // TODO: alloca - if (i + elems_per_workgroup * wg_group_id < n) { - val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id, n); + if (i + elems_in_tile * wg_group_id < n) { + val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id, n); satisfies_pred = pred(val); } @@ -813,13 +870,36 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ sycl::group_barrier(group); } } + + // Phase 2: Global scan across wg_count + auto local_sum = wg_count.load(); + + auto in_begin = tile_vals.get_pointer(); + + _Type prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (sg.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_Type>(), scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + local_sum); + } + + _Type carry = sycl::group_broadcast(group, prev_sum, 0); + // Check behaviour if (group.leader()) { - __out_rng[wg_group_id] = wg_count.load(); + __out_rng[wg_group_id] = carry; } - // Phase 2: Global scan across wg_count - // Phase 3: copy values to global memory }); }); @@ -839,7 +919,7 @@ single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __i oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_copy_if_impl<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), pred); + single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), pred); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 459449d933d..917e88a7707 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -30,9 +30,9 @@ main() int n = 1 << logn; std::cout << "n:" << n << std::endl; std::vector v(n, 0); - for (size_t i = 0; i < v.size(); ++i) - std::cout << v[i] << ","; - std::cout << std::endl; + //for (size_t i = 0; i < v.size(); ++i) + // std::cout << v[i] << ","; + //std::cout << std::endl; int* in_ptr = sycl::malloc_device(n, q); int* out_ptr = sycl::malloc_device(n, q); From d5c2cb5d130cef60f25757915e02bcf16f6eb695 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 28 Nov 2023 15:19:56 
+0000 Subject: [PATCH 073/134] Add phase 3 --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 67 ++++++++------- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 86 ++++++++++++------- 2 files changed, 87 insertions(+), 66 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 5a9d3241574..63a59476234 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -721,27 +721,27 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera // Load function to try and get some PVC perf w/ coalesced template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id) { +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id) { // if constexpr (std::is_arithmetic_v) { - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); // } - return src[i + wg_stride * wg_group_id]; + return src[i + wg_stride * tile_id]; } // Load with checking for the subgroup case template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id, size_t input_size) { +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id, size_t input_size) { // if constexpr (std::is_arithmetic_v) { - // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * wg_group_id <= input_size) - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); - // return src[i + wg_stride * wg_group_id]; + // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * tile_id <= input_size) + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); + // return src[i + wg_stride * tile_id]; // } - return src[i + wg_stride * wg_group_id]; + return src[i + wg_stride * tile_id]; } -template +template void -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _UnaryPredicate pred) +single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _TileIdT = TileId::_TileIdT; @@ -793,11 +793,11 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + hdl.depends_on(fill_event); - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); - auto wg_group_id = item.get_group(0); auto wg_local_id = item.get_local_id(0); auto sg = item.get_sub_group(); constexpr ::std::uint32_t stride = wgsize; @@ -822,7 +822,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_next_offset = ((tile_id + 1) * 
elems_in_tile); auto wg_local_memory_size = elems_in_tile; // Must be a better way to init atomics @@ -831,10 +830,10 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ sycl::atomic_ref wg_count(l_wg_count[0]); // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((wg_group_id + 1) * elems_in_tile <= n) { + if ((tile_id + 1) * elems_in_tile <= n) { #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id); + _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); size_t satisfies_pred = pred(val); //size_t satisfies_pred = 0; @@ -847,16 +846,15 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ wg_count += (count + satisfies_pred); sycl::group_barrier(group); } - } - else { + } else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - //#pragma unroll + #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { size_t satisfies_pred = 0; _Type val; // TODO: alloca - if (i + elems_in_tile * wg_group_id < n) { - val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id, n); + if (i + elems_in_tile * tile_id < n) { + val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id, n); satisfies_pred = pred(val); } @@ -873,10 +871,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Phase 2: Global scan across wg_count auto local_sum = wg_count.load(); - auto in_begin = tile_vals.get_pointer(); - - _Type prev_sum = 0; + size_t prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (sg.get_group_id() == 0) @@ -893,22 +889,23 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ scan_mem.set_full(tile_id, prev_sum + local_sum); } - _Type carry = sycl::group_broadcast(group, prev_sum, 0); + size_t start_idx = sycl::group_broadcast(group, prev_sum, 0); - // Check behaviour - if (group.leader()) { - __out_rng[wg_group_id] = carry; - } - // Phase 3: copy values to global memory + for (int i = wg_local_id; i < local_sum; i += wgsize) { + // Probably adjust method to try and get some perf on PVC for arithmetic types using sg.store + __out_rng[start_idx + i] = wg_copy_if_values[i]; + } + if (tile_id == (num_wgs - 1) && group.leader()) + __num_rng[0] = start_idx + local_sum; }); }); event.wait(); } -template +template void -single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _UnaryPredicate pred) +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) { auto __n = __in_end - __in_begin; @@ -919,7 +916,11 @@ single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __i oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), pred); + auto __keep_num = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + auto __buf_num = __keep2(__num_begin, __num_begin + 1); + + single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ 
std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 917e88a7707..202f28fbaad 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -14,64 +14,84 @@ //===----------------------------------------------------------------------===// #include "support/test_config.h" +#include "support/utils.h" #include _PSTL_TEST_HEADER(execution) #include _PSTL_TEST_HEADER(numeric) -int -main() +using namespace TestUtils; + +template +class CopyIfKernel; + +template +bool test(Predicate pred, Generator gen) { bool all_passed = true; sycl::queue q; - for (int logn : {4, 8, 10, 12, 14}) + for (int logn : {4, 8, 10, 12, 14, 15, 18}) { - std::cout << "Testing 2^" << logn << std::endl; int n = 1 << logn; - std::cout << "n:" << n << std::endl; - std::vector v(n, 0); - //for (size_t i = 0; i < v.size(); ++i) - // std::cout << v[i] << ","; - //std::cout << std::endl; - int* in_ptr = sycl::malloc_device(n, q); - int* out_ptr = sycl::malloc_device(n, q); + Sequence in(n, [&](size_t k) -> T { + return gen(n ^ k); + }); + + Sequence std_out(n); + + T* in_ptr = sycl::malloc_device(n, q); + T* out_ptr = sycl::malloc_device(n, q); + size_t* out_num = sycl::malloc_device(1, q); constexpr int n_elements_per_workitem = 8; - q.copy(v.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param; - oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, [](int x) { return x == 0; }); + q.copy(in.data(), in_ptr, n).wait(); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param>; + oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, out_num, pred); - std::vector tmp(n, 0); - q.copy(out_ptr, tmp.data(), n); + Sequence kt_out(n); + size_t num_selected = 0; + q.copy(out_ptr, kt_out.data(), n); + q.copy(out_num, &num_selected, 1); q.wait(); - std::copy_if(v.begin(), v.end(), v.begin(), [](int x) { return x == 0; }); + auto std_out_end = std::copy_if(in.begin(), in.end(), std_out.begin(), pred); bool passed = true; - // for (size_t i = 0; i < n; ++i) - // { - // if (tmp[i] != v[i]) - // { - // passed = false; - // std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; - // } - // } - - // if (passed) - // std::cout << " passed" << std::endl; - // else - // std::cout << " failed" << std::endl; - - for (size_t i = 0; i < n/(n_elements_per_workitem*128) + 1; ++i) { - std::cout << "i:" << i << " count:" << tmp[i] << std::endl; + if (num_selected != (std_out_end - std_out.begin())) { + passed = false; + std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected << "\n"; + } + + for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) + { + if (kt_out[i] != std_out[i]) + { + passed = false; + std::cout << "expected " << i << ' ' << std_out[i] << ' ' << kt_out[i] << '\n'; + } } + if (passed) + std::cout << " passed" << std::endl; + else + std::cout << " failed" << std::endl; + all_passed &= passed; sycl::free(in_ptr, q); sycl::free(out_ptr, q); + sycl::free(out_num, q); } return !all_passed; } + +int main() { + bool all_passed; + all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? 
float64_t(j % 32) : float64_t(j % 33 + 34); }); + all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); + all_passed &= test([](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); + + return all_passed; +} From 1f574b8f501922f22c2b919e1fb11b0bae6480a2 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 11:28:40 +0000 Subject: [PATCH 074/134] Add count datatype _SizeT --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 63a59476234..3d6289642bc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -744,8 +744,9 @@ void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _SizeT = uint64_t; using _TileIdT = TileId::_TileIdT; - using _LookbackScanMemory = LookbackScanMemory<_Type, _UseAtomic64>; + using _LookbackScanMemory = LookbackScanMemory<_SizeT, _UseAtomic64>; using _FlagT = typename _LookbackScanMemory::_FlagT; const ::std::size_t n = __in_rng.size(); @@ -758,7 +759,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; - ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); + ScanMemoryManager<_SizeT, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); scratch.allocate(num_wgs); // Memory Structure: @@ -792,7 +793,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + auto tile_vals = sycl::local_accessor<_SizeT, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); @@ -827,7 +828,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Must be a better way to init atomics l_wg_count[0] = 0; sycl::group_barrier(group); - sycl::atomic_ref wg_count(l_wg_count[0]); + sycl::atomic_ref<_SizeT, sycl::memory_order::acq_rel, sycl::memory_scope::work_group, sycl::access::address_space::local_space> wg_count(l_wg_count[0]); + sycl::group_barrier(group); // Phase 1: Create wg_count and construct in-order wg_copy_if_values if ((tile_id + 1) * elems_in_tile <= n) { @@ -835,9 +837,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); - size_t satisfies_pred = pred(val); - //size_t satisfies_pred = 0; - size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); if (satisfies_pred) wg_copy_if_values[count + wg_count.load()] = val; @@ -851,14 
+852,14 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Might have unneccessary group_barrier calls #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - size_t satisfies_pred = 0; + _SizeT satisfies_pred = 0; _Type val; // TODO: alloca if (i + elems_in_tile * tile_id < n) { val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id, n); satisfies_pred = pred(val); } - size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); if (satisfies_pred) wg_copy_if_values[count + wg_count.load()] = val; @@ -870,9 +871,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } // Phase 2: Global scan across wg_count - auto local_sum = wg_count.load(); - auto in_begin = tile_vals.get_pointer(); - size_t prev_sum = 0; + _SizeT local_sum = wg_count.load(); + _SizeT* in_begin = tile_vals.get_pointer(); + _SizeT prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (sg.get_group_id() == 0) @@ -883,13 +884,13 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ scan_mem.set_partial(tile_id, local_sum); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_Type>(), scan_mem); + prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); if (group.leader()) scan_mem.set_full(tile_id, prev_sum + local_sum); } - size_t start_idx = sycl::group_broadcast(group, prev_sum, 0); + _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); // Phase 3: copy values to global memory for (int i = wg_local_id; i < local_sum; i += wgsize) { From 16ef9c2ad691c0c52b51b391a49697584921758c Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 12:24:51 +0000 Subject: [PATCH 075/134] Move away from atomics --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 36 +++++++------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 3d6289642bc..60c2db24b78 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -789,11 +789,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ }); auto event = __queue.submit([&](sycl::handler& hdl) { - auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_per_workitem*wgsize}, hdl); - auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - auto tile_vals = sycl::local_accessor<_SizeT, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); @@ -825,11 +823,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto wg_current_offset = (tile_id * elems_in_tile); auto wg_local_memory_size = elems_in_tile; - // Must be a better way to init atomics - l_wg_count[0] = 0; - sycl::group_barrier(group); - sycl::atomic_ref<_SizeT, sycl::memory_order::acq_rel, 
sycl::memory_scope::work_group, sycl::access::address_space::local_space> wg_count(l_wg_count[0]); - sycl::group_barrier(group); + _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values if ((tile_id + 1) * elems_in_tile <= n) { @@ -838,14 +832,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); if (satisfies_pred) - wg_copy_if_values[count + wg_count.load()] = val; + wg_copy_if_values[count] = val; - if (wg_local_id == (wgsize - 1)) - wg_count += (count + satisfies_pred); - sycl::group_barrier(group); + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); } } else { // Edge of input, have to handle memory bounds @@ -859,20 +851,16 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ satisfies_pred = pred(val); } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); if (satisfies_pred) - wg_copy_if_values[count + wg_count.load()] = val; + wg_copy_if_values[count] = val; - if (wg_local_id == (wgsize - 1)) - wg_count += (count + satisfies_pred); - sycl::group_barrier(group); + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); } } // Phase 2: Global scan across wg_count - _SizeT local_sum = wg_count.load(); - _SizeT* in_begin = tile_vals.get_pointer(); _SizeT prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix @@ -881,24 +869,24 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); + scan_mem.set_partial(tile_id, wg_count); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); + scan_mem.set_full(tile_id, prev_sum + wg_count); } _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); // Phase 3: copy values to global memory - for (int i = wg_local_id; i < local_sum; i += wgsize) { + for (int i = wg_local_id; i < wg_count; i += wgsize) { // Probably adjust method to try and get some perf on PVC for arithmetic types using sg.store __out_rng[start_idx + i] = wg_copy_if_values[i]; } if (tile_id == (num_wgs - 1) && group.leader()) - __num_rng[0] = start_idx + local_sum; + __num_rng[0] = start_idx + wg_count; }); }); event.wait(); From 45a1fb77d57e8754ce196b2476b9fc1fe2bcf213 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 19:42:54 +0000 Subject: [PATCH 076/134] Sort out test logic --- test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 202f28fbaad..75769131522 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp 
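// NOTE (illustrative sketch only, not part of this patch): host-side usage of the copy_if
// kernel template as this test drives it. `KernelParams` stands for the kernel_param
// instantiation chosen in the test above (its template-argument order is not restated here);
// the predicate is one of the test predicates; everything else is plain SYCL USM.
//
//   sycl::queue q;
//   std::vector<int> in = /* host input of size n */;
//   int*    in_ptr  = sycl::malloc_device<int>(n, q);
//   int*    out_ptr = sycl::malloc_device<int>(n, q);
//   size_t* out_num = sycl::malloc_device<size_t>(1, q);
//   q.copy(in.data(), in_ptr, n).wait();
//
//   oneapi::dpl::experimental::kt::single_pass_copy_if<KernelParams>(
//       q, in_ptr, in_ptr + n, out_ptr, out_num, [](int x) { return x != 42; });
//
//   size_t num_selected = 0;
//   q.copy(out_num, &num_selected, 1).wait();
//   // out_ptr[0 .. num_selected) now holds the selected values in input order,
//   // matching std::copy_if on the same input and predicate.
//   sycl::free(in_ptr, q); sycl::free(out_ptr, q); sycl::free(out_num, q);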
@@ -84,11 +84,11 @@ bool test(Predicate pred, Generator gen) sycl::free(out_num, q); } - return !all_passed; + return all_passed; } int main() { - bool all_passed; + bool all_passed = true; all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); all_passed &= test([](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); From cec32d79daa1ac8926f0a58dfd6c722c0908d232 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 19:50:48 +0000 Subject: [PATCH 077/134] Remove unnecessary load and store functions --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 25 ++----------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 60c2db24b78..68d11740df0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -719,26 +719,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } -// Load function to try and get some PVC perf w/ coalesced -template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id) { - // if constexpr (std::is_arithmetic_v) { - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); - // } - return src[i + wg_stride * tile_id]; -} - -// Load with checking for the subgroup case -template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id, size_t input_size) { - // if constexpr (std::is_arithmetic_v) { - // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * tile_id <= input_size) - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); - // return src[i + wg_stride * tile_id]; - // } - return src[i + wg_stride * tile_id]; -} - template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) @@ -829,7 +809,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ if ((tile_id + 1) * elems_in_tile <= n) { #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); + _Type val = __in_rng[i + elems_in_tile * tile_id]; _SizeT satisfies_pred = pred(val); _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); @@ -847,7 +827,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ _SizeT satisfies_pred = 0; _Type val; // TODO: alloca if (i + elems_in_tile * tile_id < n) { - val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id, n); + val = __in_rng[i + elems_in_tile * tile_id]; satisfies_pred = pred(val); } @@ -882,7 +862,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Phase 3: copy values to global memory for (int i = wg_local_id; i < wg_count; i += wgsize) { - // Probably adjust method to try and get some perf on PVC for arithmetic types using sg.store __out_rng[start_idx + i] = wg_copy_if_values[i]; } if (tile_id == (num_wgs - 1) && group.leader()) From b7d659c81db111deeac008a4647a104cdcc3dfa9 Mon Sep 17 00:00:00 
2001 From: Aidan Date: Wed, 6 Dec 2023 11:03:59 +0000 Subject: [PATCH 078/134] Release scratch mem --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 68d11740df0..db642fc7177 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -868,6 +868,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ __num_rng[0] = start_idx + wg_count; }); }); + + scratch.async_free(event); + event.wait(); } From fdb1824018b8f4c0c9bfe1e63dcd31f36c61e641 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 11:15:29 +0000 Subject: [PATCH 079/134] Add single wg copy if --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 108 +++++++++++++++++- 1 file changed, 106 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index db642fc7177..36e395b7285 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -79,6 +79,11 @@ struct ScanMemoryManager }); } + void free() + { + sycl::free(scratch, q); + } + private: ::std::uint8_t* scratch = nullptr; ::std::uint8_t* scan_memory_begin = nullptr; @@ -719,6 +724,86 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } +template +void +single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _SizeT = uint64_t; + using _TileIdT = TileId::_TileIdT; + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + + // Global load into local + auto wg_current_offset = 0; + auto wg_local_memory_size = elems_in_tile; + + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if (elems_in_tile <= n) { + #pragma unroll + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + _Type val = __in_rng[i]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } else { + // 
Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls + #pragma unroll + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + _SizeT satisfies_pred = 0; + _Type val; // TODO: alloca + if (i < n) { + val = __in_rng[i]; + + satisfies_pred = pred(val); + } + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) { + __out_rng[i] = wg_copy_if_values[i]; + } + if (group.leader()) + __num_rng[0] = wg_count; + }); + }); + + event.wait(); +} + template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) @@ -869,9 +954,28 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ }); }); - scratch.async_free(event); - event.wait(); + scratch.free(); +} + +template +void +single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + auto __keep_num = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + auto __buf_num = __keep2(__num_begin, __num_begin + 1); + + single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); } template From 1df5fbb37573f10bc61fc804b6d026126e01d8a6 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 11:38:45 +0000 Subject: [PATCH 080/134] Fix unrolls and use memset --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 36 ++++++------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 36e395b7285..fcfb3ad1b84 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -753,13 +753,12 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Global load into local auto wg_current_offset = 0; - auto wg_local_memory_size = elems_in_tile; _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values if (elems_in_tile <= n) { - #pragma unroll +#pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { _Type val = __in_rng[i]; @@ -774,7 +773,7 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou } else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - #pragma unroll +#pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; _Type val; // TODO: alloca @@ -837,21 +836,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // fill_num_wgs 
num_elements + 1 to also initialize tile_id_counter ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); - auto fill_event = __queue.submit( - [&](sycl::handler& hdl) - { - hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - [=](const sycl::nd_item<1>& item) - { - int id = item.get_global_linear_id(); - if (id < num_elements) - status_flags_begin[id] = id < _LookbackScanMemory::padding - ? _LookbackScanMemory::OUT_OF_BOUNDS - : _LookbackScanMemory::NOT_READY; - if (id == num_elements) - tile_id_begin[0] = 0; - }); - }); + auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); auto event = __queue.submit([&](sycl::handler& hdl) { auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); @@ -886,15 +871,14 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_local_memory_size = elems_in_tile; _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values if ((tile_id + 1) * elems_in_tile <= n) { - #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i + elems_in_tile * tile_id]; +#pragma unroll + for (size_t i = 0; i < elems_in_tile; i += wgsize) { + _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; _SizeT satisfies_pred = pred(val); _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); @@ -907,12 +891,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { +#pragma unroll + for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; _Type val; // TODO: alloca - if (i + elems_in_tile * tile_id < n) { - val = __in_rng[i + elems_in_tile * tile_id]; + if (i + wg_local_id + elems_in_tile * tile_id < n) { + val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; satisfies_pred = pred(val); } From d8b77febdea55241dbde182449e88df413338356 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 11:48:50 +0000 Subject: [PATCH 081/134] apply changes to single wg --- .../pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index fcfb3ad1b84..60007e4566c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -759,8 +759,8 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Phase 1: Create wg_count and construct in-order wg_copy_if_values if (elems_in_tile <= n) { #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i]; + for (size_t i = 0; i < elems_in_tile; i += wgsize) { + _Type val = __in_rng[i + wg_local_id]; _SizeT satisfies_pred = pred(val); _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); @@ -774,11 +774,11 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Edge of input, have to 
handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; - _Type val; // TODO: alloca - if (i < n) { - val = __in_rng[i]; + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id < n) { + val = __in_rng[i + wg_local_id]; satisfies_pred = pred(val); } @@ -894,7 +894,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ #pragma unroll for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; - _Type val; // TODO: alloca + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); if (i + wg_local_id + elems_in_tile * tile_id < n) { val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; From 5b53de669e20fe2a81be81ec2cc5fddd9bdb6543 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 15:31:31 +0000 Subject: [PATCH 082/134] Remove unused variables --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 60007e4566c..fcb539cab2b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -741,6 +741,7 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; + assert(num_wgs == 1); auto event = __queue.submit([&](sycl::handler& hdl) { auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); @@ -752,8 +753,6 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr ::std::uint32_t stride = wgsize; // Global load into local - auto wg_current_offset = 0; - _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values @@ -869,9 +868,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ tile_id = group.get_group_linear_id(); } - // Global load into local - auto wg_current_offset = (tile_id * elems_in_tile); - _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values From acc4f9b65a842fe113c58f86e85e31b386a384aa Mon Sep 17 00:00:00 2001 From: "aidan.belton" Date: Fri, 8 Dec 2023 15:35:41 +0000 Subject: [PATCH 083/134] Clang-format copy_if_kt commits --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 333 ++++++++++-------- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 33 +- 2 files changed, 200 insertions(+), 166 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index fcb539cab2b..0838817fd4f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -79,7 +79,8 @@ struct ScanMemoryManager }); } - void free() + void + free() { sycl::free(scratch, q); } @@ -724,9 +725,11 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } -template +template void 
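// Reference note (explanatory comment only, not functional code): the multi-work-group
// copy_if below reuses the decoupled-lookback scheme of the scan kernel for its phase 2,
// the global scan over per-tile counts. Per tile, in outline:
//
//   // publish this tile's count as PARTIAL so later tiles can begin their lookback
//   if (group.leader()) scan_mem.set_partial(tile_id, wg_count);
//   // the first sub-group walks tiles tile_id-1, tile_id-2, ... accumulating PARTIAL
//   // values and stopping at the first FULL value it finds
//   prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem);
//   // publish the inclusive prefix as FULL so later tiles can stop at this tile
//   if (group.leader()) scan_mem.set_full(tile_id, prev_sum + wg_count);
//   // prev_sum (the exclusive prefix) is this tile's starting write offset in __out_rng
//   start_idx = sycl::group_broadcast(group, prev_sum, 0);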
-single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) +single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, + _NumSelectedRange __num_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; @@ -747,64 +750,76 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - - // Global load into local - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if (elems_in_tile <= n) { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + + // Global load into local + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if (elems_in_tile <= n) + { #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i + wg_local_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } else { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _Type val = __in_rng[i + wg_local_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _SizeT satisfies_pred = 0; - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id < n) { - val = __in_rng[i + wg_local_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) { - __out_rng[i] = wg_copy_if_values[i]; - } - if (group.leader()) - __num_rng[0] = wg_count; - }); + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id < n) + { + val = __in_rng[i + wg_local_id]; + + satisfies_pred = pred(val); + } + _SizeT count = 
sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[i] = wg_copy_if_values[i]; + } + if (group.leader()) + __num_rng[0] = wg_count; + }); }); event.wait(); } -template +template void -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) +single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, + _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; @@ -844,138 +859,150 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - auto sg = item.get_sub_group(); - constexpr ::std::uint32_t stride = wgsize; - - // Init tile_id - std::uint32_t tile_id; - if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) - { - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - tile_id = tile_id_lacc[0]; - } - else - { - tile_id = group.get_group_linear_id(); - } - - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((tile_id + 1) * elems_in_tile <= n) { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + auto sg = item.get_sub_group(); + constexpr ::std::uint32_t stride = wgsize; + + // Init tile_id + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if ((tile_id + 1) * elems_in_tile <= n) + { #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } else { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = 
sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _SizeT satisfies_pred = 0; - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id + elems_in_tile * tile_id < n) { - val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 2: Global scan across wg_count - _SizeT prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (sg.get_group_id() == 0) - { - _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, wg_count); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + wg_count); - } - - _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) { - __out_rng[start_idx + i] = wg_copy_if_values[i]; - } - if (tile_id == (num_wgs - 1) && group.leader()) - __num_rng[0] = start_idx + wg_count; - }); + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id + elems_in_tile * tile_id < n) + { + val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; + + satisfies_pred = pred(val); + } + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 2: Global scan across wg_count + _SizeT prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (sg.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, wg_count); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + wg_count); + } + + _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[start_idx + i] = wg_copy_if_values[i]; + } + if (tile_id == (num_wgs - 1) && group.leader()) + __num_rng[0] = start_idx + wg_count; + }); }); event.wait(); scratch.free(); } -template +template void -single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, 
_UnaryPredicate pred) +single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, + _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) { auto __n = __in_end - __in_begin; - auto __keep1 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); auto __keep_num = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); + single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), + __buf_num.all_view(), pred); } -template +template void -single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _NumSelectedRange __num_begin, _UnaryPredicate pred) { auto __n = __in_end - __in_begin; - auto __keep1 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); auto __keep_num = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); + single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 75769131522..a77b76491e7 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -24,8 +24,9 @@ using namespace TestUtils; template class CopyIfKernel; -template -bool test(Predicate pred, Generator gen) +template +bool +test(Predicate pred, Generator gen) { bool all_passed = true; sycl::queue q; @@ -34,9 +35,7 @@ bool test(Predicate pred, Generator gen) { int n = 1 << logn; - Sequence in(n, [&](size_t k) -> T { - return gen(n ^ k); - }); + Sequence in(n, [&](size_t k) -> T { return gen(n ^ k); }); Sequence std_out(n); @@ -47,8 +46,9 @@ bool 
test(Predicate pred, Generator gen) constexpr int n_elements_per_workitem = 8; q.copy(in.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param>; - oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, out_num, pred); + using KernelParams = + oneapi::dpl::experimental::kt::kernel_param>; + oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred); Sequence kt_out(n); size_t num_selected = 0; @@ -59,12 +59,14 @@ bool test(Predicate pred, Generator gen) auto std_out_end = std::copy_if(in.begin(), in.end(), std_out.begin(), pred); bool passed = true; - if (num_selected != (std_out_end - std_out.begin())) { + if (num_selected != (std_out_end - std_out.begin())) + { passed = false; - std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected << "\n"; + std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected + << "\n"; } - for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) + for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) { if (kt_out[i] != std_out[i]) { @@ -87,11 +89,16 @@ bool test(Predicate pred, Generator gen) return all_passed; } -int main() { +int +main() +{ bool all_passed = true; - all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); + all_passed &= + test([](const float64_t& x) { return x * x <= 1024; }, + [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); - all_passed &= test([](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); + all_passed &= test([](const std::int32_t& x) { return x != 42; }, + [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 
std::int32_t(j + 1) : 42; }); return all_passed; } From 9d39fc689e79c3e77cd68a779acafe982acb1fb2 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 10:29:06 -0400 Subject: [PATCH 084/134] refactor to share lookback and memory mgr Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 490 ++++++++++++++++-- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 280 ---------- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 26 +- 3 files changed, 456 insertions(+), 340 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index cafffd6493d..e420dad591a 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -44,10 +44,120 @@ class __lookback_kernel; static constexpr int SUBGROUP_SIZE = 32; + +template +struct ScanMemoryManager +{ + using _FlagStorageType = typename _ScanStatusFlag::_FlagStorageType; + using _ValueType = typename _ScanStatusFlag::_ValueType; + + ScanMemoryManager(sycl::queue __q, std::size_t num_wgs) : __queue{__q}, __num_wgs(num_wgs) {}; + + ::std::uint8_t* + scan_memory_ptr() noexcept + { + return scan_memory_begin; + }; + + void + allocate() + { + ::std::size_t scan_memory_size = get_memory_size(); + + scan_memory_begin = sycl::malloc_device<::std::uint8_t>(scan_memory_size, __queue); + if (!scan_memory_begin) + throw std::bad_alloc(); + } + + sycl::event + async_free(sycl::event dependency) + { + return __queue.submit( + [e = dependency, ptr = scan_memory_begin, __q = __queue](sycl::handler& hdl) + { + hdl.depends_on(e); + hdl.host_task([=]() { sycl::free(ptr, __q); }); + }); + } + + void + free() + { + sycl::free(scan_memory_begin, __queue); + } + + _FlagStorageType* + get_flags_begin() + { + // Aligned flags + ::std::size_t tile_values_bytes = get_tile_values_bytes(); + void* base_flags = reinterpret_cast(scan_memory_begin + tile_values_bytes); + auto remainder = get_padded_flag_bytes(); // scan_memory_bytes - tile_values_bytes + return reinterpret_cast<_FlagStorageType*>( + ::std::align(::std::alignment_of_v<_FlagStorageType>, get_flag_bytes(), base_flags, remainder)); + } + + _ValueType* + get_partial_values_begin() + { + return reinterpret_cast<_ValueType*>(scan_memory_begin); + } + + _ValueType* + get_full_values_begin() + { + return reinterpret_cast<_ValueType*>(scan_memory_begin + get_num_elements() * sizeof(_ValueType)); + } + + std::size_t + get_num_elements() + { + return _ScanStatusFlag::__padding + __num_wgs; + } + + private: + + std::size_t + get_tile_values_bytes() + { + return (2 * get_num_elements()) * sizeof(_ValueType); + } + + std::size_t + get_flag_bytes() + { + return get_num_elements() * sizeof(_FlagStorageType); + } + + std::size_t + get_padded_flag_bytes() + { + // sizeof(_FlagStorageType) extra bytes for possible intenal alignment + return get_flag_bytes() + sizeof(_FlagStorageType); + } + + std::size_t + get_memory_size() + { + // sizeof(_T) extra bytes are not needed because data is going at the beginning of the scratch + ::std::size_t tile_values_bytes = get_tile_values_bytes(); + // Padding to provide room for aligment + ::std::size_t flag_bytes = get_padded_flag_bytes(); + + return tile_values_bytes + flag_bytes; + } + + std::uint8_t* scan_memory_begin = nullptr; + std::size_t __num_wgs; + + sycl::queue __queue; +}; + template struct __scan_status_flag { using _FlagStorageType = uint32_t; + using _ValueType = _T; using _AtomicFlagT = 
sycl::atomic_ref<_FlagStorageType, sycl::memory_order::acq_rel, sycl::memory_scope::device, sycl::access::address_space::global_space>; using _AtomicValueT = sycl::atomic_ref<_T, sycl::memory_order::acq_rel, sycl::memory_scope::device, @@ -159,6 +269,34 @@ struct __lookback_init_submitter<_FlagType, _Type, _BinaryOp, } }; +template +void +__lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlags& __status_flags, + _StatusValues& __status_vals_full, _StatusValues& __status_vals_partial, std::uint32_t __tile_id, + _Type& __local_reduction, _Type& __prev_tile_reduction, _BinaryOp __binary_op) +{ + // The first sub-group will query the previous tiles to find a prefix + if (__subgroup.get_group_id() == 0) + { + _FlagType __flag(__status_flags, __status_vals_full, __status_vals_partial, __tile_id); + + if (__subgroup.get_local_id() == 0) + { + __flag.set_partial(__local_reduction); + } + + __prev_tile_reduction = __flag.cooperative_lookback(__subgroup, __binary_op); + + if (__subgroup.get_local_id() == 0) + { + __flag.set_full(__binary_op(__prev_tile_reduction, __local_reduction)); + } + } + __prev_tile_reduction = sycl::group_broadcast(__group, __prev_tile_reduction, 0); +} + + template struct __lookback_submitter; @@ -243,25 +381,8 @@ struct __lookback_kernel_func sycl::joint_reduce(__group, __tile_vals_ptr, __tile_vals_ptr + __wg_local_memory_size, __binary_op); _Type __prev_tile_reduction{}; - // The first sub-group will query the previous tiles to find a prefix - if (__subgroup.get_group_id() == 0) - { - _FlagType __flag(__status_flags, __status_vals_full, __status_vals_partial, __tile_id); - - if (__subgroup.get_local_id() == 0) - { - __flag.set_partial(__local_reduction); - } - - __prev_tile_reduction = __flag.cooperative_lookback(__subgroup, __binary_op); - - if (__subgroup.get_local_id() == 0) - { - __flag.set_full(__binary_op(__prev_tile_reduction, __local_reduction)); - } - } - - __prev_tile_reduction = sycl::group_broadcast(__group, __prev_tile_reduction, 0); + __lookback_phase<_FlagType>(__group, __subgroup, __status_flags, __status_vals_full, __status_vals_partial, + __tile_id, __local_reduction, __prev_tile_reduction, __binary_op); sycl::joint_inclusive_scan(__group, __tile_vals_ptr, __tile_vals_ptr + __wg_local_memory_size, __out_begin, __binary_op, __prev_tile_reduction); @@ -348,31 +469,19 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r std::size_t __elems_in_tile = __workgroup_size * __data_per_workitem; std::size_t __num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); - constexpr int __status_flag_padding = SUBGROUP_SIZE; - std::size_t __status_flags_size = __num_wgs + 1 + __status_flag_padding; - - std::size_t __mem_align_pad = sizeof(_Type); - std::size_t __status_flags_bytes = __status_flags_size * sizeof(_FlagStorageType); - std::size_t __status_vals_full_offset_bytes = __status_flags_size * sizeof(_Type); - std::size_t __status_vals_partial_offset_bytes = __status_flags_size * sizeof(_Type); - std::size_t __mem_bytes = - __status_flags_bytes + __status_vals_full_offset_bytes + __status_vals_partial_offset_bytes + __mem_align_pad; - - std::byte* __device_mem = reinterpret_cast(sycl::malloc_device(__mem_bytes, __queue)); - if (!__device_mem) - throw std::bad_alloc(); - - _FlagStorageType* __status_flags = reinterpret_cast<_FlagStorageType*>(__device_mem); - std::size_t __remainder = __mem_bytes - __status_flags_bytes; - void* __vals_base_ptr = reinterpret_cast(__device_mem + 
__status_flags_bytes); - void* __vals_aligned_ptr = - std::align(std::alignment_of_v<_Type>, __status_vals_full_offset_bytes, __vals_base_ptr, __remainder); - _Type* __status_vals_full = reinterpret_cast<_Type*>(__vals_aligned_ptr); - _Type* __status_vals_partial = - reinterpret_cast<_Type*>(__status_vals_full + __status_vals_full_offset_bytes / sizeof(_Type)); + + ScanMemoryManager<_FlagType> __device_mem_mgr(__queue, __num_wgs); + + __device_mem_mgr.allocate(); + + _Type* __status_vals_full = __device_mem_mgr.get_full_values_begin(); + _Type* __status_vals_partial = __device_mem_mgr.get_partial_values_begin(); + _FlagStorageType* __status_flags = __device_mem_mgr.get_flags_begin(); + //adding 1 to the number elements to account for the tile id + std::size_t __status_flags_size = __device_mem_mgr.get_num_elements() + 1; auto __fill_event = __lookback_init_submitter<_FlagType, _Type, _BinaryOp, _LookbackInitKernel>{}( - __queue, __status_flags, __status_vals_partial, __status_flags_size, __status_flag_padding); + __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); std::size_t __current_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); std::size_t __current_num_items = __current_num_wgs * __workgroup_size; @@ -388,21 +497,308 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r // we should replace this code with the asynchronous version below. if (0) { - return __queue.submit([=](sycl::handler& __hdl) { - __hdl.depends_on(__prev_event); - __hdl.host_task([=]() { sycl::free(__device_mem, __queue); }); - }); + return __device_mem_mgr.async_free(__prev_event); } else { __prev_event.wait(); - sycl::free(__device_mem, __queue); + __device_mem_mgr.free(); return __prev_event; } } +template +void +single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, + _NumSelectedRange __num_rng, _UnaryPredicate pred, _KernelParam) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _SizeT = uint64_t; + using _TileIdT = TileId::_TileIdT; + using _KernelName = typename _KernelParam::kernel_name; + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; + assert(num_wgs == 1); + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + + // Global load into local + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if (elems_in_tile <= n) + { +#pragma unroll + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _Type val = __in_rng[i + wg_local_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + 
sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls +#pragma unroll + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + // launder is used here to create data without initialization without requiring + // a default constructor or out of bounds access + // TODO: replace with "union" trick to avoid launder, + // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id < n) + { + val = __in_rng[i + wg_local_id]; + + satisfies_pred = pred(val); + } + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[i] = wg_copy_if_values[i]; + } + if (group.leader()) + __num_rng[0] = wg_count; + }); + }); + + event.wait(); +} + +template +void +single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, + _UnaryPredicate pred, _KernelParam) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _SizeT = uint64_t; + using _TileIdT = TileId::_TileIdT; + using _KernelName = typename _KernelParam::kernel_name; + + using _BinaryOp = std::plus<_SizeT>; + + + using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __lookback_init_kernel<_KernelName, _SizeT, _BinaryOp>>; + + using _FlagType = __scan_status_flag<_SizeT>; + using _FlagStorageType = typename _FlagType::_FlagStorageType; + + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr std::uint32_t __elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, __elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; + + ScanMemoryManager<_FlagType> scratch(__queue, num_wgs); + scratch.allocate(); + + // Memory Structure: + // [Lookback Scan Memory, Tile Id Counter] + auto __status_vals_full = scratch.get_full_values_begin(); + auto __status_vals_partial = scratch.get_partial_values_begin(); + auto __status_flags = scratch.get_flags_begin(); + //adding 1 to the number elements to account for the tile id + std::size_t __status_flags_size = scratch.get_num_elements() + 1; + + auto __fill_event = __lookback_init_submitter<_FlagType, _SizeT, _BinaryOp, _LookbackInitKernel>{}( + __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, hdl); + + auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + hdl.depends_on(__fill_event); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), 
+ [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto __group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + auto sg = item.get_sub_group(); + constexpr ::std::uint32_t stride = wgsize; + + std::uint32_t __tile_id = 0; + + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (__group.leader()) + { + sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + __idx_atomic(__status_flags[__status_flags_size - 1]); + __tile_id = __idx_atomic.fetch_add(1); + } + + __tile_id = sycl::group_broadcast(__group, __tile_id, 0); + + std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; + + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if ((__tile_id + 1) * __elems_in_tile <= n) + { +#pragma unroll + for (size_t i = 0; i < __elems_in_tile; i += wgsize) + { + // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? + // if load is done in a scalar fashion and provides the same performance, we + // can avoid the broadcast (I think) + // would need to loop over the elements per work item first accumulating into + // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to + // global memory needs to be loaded per work item per element, skipping copies + // when they were not saved. + _Type val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls +#pragma unroll + for (size_t i = 0; i < __elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + // launder is used here to create data without initialization without requiring + // a default constructor or out of bounds access + // TODO: replace with "union" trick to avoid launder, + // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id + __elems_in_tile * __tile_id < n) + { + val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; + + satisfies_pred = pred(val); + } + _SizeT count = sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 2: Global scan across wg_count + _SizeT copied_elements = 0; + + __lookback_phase<_FlagType>(__group, sg, __status_flags, __status_vals_full, __status_vals_partial, + __tile_id, wg_count, copied_elements, sycl::plus<_SizeT>()); + + //TODO: explore above comment about scalar load + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[copied_elements + i] = wg_copy_if_values[i]; + } + if (__tile_id == (num_wgs - 1) && __group.leader()) + __num_rng[0] = copied_elements + wg_count; + }); + }); + + event.wait(); + scratch.free(); +} + } // namespace __impl +template +void +single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator 
__in_begin, _InIterator __in_end, + _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred, + _KernelParam __param = {}) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + auto __keep_num = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + auto __buf_num = __keep2(__num_begin, __num_begin + 1); + + __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), + __buf_num.all_view(), pred, __param); +} + +template +void +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _NumSelectedRange __num_begin, _UnaryPredicate pred, _KernelParam __param = {}) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + auto __keep_num = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + auto __buf_num = __keep2(__num_begin, __num_begin + 1); + + __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred, __param); +} + + + template sycl::event inclusive_scan(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index aaca8a4a81e..8752c4baf0e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -716,286 +716,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } -template -void -single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, - _NumSelectedRange __num_rng, _UnaryPredicate pred) -{ - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _SizeT = uint64_t; - using _TileIdT = TileId::_TileIdT; - - const ::std::size_t n = __in_rng.size(); - - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; - - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; - ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); - ::std::size_t num_workitems = num_wgs * wgsize; - assert(num_wgs == 1); - - auto event = __queue.submit([&](sycl::handler& hdl) { - auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); - - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), - [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - 
constexpr ::std::uint32_t stride = wgsize; - - // Global load into local - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if (elems_in_tile <= n) - { -#pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) - { - _Type val = __in_rng[i + wg_local_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls -#pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) - { - _SizeT satisfies_pred = 0; - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id < n) - { - val = __in_rng[i + wg_local_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) - { - __out_rng[i] = wg_copy_if_values[i]; - } - if (group.leader()) - __num_rng[0] = wg_count; - }); - }); - - event.wait(); -} - -template -void -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, - _UnaryPredicate pred) -{ - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _SizeT = uint64_t; - using _TileIdT = TileId::_TileIdT; - using _LookbackScanMemory = LookbackScanMemory<_SizeT, _UseAtomic64>; - using _FlagT = typename _LookbackScanMemory::_FlagT; - - const ::std::size_t n = __in_rng.size(); - - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; - - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; - ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); - ::std::size_t num_workitems = num_wgs * wgsize; - - ScanMemoryManager<_SizeT, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); - scratch.allocate(num_wgs); - - // Memory Structure: - // [Lookback Scan Memory, Tile Id Counter] - auto scan_memory_begin = scratch.scan_memory_ptr(); - auto status_flags_begin = _LookbackScanMemory::get_flags_begin(scan_memory_begin, num_wgs); - auto tile_id_begin = scratch.tile_id_ptr(); - - ::std::size_t num_elements = _LookbackScanMemory::get_num_elements(num_wgs); - // fill_num_wgs num_elements + 1 to also initialize tile_id_counter - ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); - - auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); - - auto event = __queue.submit([&](sycl::handler& hdl) { - auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); - - auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - hdl.depends_on(fill_event); - - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), - [=](const sycl::nd_item<1>& item) 
[[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - auto sg = item.get_sub_group(); - constexpr ::std::uint32_t stride = wgsize; - - // Init tile_id - std::uint32_t tile_id; - if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) - { - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - tile_id = tile_id_lacc[0]; - } - else - { - tile_id = group.get_group_linear_id(); - } - - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((tile_id + 1) * elems_in_tile <= n) - { -#pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) - { - _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls -#pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) - { - _SizeT satisfies_pred = 0; - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id + elems_in_tile * tile_id < n) - { - val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 2: Global scan across wg_count - _SizeT prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (sg.get_group_id() == 0) - { - _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, wg_count); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + wg_count); - } - - _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) - { - __out_rng[start_idx + i] = wg_copy_if_values[i]; - } - if (tile_id == (num_wgs - 1) && group.leader()) - __num_rng[0] = start_idx + wg_count; - }); - }); - - event.wait(); - scratch.free(); -} - -template -void -single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, - _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) -{ - auto __n = __in_end - __in_begin; - - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); - auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); - auto __buf2 = __keep2(__out_begin, __out_begin + __n); - - auto __keep_num = - 
oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); - auto __buf_num = __keep2(__num_begin, __num_begin + 1); - - single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), - __buf_num.all_view(), pred); -} - -template -void -single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, - _NumSelectedRange __num_begin, _UnaryPredicate pred) -{ - auto __n = __in_end - __in_begin; - - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); - auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); - auto __buf2 = __keep2(__out_begin, __out_begin + __n); - - auto __keep_num = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); - auto __buf_num = __keep2(__num_begin, __num_begin + 1); - - single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>( - __queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); -} - } // inline namespace igpu } // namespace oneapi::dpl::experimental::kt diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index a77b76491e7..e0a079eaa3f 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -16,17 +16,15 @@ #include "support/test_config.h" #include "support/utils.h" +#include #include _PSTL_TEST_HEADER(execution) #include _PSTL_TEST_HEADER(numeric) using namespace TestUtils; -template -class CopyIfKernel; - -template +template bool -test(Predicate pred, Generator gen) +test(Predicate pred, Generator gen, KernelParam param) { bool all_passed = true; sycl::queue q; @@ -43,12 +41,8 @@ test(Predicate pred, Generator gen) T* out_ptr = sycl::malloc_device(n, q); size_t* out_num = sycl::malloc_device(1, q); - constexpr int n_elements_per_workitem = 8; - q.copy(in.data(), in_ptr, n).wait(); - using KernelParams = - oneapi::dpl::experimental::kt::kernel_param>; - oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred); + oneapi::dpl::experimental::kt::gpu::single_pass_copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred, param); Sequence kt_out(n); size_t num_selected = 0; @@ -93,12 +87,18 @@ int main() { bool all_passed = true; + constexpr int n_elements_per_workitem = 8; + + auto param = oneapi::dpl::experimental::kt::kernel_param{}; all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, - [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); - all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); + [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }, + TestUtils::get_new_kernel_params<0>(param)); + all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }, + TestUtils::get_new_kernel_params<1>(param)); all_passed &= test([](const std::int32_t& x) { return x != 42; }, - [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); + [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 
std::int32_t(j + 1) : 42; }, + TestUtils::get_new_kernel_params<2>(param)); return all_passed; } From 1b10214d09bf8f9f231e914a937c94f506ded22f Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 10:32:47 -0400 Subject: [PATCH 085/134] formatting Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 341 +++++++++--------- 1 file changed, 167 insertions(+), 174 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index e420dad591a..df85e047ca1 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -44,14 +44,13 @@ class __lookback_kernel; static constexpr int SUBGROUP_SIZE = 32; - template struct ScanMemoryManager { using _FlagStorageType = typename _ScanStatusFlag::_FlagStorageType; using _ValueType = typename _ScanStatusFlag::_ValueType; - ScanMemoryManager(sycl::queue __q, std::size_t num_wgs) : __queue{__q}, __num_wgs(num_wgs) {}; + ScanMemoryManager(sycl::queue __q, std::size_t num_wgs) : __queue{__q}, __num_wgs(num_wgs){}; ::std::uint8_t* scan_memory_ptr() noexcept @@ -72,12 +71,10 @@ struct ScanMemoryManager sycl::event async_free(sycl::event dependency) { - return __queue.submit( - [e = dependency, ptr = scan_memory_begin, __q = __queue](sycl::handler& hdl) - { - hdl.depends_on(e); - hdl.host_task([=]() { sycl::free(ptr, __q); }); - }); + return __queue.submit([e = dependency, ptr = scan_memory_begin, __q = __queue](sycl::handler& hdl) { + hdl.depends_on(e); + hdl.host_task([=]() { sycl::free(ptr, __q); }); + }); } void @@ -116,7 +113,6 @@ struct ScanMemoryManager } private: - std::size_t get_tile_values_bytes() { @@ -273,8 +269,8 @@ template void __lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlags& __status_flags, - _StatusValues& __status_vals_full, _StatusValues& __status_vals_partial, std::uint32_t __tile_id, - _Type& __local_reduction, _Type& __prev_tile_reduction, _BinaryOp __binary_op) + _StatusValues& __status_vals_full, _StatusValues& __status_vals_partial, std::uint32_t __tile_id, + _Type& __local_reduction, _Type& __prev_tile_reduction, _BinaryOp __binary_op) { // The first sub-group will query the previous tiles to find a prefix if (__subgroup.get_group_id() == 0) @@ -296,7 +292,6 @@ __lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlag __prev_tile_reduction = sycl::group_broadcast(__group, __prev_tile_reduction, 0); } - template struct __lookback_submitter; @@ -382,7 +377,7 @@ struct __lookback_kernel_func _Type __prev_tile_reduction{}; __lookback_phase<_FlagType>(__group, __subgroup, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, __local_reduction, __prev_tile_reduction, __binary_op); + __tile_id, __local_reduction, __prev_tile_reduction, __binary_op); sycl::joint_inclusive_scan(__group, __tile_vals_ptr, __tile_vals_ptr + __wg_local_memory_size, __out_begin, __binary_op, __prev_tile_reduction); @@ -469,7 +464,6 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r std::size_t __elems_in_tile = __workgroup_size * __data_per_workitem; std::size_t __num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); - ScanMemoryManager<_FlagType> __device_mem_mgr(__queue, __num_wgs); __device_mem_mgr.allocate(); @@ -507,8 +501,8 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } -template +template void 
single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred, _KernelParam) @@ -533,77 +527,78 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), - [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - - // Global load into local - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if (elems_in_tile <= n) - { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& + item) [[intel::reqd_sub_group_size( + SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + + // Global load into local + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if (elems_in_tile <= n) + { #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) - { - _Type val = __in_rng[i + wg_local_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - else - { + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _Type val = __in_rng[i + wg_local_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = + sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + else + { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) - { - _SizeT satisfies_pred = 0; - // launder is used here to create data without initialization without requiring - // a default constructor or out of bounds access - // TODO: replace with "union" trick to avoid launder, - // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id < n) - { - val = __in_rng[i + wg_local_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) - { - __out_rng[i] = wg_copy_if_values[i]; - } - if (group.leader()) - __num_rng[0] = wg_count; - }); + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + // launder is used here to create data without initialization without requiring + // a default constructor or out of bounds access + // TODO: replace with "union" trick to avoid launder, + // see 
https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id < n) + { + val = __in_rng[i + wg_local_id]; + + satisfies_pred = pred(val); + } + _SizeT count = + sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[i] = wg_copy_if_values[i]; + } + if (group.leader()) + __num_rng[0] = wg_count; + }); }); event.wait(); } -template +template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred, _KernelParam) @@ -615,14 +610,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ using _BinaryOp = std::plus<_SizeT>; - using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __lookback_init_kernel<_KernelName, _SizeT, _BinaryOp>>; using _FlagType = __scan_status_flag<_SizeT>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - const ::std::size_t n = __in_rng.size(); constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; @@ -654,99 +647,100 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ hdl.depends_on(__fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), - [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto __group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - auto sg = item.get_sub_group(); - constexpr ::std::uint32_t stride = wgsize; - - std::uint32_t __tile_id = 0; - - // Obtain unique ID for this work-group that will be used in decoupled lookback - if (__group.leader()) - { - sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space> - __idx_atomic(__status_flags[__status_flags_size - 1]); - __tile_id = __idx_atomic.fetch_add(1); - } + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& + item) [[intel::reqd_sub_group_size( + SUBGROUP_SIZE)]] { + auto __group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + auto sg = item.get_sub_group(); + constexpr ::std::uint32_t stride = wgsize; + + std::uint32_t __tile_id = 0; + + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (__group.leader()) + { + sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + __idx_atomic(__status_flags[__status_flags_size - 1]); + __tile_id = __idx_atomic.fetch_add(1); + } - __tile_id = sycl::group_broadcast(__group, __tile_id, 0); + __tile_id = sycl::group_broadcast(__group, __tile_id, 0); - std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; + std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; - _SizeT wg_count = 0; + _SizeT wg_count = 0; - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((__tile_id + 1) * __elems_in_tile <= n) - { + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if 
((__tile_id + 1) * __elems_in_tile <= n) + { #pragma unroll - for (size_t i = 0; i < __elems_in_tile; i += wgsize) - { - // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? - // if load is done in a scalar fashion and provides the same performance, we - // can avoid the broadcast (I think) - // would need to loop over the elements per work item first accumulating into - // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to - // global memory needs to be loaded per work item per element, skipping copies - // when they were not saved. - _Type val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); - } - } - else - { + for (size_t i = 0; i < __elems_in_tile; i += wgsize) + { + // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? + // if load is done in a scalar fashion and provides the same performance, we + // can avoid the broadcast (I think) + // would need to loop over the elements per work item first accumulating into + // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to + // global memory needs to be loaded per work item per element, skipping copies + // when they were not saved. + _Type val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = + sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); + } + } + else + { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < __elems_in_tile; i += wgsize) - { - _SizeT satisfies_pred = 0; - // launder is used here to create data without initialization without requiring - // a default constructor or out of bounds access - // TODO: replace with "union" trick to avoid launder, - // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id + __elems_in_tile * __tile_id < n) - { - val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 2: Global scan across wg_count - _SizeT copied_elements = 0; - - __lookback_phase<_FlagType>(__group, sg, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, wg_count, copied_elements, sycl::plus<_SizeT>()); - - //TODO: explore above comment about scalar load - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) - { - __out_rng[copied_elements + i] = wg_copy_if_values[i]; - } - if (__tile_id == (num_wgs - 1) && __group.leader()) - __num_rng[0] = copied_elements + wg_count; - }); + for (size_t i = 0; i < __elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + // launder is used 
here to create data without initialization without requiring + // a default constructor or out of bounds access + // TODO: replace with "union" trick to avoid launder, + // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id + __elems_in_tile * __tile_id < n) + { + val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; + + satisfies_pred = pred(val); + } + _SizeT count = + sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 2: Global scan across wg_count + _SizeT copied_elements = 0; + + __lookback_phase<_FlagType>(__group, sg, __status_flags, __status_vals_full, __status_vals_partial, + __tile_id, wg_count, copied_elements, sycl::plus<_SizeT>()); + + //TODO: explore above comment about scalar load + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[copied_elements + i] = wg_copy_if_values[i]; + } + if (__tile_id == (num_wgs - 1) && __group.leader()) + __num_rng[0] = copied_elements + wg_count; + }); }); event.wait(); @@ -755,8 +749,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } // namespace __impl -template +template void single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred, @@ -773,12 +767,12 @@ single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIt oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), - __buf_num.all_view(), pred, __param); + __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), + pred, __param); } -template +template void single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred, _KernelParam __param = {}) @@ -794,11 +788,10 @@ single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __i oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred, __param); + __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred, + __param); } - - template sycl::event inclusive_scan(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, From 593c218c9d2f00d8c94e97e64dddd732ec4cc4b5 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 10:54:06 -0400 Subject: [PATCH 086/134] remove launder in favor of lazy ctor union Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 38 +++++++++---------- include/oneapi/dpl/pstl/utils.h | 8 ++++ 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h 
b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index df85e047ca1..14027002f1b 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -563,23 +563,21 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; - // launder is used here to create data without initialization without requiring - // a default constructor or out of bounds access - // TODO: replace with "union" trick to avoid launder, - // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> val; if (i + wg_local_id < n) { - val = __in_rng[i + wg_local_id]; + new (&val.__v) _Type(__in_rng[i + wg_local_id]); - satisfies_pred = pred(val); + satisfies_pred = pred(val.__v); } _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - + if (i + wg_local_id < n) + { + if (satisfies_pred) + wg_copy_if_values[count] = std::move(val.__v); + val.__v.~_Type(); + } wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); } } @@ -705,22 +703,22 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ for (size_t i = 0; i < __elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; - // launder is used here to create data without initialization without requiring - // a default constructor or out of bounds access - // TODO: replace with "union" trick to avoid launder, - // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> val; if (i + wg_local_id + __elems_in_tile * __tile_id < n) { - val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; + new (&val.__v) _Type(__in_rng[i + wg_local_id + __elems_in_tile * __tile_id]); - satisfies_pred = pred(val); + satisfies_pred = pred(val.__v); } _SizeT count = sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - if (satisfies_pred) - wg_copy_if_values[count] = val; + if (i + wg_local_id + __elems_in_tile * __tile_id < n) + { + if (satisfies_pred) + wg_copy_if_values[count] = std::move(val.__v); + val.__v.~_Type(); + } wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); } diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index e8bbde63c04..c68e74e6ef7 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -765,6 +765,14 @@ struct __is_iterator_type<_T, std::void_t::dif template static constexpr bool __is_iterator_type_v = __is_iterator_type<_T>::value; +//For use to lazily create objects values of type _Tp without requiring a default constructibility of _Tp +template +union __lazy_ctor_storage +{ + _Tp __v; + __lazy_ctor_storage() {} +}; + } // namespace __internal } // namespace dpl } // namespace oneapi From 5a1752ce32cabc2b9a033be2a6098d23488fb8ee Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 11:22:20 -0400 Subject: [PATCH 087/134] distinguishing kernel names Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/single_pass_scan.h | 15 
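Aside on the preceding patch (086, "remove launder in favor of lazy ctor union"): the new __lazy_ctor_storage union reserves correctly aligned storage for a value without constructing it, so the kernel can defer construction (via placement new) to the in-bounds branch and destroy the value by hand afterwards. Below is a minimal standalone sketch of the same technique; the names (lazy_storage, slot) are illustrative and not part of oneDPL.

    #include <iostream>
    #include <memory>
    #include <new>
    #include <string>

    // The union member is never implicitly constructed or destroyed, so T does not
    // need a default constructor and no T object exists until placement new creates one.
    template <typename T>
    union lazy_storage
    {
        T value;
        lazy_storage() {}   // begins the union's lifetime, not value's
        ~lazy_storage() {}  // destroying value is the caller's responsibility
    };

    int main()
    {
        lazy_storage<std::string> slot;              // no std::string constructed yet
        bool in_bounds = true;                       // stands in for the i + wg_local_id < n check
        if (in_bounds)
        {
            new (&slot.value) std::string("hello");  // construct only when a real element is loaded
            std::cout << slot.value << '\n';
            std::destroy_at(&slot.value);            // manual destruction, mirroring val.__v.~_Type()
        }
    }

Compared with the earlier alloca/std::launder workaround, the union keeps the storage on the stack with the right alignment and avoids pointer laundering entirely.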
++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 14027002f1b..880aaf2b411 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -36,6 +36,15 @@ namespace gpu namespace __impl { +template +class __copy_if_kernel; + +template +class __copy_if_single_wg_kernel; + +template +class __inclusive_scan_kernel; + template class __lookback_init_kernel; @@ -426,7 +435,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r using _FlagType = __scan_status_flag<_Type>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - using _KernelName = typename _KernelParam::kernel_name; + using _KernelName = __inclusive_scan_kernel; using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __lookback_init_kernel<_KernelName, _Type, _BinaryOp>>; using _LookbackKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -510,7 +519,7 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; using _TileIdT = TileId::_TileIdT; - using _KernelName = typename _KernelParam::kernel_name; + using _KernelName = __copy_if_single_wg_kernel; const ::std::size_t n = __in_rng.size(); @@ -604,7 +613,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; using _TileIdT = TileId::_TileIdT; - using _KernelName = typename _KernelParam::kernel_name; + using _KernelName = __copy_if_kernel; using _BinaryOp = std::plus<_SizeT>; From a0576c3c2967a54debc55a2e0fe3b4c7682967b4 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 11:38:39 -0400 Subject: [PATCH 088/134] uglify Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 267 +++++++++--------- 1 file changed, 126 insertions(+), 141 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 880aaf2b411..ede5ed4c2ae 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -54,71 +54,62 @@ class __lookback_kernel; static constexpr int SUBGROUP_SIZE = 32; template -struct ScanMemoryManager +struct __scan_lookback_mem_mgr { using _FlagStorageType = typename _ScanStatusFlag::_FlagStorageType; using _ValueType = typename _ScanStatusFlag::_ValueType; - ScanMemoryManager(sycl::queue __q, std::size_t num_wgs) : __queue{__q}, __num_wgs(num_wgs){}; - - ::std::uint8_t* - scan_memory_ptr() noexcept - { - return scan_memory_begin; - }; + __scan_lookback_mem_mgr(sycl::queue __q, std::size_t __num_wgs) : __queue{__q}, __num_workgroups(__num_wgs){}; void allocate() { - ::std::size_t scan_memory_size = get_memory_size(); - - scan_memory_begin = sycl::malloc_device<::std::uint8_t>(scan_memory_size, __queue); - if (!scan_memory_begin) + __scan_memory_begin = sycl::malloc_device(get_memory_size(), __queue); + if (!__scan_memory_begin) throw std::bad_alloc(); } sycl::event - async_free(sycl::event dependency) + async_free(sycl::event __dependency) { - return __queue.submit([e = dependency, ptr = scan_memory_begin, __q = __queue](sycl::handler& hdl) { - hdl.depends_on(e); 
- hdl.host_task([=]() { sycl::free(ptr, __q); }); + return __queue.submit([__e = __dependency, __ptr = __scan_memory_begin, __q = __queue](sycl::handler& __hdl) { + __hdl.depends_on(__e); + __hdl.host_task([=]() { sycl::free(__ptr, __q); }); }); } void free() { - sycl::free(scan_memory_begin, __queue); + sycl::free(__scan_memory_begin, __queue); } _FlagStorageType* get_flags_begin() { // Aligned flags - ::std::size_t tile_values_bytes = get_tile_values_bytes(); - void* base_flags = reinterpret_cast(scan_memory_begin + tile_values_bytes); - auto remainder = get_padded_flag_bytes(); // scan_memory_bytes - tile_values_bytes + void* __base_flags = reinterpret_cast(__scan_memory_begin + get_tile_values_bytes()); + auto __remainder = get_padded_flag_bytes(); return reinterpret_cast<_FlagStorageType*>( - ::std::align(::std::alignment_of_v<_FlagStorageType>, get_flag_bytes(), base_flags, remainder)); + ::std::align(::std::alignment_of_v<_FlagStorageType>, get_flag_bytes(), __base_flags, __remainder)); } _ValueType* get_partial_values_begin() { - return reinterpret_cast<_ValueType*>(scan_memory_begin); + return reinterpret_cast<_ValueType*>(__scan_memory_begin); } _ValueType* get_full_values_begin() { - return reinterpret_cast<_ValueType*>(scan_memory_begin + get_num_elements() * sizeof(_ValueType)); + return reinterpret_cast<_ValueType*>(__scan_memory_begin + get_num_elements() * sizeof(_ValueType)); } std::size_t get_num_elements() { - return _ScanStatusFlag::__padding + __num_wgs; + return _ScanStatusFlag::__padding + __num_workgroups; } private: @@ -144,16 +135,11 @@ struct ScanMemoryManager std::size_t get_memory_size() { - // sizeof(_T) extra bytes are not needed because data is going at the beginning of the scratch - ::std::size_t tile_values_bytes = get_tile_values_bytes(); - // Padding to provide room for aligment - ::std::size_t flag_bytes = get_padded_flag_bytes(); - - return tile_values_bytes + flag_bytes; + return get_tile_values_bytes() + get_padded_flag_bytes(); } - std::uint8_t* scan_memory_begin = nullptr; - std::size_t __num_wgs; + std::uint8_t* __scan_memory_begin = nullptr; + std::size_t __num_workgroups; sycl::queue __queue; }; @@ -473,7 +459,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r std::size_t __elems_in_tile = __workgroup_size * __data_per_workitem; std::size_t __num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); - ScanMemoryManager<_FlagType> __device_mem_mgr(__queue, __num_wgs); + __scan_lookback_mem_mgr<_FlagType> __device_mem_mgr(__queue, __num_wgs); __device_mem_mgr.allocate(); @@ -514,54 +500,55 @@ template void single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, - _NumSelectedRange __num_rng, _UnaryPredicate pred, _KernelParam) + _NumSelectedRange __num_rng, _UnaryPredicate __pred, _KernelParam) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; using _TileIdT = TileId::_TileIdT; using _KernelName = __copy_if_single_wg_kernel; + using _BinaryOp = std::plus<_SizeT>; - const ::std::size_t n = __in_rng.size(); + const ::std::size_t __n = __in_rng.size(); - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; + constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr 
std::uint32_t elems_in_tile = wgsize * elems_per_workitem; - ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); - ::std::size_t num_workitems = num_wgs * wgsize; - assert(num_wgs == 1); + // Avoid non_uniform n by padding up to a multiple of __wgsize + constexpr std::uint32_t __elems_in_tile = __wgsize * __elems_per_workitem; + ::std::size_t __num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); + ::std::size_t __num_workitesm = __num_wgs * __wgsize; + assert(__num_wgs == 1); - auto event = __queue.submit([&](sycl::handler& hdl) { - auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + auto __event = __queue.submit([&](sycl::handler& hdl) { + auto __wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& + hdl.parallel_for(sycl::nd_range<1>(__num_workitesm, __wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size( SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; + auto __group = item.get_group(); + auto __wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = __wgsize; // Global load into local - _SizeT wg_count = 0; + _SizeT __wg_count = 0; - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if (elems_in_tile <= n) + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + if (__elems_in_tile <= __n) { #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { - _Type val = __in_rng[i + wg_local_id]; + _Type __val = __in_rng[__i + __wg_local_id]; - _SizeT satisfies_pred = pred(val); - _SizeT count = - sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + _SizeT __satisfies_pred = __pred(__val); + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (satisfies_pred) - wg_copy_if_values[count] = val; + if (__satisfies_pred) + __wg_copy_if_values[__count] = __val; - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } } else @@ -569,46 +556,46 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { - _SizeT satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> val; - if (i + wg_local_id < n) + _SizeT __satisfies_pred = 0; + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; + if (__i + __wg_local_id < __n) { - new (&val.__v) _Type(__in_rng[i + wg_local_id]); + new (&__val.__v) _Type(__in_rng[__i + __wg_local_id]); - satisfies_pred = pred(val.__v); + __satisfies_pred = __pred(__val.__v); } - _SizeT count = - sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - if (i + wg_local_id < n) + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + if (__i + __wg_local_id < __n) { - if (satisfies_pred) - 
wg_copy_if_values[count] = std::move(val.__v); - val.__v.~_Type(); + if (__satisfies_pred) + __wg_copy_if_values[__count] = std::move(__val.__v); + __val.__v.~_Type(); } - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } } // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) + for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) { - __out_rng[i] = wg_copy_if_values[i]; + __out_rng[__i] = __wg_copy_if_values[__i]; } - if (group.leader()) - __num_rng[0] = wg_count; + if (__group.leader()) + __num_rng[0] = __wg_count; }); }); - event.wait(); + __event.wait(); } template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, - _UnaryPredicate pred, _KernelParam) + _UnaryPredicate __pred, _KernelParam) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; @@ -623,44 +610,42 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ using _FlagType = __scan_status_flag<_SizeT>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - const ::std::size_t n = __in_rng.size(); + const ::std::size_t __n = __in_rng.size(); - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; + constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr std::uint32_t __elems_in_tile = wgsize * elems_per_workitem; - ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, __elems_in_tile); - ::std::size_t num_workitems = num_wgs * wgsize; + // Avoid non_uniform n by padding up to a multiple of __wgsize + constexpr std::uint32_t __elems_in_tile = __wgsize * __elems_per_workitem; + ::std::size_t __num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); + ::std::size_t __num_workitesm = __num_wgs * __wgsize; - ScanMemoryManager<_FlagType> scratch(__queue, num_wgs); - scratch.allocate(); + __scan_lookback_mem_mgr<_FlagType> __scratch(__queue, __num_wgs); + __scratch.allocate(); // Memory Structure: // [Lookback Scan Memory, Tile Id Counter] - auto __status_vals_full = scratch.get_full_values_begin(); - auto __status_vals_partial = scratch.get_partial_values_begin(); - auto __status_flags = scratch.get_flags_begin(); + auto __status_vals_full = __scratch.get_full_values_begin(); + auto __status_vals_partial = __scratch.get_partial_values_begin(); + auto __status_flags = __scratch.get_flags_begin(); //adding 1 to the number elements to account for the tile id - std::size_t __status_flags_size = scratch.get_num_elements() + 1; + std::size_t __status_flags_size = __scratch.get_num_elements() + 1; auto __fill_event = __lookback_init_submitter<_FlagType, _SizeT, _BinaryOp, _LookbackInitKernel>{}( __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); - auto event = __queue.submit([&](sycl::handler& hdl) { - auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, hdl); - - auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - hdl.depends_on(__fill_event); + auto __event = __queue.submit([&](sycl::handler& __hdl) { + auto __wg_copy_if_values = 
sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, __hdl); + __hdl.depends_on(__fill_event); - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& - item) [[intel::reqd_sub_group_size( + oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); + __hdl.parallel_for(sycl::nd_range<1>(__num_workitesm, __wgsize), [=](const sycl::nd_item<1>& + __item) [[intel::reqd_sub_group_size( SUBGROUP_SIZE)]] { - auto __group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - auto sg = item.get_sub_group(); - constexpr ::std::uint32_t stride = wgsize; + auto __group = __item.get_group(); + auto __wg_local_id = __item.get_local_id(0); + auto __sg = __item.get_sub_group(); + constexpr ::std::uint32_t __stride = __wgsize; std::uint32_t __tile_id = 0; @@ -677,13 +662,13 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; - _SizeT wg_count = 0; + _SizeT __wg_count = 0; - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((__tile_id + 1) * __elems_in_tile <= n) + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + if ((__tile_id + 1) * __elems_in_tile <= __n) { #pragma unroll - for (size_t i = 0; i < __elems_in_tile; i += wgsize) + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? // if load is done in a scalar fashion and provides the same performance, we @@ -692,16 +677,16 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to // global memory needs to be loaded per work item per element, skipping copies // when they were not saved. 
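Phase 1 in the surrounding kernel packs the elements that satisfy the predicate into local memory: each work-item contributes a 0/1 flag, an exclusive scan over the group turns those flags into output slots, and broadcasting the last item's inclusive total carries the running count into the next strip of the tile. Here is a self-contained single-work-group sketch of that idea with one element per item; the data, predicate, and sizes are made up purely for illustration.

    #include <sycl/sycl.hpp>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        constexpr std::size_t wgsize = 8;
        std::vector<int> in{3, 10, 7, 42, 5, 8, 1, 99};  // exactly wgsize elements
        std::vector<int> out(wgsize, -1);
        std::size_t count = 0;

        sycl::queue q;
        {
            sycl::buffer<int> in_buf(in);
            sycl::buffer<int> out_buf(out);
            sycl::buffer<std::size_t> count_buf(&count, sycl::range<1>{1});

            q.submit([&](sycl::handler& h) {
                sycl::accessor in_acc(in_buf, h, sycl::read_only);
                sycl::accessor out_acc(out_buf, h, sycl::write_only);
                sycl::accessor count_acc(count_buf, h, sycl::write_only);
                h.parallel_for(sycl::nd_range<1>(wgsize, wgsize), [=](sycl::nd_item<1> item) {
                    auto group = item.get_group();
                    std::size_t id = item.get_local_id(0);

                    int val = in_acc[id];
                    std::size_t satisfies = val >= 8 ? 1 : 0;  // toy predicate: keep values >= 8

                    // Exclusive scan of the 0/1 flags gives each kept element its output slot.
                    std::size_t slot =
                        sycl::exclusive_scan_over_group(group, satisfies, sycl::plus<std::size_t>());
                    if (satisfies)
                        out_acc[slot] = val;

                    // The last work-item's inclusive result is the number of kept elements.
                    std::size_t total = sycl::group_broadcast(group, slot + satisfies, wgsize - 1);
                    if (group.leader())
                        count_acc[0] = total;
                });
            });
        } // buffers write back to the vectors here

        std::cout << count << " kept:";
        for (std::size_t i = 0; i < count; ++i)
            std::cout << ' ' << out[i];
        std::cout << '\n';
    }

Because the scan assigns slots in local-id order, kept elements land in the packed buffer in their original order, which is what lets Phase 3 write them out contiguously.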
- _Type val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; + _Type __val = __in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]; - _SizeT satisfies_pred = pred(val); - _SizeT count = - sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + _SizeT __satisfies_pred = __pred(__val); + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (satisfies_pred) - wg_copy_if_values[count] = val; + if (__satisfies_pred) + __wg_copy_if_values[__count] = __val; - wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } } else @@ -709,49 +694,49 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < __elems_in_tile; i += wgsize) + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { - _SizeT satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> val; - if (i + wg_local_id + __elems_in_tile * __tile_id < n) + _SizeT __satisfies_pred = 0; + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) { - new (&val.__v) _Type(__in_rng[i + wg_local_id + __elems_in_tile * __tile_id]); + new (&__val.__v) _Type(__in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]); - satisfies_pred = pred(val.__v); + __satisfies_pred = __pred(__val.__v); } - _SizeT count = - sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (i + wg_local_id + __elems_in_tile * __tile_id < n) + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) { - if (satisfies_pred) - wg_copy_if_values[count] = std::move(val.__v); - val.__v.~_Type(); + if (__satisfies_pred) + __wg_copy_if_values[__count] = std::move(__val.__v); + __val.__v.~_Type(); } - wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } } - // Phase 2: Global scan across wg_count - _SizeT copied_elements = 0; + // Phase 2: Global scan across __wg_count + _SizeT __copied_elements = 0; - __lookback_phase<_FlagType>(__group, sg, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, wg_count, copied_elements, sycl::plus<_SizeT>()); + __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, + __tile_id, __wg_count, __copied_elements, _BinaryOp{}); //TODO: explore above comment about scalar load // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) + for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) { - __out_rng[copied_elements + i] = wg_copy_if_values[i]; + __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; } - if (__tile_id == (num_wgs - 1) && __group.leader()) - __num_rng[0] = copied_elements + wg_count; + if (__tile_id == (__num_wgs - 1) && __group.leader()) + __num_rng[0] = __copied_elements + __wg_count; }); }); - event.wait(); - scratch.free(); + __event.wait(); + __scratch.free(); } } // namespace __impl @@ -760,7 +745,7 @@ template void single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, 
_InIterator __in_end, - _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred, + _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { auto __n = __in_end - __in_begin; @@ -775,14 +760,14 @@ single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIt auto __buf_num = __keep2(__num_begin, __num_begin + 1); __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), - pred, __param); + __pred, __param); } template void single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, - _NumSelectedRange __num_begin, _UnaryPredicate pred, _KernelParam __param = {}) + _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { auto __n = __in_end - __in_begin; @@ -795,7 +780,7 @@ single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __i oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred, + __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), __pred, __param); } From c5065a784c901da356c9aacc4684f4bd188757ae Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 12:02:35 -0400 Subject: [PATCH 089/134] format Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 260 +++++++++--------- 1 file changed, 130 insertions(+), 130 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index ede5ed4c2ae..aee2dbb063d 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -523,69 +523,69 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou auto __wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(__num_workitesm, __wgsize), [=](const sycl::nd_item<1>& - item) [[intel::reqd_sub_group_size( - SUBGROUP_SIZE)]] { - auto __group = item.get_group(); - auto __wg_local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = __wgsize; - - // Global load into local - _SizeT __wg_count = 0; - - // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - if (__elems_in_tile <= __n) - { -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) + hdl.parallel_for( + sycl::nd_range<1>(__num_workitesm, __wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto __group = item.get_group(); + auto __wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = __wgsize; + + // Global load into local + _SizeT __wg_count = 0; + + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + if (__elems_in_tile <= __n) { - _Type __val = __in_rng[__i + __wg_local_id]; +#pragma unroll + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) + { + _Type __val = __in_rng[__i + __wg_local_id]; - _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + 
_SizeT __satisfies_pred = __pred(__val); + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (__satisfies_pred) - __wg_copy_if_values[__count] = __val; + if (__satisfies_pred) + __wg_copy_if_values[__count] = __val; - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); + } } - } - else - { + else + { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) - { - _SizeT __satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; - if (__i + __wg_local_id < __n) + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { - new (&__val.__v) _Type(__in_rng[__i + __wg_local_id]); - - __satisfies_pred = __pred(__val.__v); + _SizeT __satisfies_pred = 0; + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; + if (__i + __wg_local_id < __n) + { + new (&__val.__v) _Type(__in_rng[__i + __wg_local_id]); + + __satisfies_pred = __pred(__val.__v); + } + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + if (__i + __wg_local_id < __n) + { + if (__satisfies_pred) + __wg_copy_if_values[__count] = std::move(__val.__v); + __val.__v.~_Type(); + } + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (__i + __wg_local_id < __n) - { - if (__satisfies_pred) - __wg_copy_if_values[__count] = std::move(__val.__v); - __val.__v.~_Type(); - } - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } - } - // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) - { - __out_rng[__i] = __wg_copy_if_values[__i]; - } - if (__group.leader()) - __num_rng[0] = __wg_count; - }); + // Phase 3: copy values to global memory + for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) + { + __out_rng[__i] = __wg_copy_if_values[__i]; + } + if (__group.leader()) + __num_rng[0] = __wg_count; + }); }); __event.wait(); @@ -639,100 +639,100 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ __hdl.depends_on(__fill_event); oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); - __hdl.parallel_for(sycl::nd_range<1>(__num_workitesm, __wgsize), [=](const sycl::nd_item<1>& - __item) [[intel::reqd_sub_group_size( - SUBGROUP_SIZE)]] { - auto __group = __item.get_group(); - auto __wg_local_id = __item.get_local_id(0); - auto __sg = __item.get_sub_group(); - constexpr ::std::uint32_t __stride = __wgsize; - - std::uint32_t __tile_id = 0; - - // Obtain unique ID for this work-group that will be used in decoupled lookback - if (__group.leader()) - { - sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space> - __idx_atomic(__status_flags[__status_flags_size - 1]); - __tile_id = __idx_atomic.fetch_add(1); - } + __hdl.parallel_for( + sycl::nd_range<1>(__num_workitesm, __wgsize), + [=](const sycl::nd_item<1>& __item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto __group = __item.get_group(); + auto __wg_local_id = __item.get_local_id(0); + auto __sg = __item.get_sub_group(); + constexpr 
::std::uint32_t __stride = __wgsize; + + std::uint32_t __tile_id = 0; + + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (__group.leader()) + { + sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + __idx_atomic(__status_flags[__status_flags_size - 1]); + __tile_id = __idx_atomic.fetch_add(1); + } - __tile_id = sycl::group_broadcast(__group, __tile_id, 0); + __tile_id = sycl::group_broadcast(__group, __tile_id, 0); - std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; + std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; - _SizeT __wg_count = 0; + _SizeT __wg_count = 0; - // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - if ((__tile_id + 1) * __elems_in_tile <= __n) - { -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + if ((__tile_id + 1) * __elems_in_tile <= __n) { - // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? - // if load is done in a scalar fashion and provides the same performance, we - // can avoid the broadcast (I think) - // would need to loop over the elements per work item first accumulating into - // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to - // global memory needs to be loaded per work item per element, skipping copies - // when they were not saved. - _Type __val = __in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]; - - _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__satisfies_pred) - __wg_copy_if_values[__count] = __val; - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) - { - _SizeT __satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { - new (&__val.__v) _Type(__in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]); + // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? + // if load is done in a scalar fashion and provides the same performance, we + // can avoid the broadcast (I think) + // would need to loop over the elements per work item first accumulating into + // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to + // global memory needs to be loaded per work item per element, skipping copies + // when they were not saved. 
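The "Obtain unique ID for this work-group" step just above is what keeps the decoupled look-back safe: the group leader claims the next tile id from a global atomic counter (kept in the last status-flag slot) and broadcasts it to the whole group, so tile ids reflect the order in which groups actually start executing rather than their launch indices. A small standalone sketch of just that mechanism, using USM shared memory and illustrative names:

    #include <sycl/sycl.hpp>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    int main()
    {
        constexpr std::size_t wgsize = 64;
        constexpr std::size_t num_wgs = 16;

        sycl::queue q;
        // One counter shared by all work-groups; each group leader claims the next tile id.
        std::uint32_t* counter = sycl::malloc_shared<std::uint32_t>(1, q);
        std::uint32_t* tile_ids = sycl::malloc_shared<std::uint32_t>(num_wgs, q);
        *counter = 0;

        q.parallel_for(sycl::nd_range<1>(wgsize * num_wgs, wgsize), [=](sycl::nd_item<1> item) {
             auto group = item.get_group();
             std::uint32_t tile_id = 0;
             if (group.leader())
             {
                 sycl::atomic_ref<std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device,
                                  sycl::access::address_space::global_space>
                     idx(*counter);
                 tile_id = idx.fetch_add(1);  // claim the next tile in execution order
             }
             // Every work-item needs the id, so broadcast it from the leader (local id 0).
             tile_id = sycl::group_broadcast(group, tile_id, 0);

             if (group.leader())
                 tile_ids[group.get_group_linear_id()] = tile_id;
         }).wait();

        for (std::size_t i = 0; i < num_wgs; ++i)
            std::cout << "group " << i << " got tile " << tile_ids[i] << '\n';

        sycl::free(counter, q);
        sycl::free(tile_ids, q);
    }

The usual rationale for handing out ids this way is that any lower-numbered tile must already have started executing (it performed its fetch_add), which is what the look-back phase relies on when it spins on earlier tiles' status flags.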
+ _Type __val = __in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]; + + _SizeT __satisfies_pred = __pred(__val); + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - __satisfies_pred = __pred(__val.__v); - } - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + if (__satisfies_pred) + __wg_copy_if_values[__count] = __val; - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls +#pragma unroll + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { - if (__satisfies_pred) - __wg_copy_if_values[__count] = std::move(__val.__v); - __val.__v.~_Type(); + _SizeT __satisfies_pred = 0; + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) + { + new (&__val.__v) _Type(__in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]); + + __satisfies_pred = __pred(__val.__v); + } + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) + { + if (__satisfies_pred) + __wg_copy_if_values[__count] = std::move(__val.__v); + __val.__v.~_Type(); + } + + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } - } - // Phase 2: Global scan across __wg_count - _SizeT __copied_elements = 0; + // Phase 2: Global scan across __wg_count + _SizeT __copied_elements = 0; - __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, __wg_count, __copied_elements, _BinaryOp{}); + __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, + __tile_id, __wg_count, __copied_elements, _BinaryOp{}); - //TODO: explore above comment about scalar load - // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) - { - __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; - } - if (__tile_id == (__num_wgs - 1) && __group.leader()) - __num_rng[0] = __copied_elements + __wg_count; - }); + //TODO: explore above comment about scalar load + // Phase 3: copy values to global memory + for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) + { + __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; + } + if (__tile_id == (__num_wgs - 1) && __group.leader()) + __num_rng[0] = __copied_elements + __wg_count; + }); }); __event.wait(); From 6a7291a61b6beffb888d755cee3bc8eb2396b872 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 15:31:08 -0400 Subject: [PATCH 090/134] change single wg scan to submitter and kernel operator Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 209 +++++++++++------- 1 file changed, 124 insertions(+), 85 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index aee2dbb063d..50fc41b2be6 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -287,10 +287,6 @@ __lookback_phase(const _Group& __group, const 
_SubGroup& __subgroup, _StatusFlag __prev_tile_reduction = sycl::group_broadcast(__group, __prev_tile_reduction, 0); } -template -struct __lookback_submitter; - template @@ -379,9 +375,13 @@ struct __lookback_kernel_func } }; +template +struct __lookback_scan_submitter; + template -struct __lookback_submitter<__data_per_workitem, __workgroup_size, _Type, _FlagType, +struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _FlagType, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { @@ -476,7 +476,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r std::size_t __current_num_items = __current_num_wgs * __workgroup_size; auto __prev_event = - __lookback_submitter<__data_per_workitem, __workgroup_size, _Type, _FlagType, _LookbackKernel>{}( + __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _FlagType, _LookbackKernel>{}( __queue, __fill_event, __in_rng, __out_rng, __binary_op, __n, __status_flags, __status_flags_size, __status_vals_full, __status_vals_partial, __current_num_items); @@ -496,6 +496,121 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } + + +template +struct __copy_if_single_wg_kernel_func +{ + static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; + using _SizeT = std::size_t; + using _BinaryOp = std::plus<_SizeT>; + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + + _InRng __in_rng; + _OutRng __out_rng; + _NumRng __num_rng; + _SizeT __n; + _TileValues __wg_copy_if_values; + + [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void + operator()(const sycl::nd_item<1>& __item) const + { + auto __group = __item.get_group(); + auto __wg_local_id = __item.get_local_id(0); + + // Global load into local + _SizeT __wg_count = 0; + + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + if (__elems_in_tile <= __n) + { +#pragma unroll + for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) + { + _Type __val = __in_rng[__i + __wg_local_id]; + + _SizeT __satisfies_pred = __pred(__val); + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + + if (__satisfies_pred) + __wg_copy_if_values[__count] = __val; + + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls +#pragma unroll + for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) + { + _SizeT __satisfies_pred = 0; + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; + if (__i + __wg_local_id < __n) + { + new (&__val.__v) _Type(__in_rng[__i + __wg_local_id]); + + __satisfies_pred = __pred(__val.__v); + } + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + if (__i + __wg_local_id < __n) + { + if (__satisfies_pred) + __wg_copy_if_values[__count] = std::move(__val.__v); + __val.__v.~_Type(); + } + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); + } + } + + // Phase 3: copy values to global memory + for (int __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) + { + __out_rng[__i] = __wg_copy_if_values[__i]; + } + if (__group.leader()) + __num_rng[0] = __wg_count; + } +}; + + + + +template +struct __copy_if_single_wg_submitter; + +template +struct 
__copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, + oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> +{ + + template + sycl::event + operator()(sycl::queue __q, _InRng&& __in_rng, _OutRng&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n) const + { + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + using _LocalAccessorType = sycl::local_accessor<_Type, 1>; + using _KernelFunc = + __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRng>, + std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _LocalAccessorType>; + + static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; + + return __q.submit([&](sycl::handler& __hdl) { + auto __tile_vals = _LocalAccessorType(sycl::range<1>{__elems_in_tile}, __hdl); + oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); + __hdl.parallel_for<_Name...>(sycl::nd_range<1>(__workgroup_size, __workgroup_size), + _KernelFunc{__in_rng, __out_rng, __n, __num_rng, __tile_vals}); + }); + } +}; + + + + template void @@ -513,82 +628,7 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - // Avoid non_uniform n by padding up to a multiple of __wgsize - constexpr std::uint32_t __elems_in_tile = __wgsize * __elems_per_workitem; - ::std::size_t __num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); - ::std::size_t __num_workitesm = __num_wgs * __wgsize; - assert(__num_wgs == 1); - - auto __event = __queue.submit([&](sycl::handler& hdl) { - auto __wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, hdl); - - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for( - sycl::nd_range<1>(__num_workitesm, __wgsize), - [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto __group = item.get_group(); - auto __wg_local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = __wgsize; - - // Global load into local - _SizeT __wg_count = 0; - - // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - if (__elems_in_tile <= __n) - { -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) - { - _Type __val = __in_rng[__i + __wg_local_id]; - - _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__satisfies_pred) - __wg_copy_if_values[__count] = __val; - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) - { - _SizeT __satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; - if (__i + __wg_local_id < __n) - { - new (&__val.__v) _Type(__in_rng[__i + __wg_local_id]); - - __satisfies_pred = __pred(__val.__v); - } - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (__i + __wg_local_id < __n) - { - if (__satisfies_pred) - __wg_copy_if_values[__count] = std::move(__val.__v); - __val.__v.~_Type(); - } - __wg_count = sycl::group_broadcast(__group, __count + 
__satisfies_pred, __wgsize - 1); - } - } - - // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) - { - __out_rng[__i] = __wg_copy_if_values[__i]; - } - if (__group.leader()) - __num_rng[0] = __wg_count; - }); - }); - - __event.wait(); + __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, __num_rng, __n).wait(); } template __scratch(__queue, __num_wgs); __scratch.allocate(); @@ -640,12 +680,11 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); __hdl.parallel_for( - sycl::nd_range<1>(__num_workitesm, __wgsize), + sycl::nd_range<1>(__num_workitems, __wgsize), [=](const sycl::nd_item<1>& __item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto __group = __item.get_group(); auto __wg_local_id = __item.get_local_id(0); auto __sg = __item.get_sub_group(); - constexpr ::std::uint32_t __stride = __wgsize; std::uint32_t __tile_id = 0; From 1b78dcda322b224c130aad3ce74cef17fe3e81df Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 08:38:22 -0400 Subject: [PATCH 091/134] change scan to submitter and kernel operator Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 205 ++++++++++++------ 1 file changed, 135 insertions(+), 70 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 50fc41b2be6..9a729ae4356 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -263,8 +263,8 @@ struct __lookback_init_submitter<_FlagType, _Type, _BinaryOp, template void -__lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlags& __status_flags, - _StatusValues& __status_vals_full, _StatusValues& __status_vals_partial, std::uint32_t __tile_id, +__lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlags __status_flags, + _StatusValues __status_vals_full, _StatusValues __status_vals_partial, std::uint32_t __tile_id, _Type& __local_reduction, _Type& __prev_tile_reduction, _BinaryOp __binary_op) { // The first sub-group will query the previous tiles to find a prefix @@ -303,7 +303,6 @@ struct __lookback_kernel_func std::size_t __status_flags_size; _StatusValues __status_vals_full; _StatusValues __status_vals_partial; - std::size_t __current_num_items; _TileVals __tile_vals; [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void @@ -408,7 +407,7 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ __hdl.parallel_for<_Name...>(sycl::nd_range<1>(__current_num_items, __workgroup_size), _KernelFunc{__in_rng, __out_rng, __binary_op, __n, __status_flags, __status_flags_size, __status_vals_full, __status_vals_partial, - __current_num_items, __tile_vals}); + __tile_vals}); }); } }; @@ -498,7 +497,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r -template +template struct __copy_if_single_wg_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -510,6 +509,7 @@ struct __copy_if_single_wg_kernel_func _OutRng __out_rng; _NumRng __num_rng; _SizeT __n; + _UnaryPredicate __pred; _TileValues __wg_copy_if_values; [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void @@ -587,15 +587,15 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, 
oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event - operator()(sycl::queue __q, _InRng&& __in_rng, _OutRng&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n) const + operator()(sycl::queue __q, _InRng&& __in_rng, _OutRng&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred) const { using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRng>, - std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _LocalAccessorType>; + std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, std::decay_t<_LocalAccessorType>>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -603,7 +603,7 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, auto __tile_vals = _LocalAccessorType(sycl::range<1>{__elems_in_tile}, __hdl); oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); __hdl.parallel_for<_Name...>(sycl::nd_range<1>(__workgroup_size, __workgroup_size), - _KernelFunc{__in_rng, __out_rng, __n, __num_rng, __tile_vals}); + _KernelFunc{__in_rng, __out_rng, __num_rng, __n, __pred, __tile_vals}); }); } }; @@ -617,71 +617,42 @@ void single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate __pred, _KernelParam) { - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _SizeT = uint64_t; - using _TileIdT = TileId::_TileIdT; using _KernelName = __copy_if_single_wg_kernel; - using _BinaryOp = std::plus<_SizeT>; const ::std::size_t __n = __in_rng.size(); constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, __num_rng, __n).wait(); + __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, __num_rng, __n, __pred).wait(); } -template -void -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, - _UnaryPredicate __pred, _KernelParam) -{ - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _SizeT = uint64_t; - using _TileIdT = TileId::_TileIdT; - using _KernelName = __copy_if_kernel; +template +struct __copy_if_kernel_func +{ + static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; + using _SizeT = std::size_t; using _BinaryOp = std::plus<_SizeT>; - - using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __lookback_init_kernel<_KernelName, _SizeT, _BinaryOp>>; - - using _FlagType = __scan_status_flag<_SizeT>; + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - const ::std::size_t __n = __in_rng.size(); - - constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - - // Avoid non_uniform n by padding up to a multiple of __wgsize - constexpr std::uint32_t __elems_in_tile = __wgsize * __elems_per_workitem; - ::std::size_t __num_wgs = 
oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); - ::std::size_t __num_workitems = __num_wgs * __wgsize; - - __scan_lookback_mem_mgr<_FlagType> __scratch(__queue, __num_wgs); - __scratch.allocate(); - - // Memory Structure: - // [Lookback Scan Memory, Tile Id Counter] - auto __status_vals_full = __scratch.get_full_values_begin(); - auto __status_vals_partial = __scratch.get_partial_values_begin(); - auto __status_flags = __scratch.get_flags_begin(); - //adding 1 to the number elements to account for the tile id - std::size_t __status_flags_size = __scratch.get_num_elements() + 1; - - auto __fill_event = __lookback_init_submitter<_FlagType, _SizeT, _BinaryOp, _LookbackInitKernel>{}( - __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); - - auto __event = __queue.submit([&](sycl::handler& __hdl) { - auto __wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, __hdl); - __hdl.depends_on(__fill_event); + _InRng __in_rng; + _OutRng __out_rng; + _NumRng __num_rng; + _SizeT __n; + _UnaryPredicate __pred; + _StatusFlags __status_flags; + std::size_t __status_flags_size; + _StatusValues __status_vals_full; + _StatusValues __status_vals_partial; + _TileValues __wg_copy_if_values; + std::size_t __current_num_wgs; - oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); - __hdl.parallel_for( - sycl::nd_range<1>(__num_workitems, __wgsize), - [=](const sycl::nd_item<1>& __item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void + operator()(const sycl::nd_item<1>& __item) const + { auto __group = __item.get_group(); auto __wg_local_id = __item.get_local_id(0); auto __sg = __item.get_sub_group(); @@ -707,7 +678,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ if ((__tile_id + 1) * __elems_in_tile <= __n) { #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) + for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) { // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? 
// if load is done in a scalar fashion and provides the same performance, we @@ -725,7 +696,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ if (__satisfies_pred) __wg_copy_if_values[__count] = __val; - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); } } else @@ -733,7 +704,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) + for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) { _SizeT __satisfies_pred = 0; oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; @@ -753,7 +724,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ __val.__v.~_Type(); } - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); } } @@ -765,17 +736,111 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ //TODO: explore above comment about scalar load // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) + for (int __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) { __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; } - if (__tile_id == (__num_wgs - 1) && __group.leader()) + if (__tile_id == (__current_num_wgs - 1) && __group.leader()) __num_rng[0] = __copied_elements + __wg_count; - }); - }); + } + +}; + + + +template +struct __copy_if_submitter; + +template +struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, + oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> +{ + + template + sycl::event + operator()(sycl::queue __q, _Event __fill_event, _InRng&& __in_rng, _OutRng&& __out_rng, + _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, std::size_t __status_flags_size, + _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items, std::size_t __current_num_wgs) const + { + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + using _LocalAccessorType = sycl::local_accessor<_Type, 1>; + using _KernelFunc = + __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRng>, + std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, + std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, + std::decay_t<_LocalAccessorType>>; + + static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; + + return __q.submit([&](sycl::handler& __hdl) { + auto __wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, __hdl); + __hdl.depends_on(__fill_event); + + auto __tile_vals = _LocalAccessorType(sycl::range<1>{__elems_in_tile}, __hdl); + oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); + __hdl.parallel_for<_Name...>(sycl::nd_range<1>(__current_num_items, __workgroup_size), + _KernelFunc{__in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, + __status_flags_size, __status_vals_full, __status_vals_partial, + __tile_vals, __current_num_wgs}); + }); + } +}; + + + +template +void 
+single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, + _UnaryPredicate __pred, _KernelParam) +{ + using _SizeT = uint64_t; + using _KernelName = __copy_if_kernel; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _FlagType = __scan_status_flag<_SizeT>; + + using _BinaryOp = std::plus<_SizeT>; + + using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __lookback_init_kernel<_KernelName, _SizeT, _BinaryOp>>; + + using _CopyIfKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __copy_if_kernel<_KernelName, _Type, _BinaryOp>>; + + const std::size_t __n = __in_rng.size(); + + constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; + constexpr std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of __workgroup_size + constexpr std::uint32_t __elems_in_tile = __workgroup_size * __elems_per_workitem; + std::size_t __current_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); + std::size_t __current_num_items = __current_num_wgs * __workgroup_size; + + __scan_lookback_mem_mgr<_FlagType> __device_mem_mgr(__queue, __current_num_wgs); + __device_mem_mgr.allocate(); + + // Memory Structure: + // [Lookback Scan Memory, Tile Id Counter] + auto __status_vals_full = __device_mem_mgr.get_full_values_begin(); + auto __status_vals_partial = __device_mem_mgr.get_partial_values_begin(); + auto __status_flags = __device_mem_mgr.get_flags_begin(); + //adding 1 to the number elements to account for the tile id + std::size_t __status_flags_size = __device_mem_mgr.get_num_elements() + 1; + + auto __fill_event = __lookback_init_submitter<_FlagType, _SizeT, _BinaryOp, _LookbackInitKernel>{}( + __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); + + auto submitter = __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}; - __event.wait(); - __scratch.free(); + submitter(__queue, __fill_event,__in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, __status_flags_size, + __status_vals_full, + __status_vals_partial, + __current_num_items, __current_num_wgs).wait(); + __device_mem_mgr.free(); } } // namespace __impl From 246dbf33c37fea7f650928186740399905de6dd8 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 08:38:42 -0400 Subject: [PATCH 092/134] formatting Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 228 ++++++++---------- 1 file changed, 107 insertions(+), 121 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 9a729ae4356..73a24d4be16 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -381,7 +381,7 @@ struct __lookback_scan_submitter; template struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _FlagType, - oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> + oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { template @@ -495,9 +495,8 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } - - -template +template struct __copy_if_single_wg_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * 
__data_per_workitem; @@ -530,8 +529,7 @@ struct __copy_if_single_wg_kernel_func _Type __val = __in_rng[__i + __wg_local_id]; _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); if (__satisfies_pred) __wg_copy_if_values[__count] = __val; @@ -541,8 +539,8 @@ struct __copy_if_single_wg_kernel_func } else { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls #pragma unroll for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) { @@ -554,8 +552,7 @@ struct __copy_if_single_wg_kernel_func __satisfies_pred = __pred(__val.__v); } - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); if (__i + __wg_local_id < __n) { if (__satisfies_pred) @@ -576,26 +573,24 @@ struct __copy_if_single_wg_kernel_func } }; - - - template struct __copy_if_single_wg_submitter; template -struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, +struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { template sycl::event - operator()(sycl::queue __q, _InRng&& __in_rng, _OutRng&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred) const + operator()(sycl::queue __q, _InRng&& __in_rng, _OutRng&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, + _UnaryPredicate __pred) const { using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; - using _KernelFunc = - __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRng>, - std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, std::decay_t<_LocalAccessorType>>; + using _KernelFunc = __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRng>, + std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, + _UnaryPredicate, std::decay_t<_LocalAccessorType>>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -608,9 +603,6 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, } }; - - - template void @@ -624,12 +616,14 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, __num_rng, __n, __pred).wait(); + __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, + __num_rng, __n, __pred) + .wait(); } - -template +template struct __copy_if_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -653,125 +647,120 @@ struct __copy_if_kernel_func [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void operator()(const sycl::nd_item<1>& __item) const { - auto __group = __item.get_group(); - auto __wg_local_id = __item.get_local_id(0); - auto __sg = 
__item.get_sub_group(); + auto __group = __item.get_group(); + auto __wg_local_id = __item.get_local_id(0); + auto __sg = __item.get_sub_group(); - std::uint32_t __tile_id = 0; + std::uint32_t __tile_id = 0; - // Obtain unique ID for this work-group that will be used in decoupled lookback - if (__group.leader()) - { - sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space> - __idx_atomic(__status_flags[__status_flags_size - 1]); - __tile_id = __idx_atomic.fetch_add(1); - } + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (__group.leader()) + { + sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + __idx_atomic(__status_flags[__status_flags_size - 1]); + __tile_id = __idx_atomic.fetch_add(1); + } - __tile_id = sycl::group_broadcast(__group, __tile_id, 0); + __tile_id = sycl::group_broadcast(__group, __tile_id, 0); - std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; + std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; - _SizeT __wg_count = 0; + _SizeT __wg_count = 0; - // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - if ((__tile_id + 1) * __elems_in_tile <= __n) - { -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) - { - // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? - // if load is done in a scalar fashion and provides the same performance, we - // can avoid the broadcast (I think) - // would need to loop over the elements per work item first accumulating into - // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to - // global memory needs to be loaded per work item per element, skipping copies - // when they were not saved. - _Type __val = __in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]; - - _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__satisfies_pred) - __wg_copy_if_values[__count] = __val; - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + if ((__tile_id + 1) * __elems_in_tile <= __n) + { #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) - { - _SizeT __satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) - { - new (&__val.__v) _Type(__in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]); - - __satisfies_pred = __pred(__val.__v); - } - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) - { - if (__satisfies_pred) - __wg_copy_if_values[__count] = std::move(__val.__v); - __val.__v.~_Type(); - } - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); - } - } + for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) + { + // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? 
+ // if load is done in a scalar fashion and provides the same performance, we + // can avoid the broadcast (I think) + // would need to loop over the elements per work item first accumulating into + // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to + // global memory needs to be loaded per work item per element, skipping copies + // when they were not saved. + _Type __val = __in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]; - // Phase 2: Global scan across __wg_count - _SizeT __copied_elements = 0; + _SizeT __satisfies_pred = __pred(__val); + _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, __wg_count, __copied_elements, _BinaryOp{}); + if (__satisfies_pred) + __wg_copy_if_values[__count] = __val; - //TODO: explore above comment about scalar load - // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls +#pragma unroll + for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) + { + _SizeT __satisfies_pred = 0; + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) { - __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; + new (&__val.__v) _Type(__in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]); + + __satisfies_pred = __pred(__val.__v); } - if (__tile_id == (__current_num_wgs - 1) && __group.leader()) - __num_rng[0] = __copied_elements + __wg_count; + _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) + { + if (__satisfies_pred) + __wg_copy_if_values[__count] = std::move(__val.__v); + __val.__v.~_Type(); + } + + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); } + } -}; + // Phase 2: Global scan across __wg_count + _SizeT __copied_elements = 0; + __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, __tile_id, + __wg_count, __copied_elements, _BinaryOp{}); + //TODO: explore above comment about scalar load + // Phase 3: copy values to global memory + for (int __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) + { + __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; + } + if (__tile_id == (__current_num_wgs - 1) && __group.leader()) + __num_rng[0] = __copied_elements + __wg_count; + } +}; -template +template struct __copy_if_submitter; -template +template struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, - oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> + oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event operator()(sycl::queue __q, _Event __fill_event, _InRng&& __in_rng, _OutRng&& __out_rng, - _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, std::size_t __status_flags_size, - _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items, std::size_t 
__current_num_wgs) const + _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, + std::size_t __status_flags_size, _StatusValues&& __status_vals_full, + _StatusValues&& __status_vals_partial, std::size_t __current_num_items, + std::size_t __current_num_wgs) const { using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRng>, - std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, - std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, - std::decay_t<_LocalAccessorType>>; + std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, + std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, + std::decay_t<_LocalAccessorType>>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -789,8 +778,6 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, } }; - - template void @@ -836,10 +823,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto submitter = __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}; - submitter(__queue, __fill_event,__in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, __status_flags_size, - __status_vals_full, - __status_vals_partial, - __current_num_items, __current_num_wgs).wait(); + submitter(__queue, __fill_event, __in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, __status_flags_size, + __status_vals_full, __status_vals_partial, __current_num_items, __current_num_wgs) + .wait(); __device_mem_mgr.free(); } From 3023fec71f179fa679cb7899e2ef3e60328d2f18 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 08:40:24 -0400 Subject: [PATCH 093/134] remove unnecessary variable Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 73a24d4be16..70a4a0630c0 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -821,10 +821,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto __fill_event = __lookback_init_submitter<_FlagType, _SizeT, _BinaryOp, _LookbackInitKernel>{}( __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); - auto submitter = __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}; - - submitter(__queue, __fill_event, __in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, __status_flags_size, - __status_vals_full, __status_vals_partial, __current_num_items, __current_num_wgs) + __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}( + __queue, __fill_event, __in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, __status_flags_size, + __status_vals_full, __status_vals_partial, __current_num_items, __current_num_wgs) .wait(); __device_mem_mgr.free(); } From 52f6d82bd0222d2bc787fa2bf44b1311f693f966 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 11:02:59 -0400 Subject: [PATCH 094/134] renaming public APIs Signed-off-by: Dan Hoeflinger --- 
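Note (illustrative usage, not part of this patch): after the rename, the copy_if kernel template is invoked exactly as the updated test below does. The sketch assumes USM device pointers, placeholder tuning values (16 elements per work-item, work-group size 256), and an evenness predicate; none of these values are taken from the patch series.

    // Sketch only: call the renamed oneapi::dpl::experimental::kt::gpu::copy_if.
    sycl::queue q;
    constexpr std::size_t n = 1 << 20;
    int* in = sycl::malloc_device<int>(n, q);
    int* out = sycl::malloc_device<int>(n, q);
    std::size_t* num_selected = sycl::malloc_device<std::size_t>(1, q);
    // ... initialize `in`, e.g. q.copy(host_data, in, n).wait() ...
    oneapi::dpl::experimental::kt::kernel_param<16, 256> param; // <data_per_workitem, workgroup_size>
    oneapi::dpl::experimental::kt::gpu::copy_if(q, in, in + n, out, num_selected,
                                                [](int v) { return v % 2 == 0; }, param);
    q.wait(); // wait before reading num_selected / out on the host
    sycl::free(in, q);
    sycl::free(out, q);
    sycl::free(num_selected, q);
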
include/oneapi/dpl/experimental/kt/single_pass_scan.h | 4 ++-- test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 70a4a0630c0..eaa70c3c271 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -833,7 +833,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ template void -single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, +copy_if_single_wg(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { @@ -855,7 +855,7 @@ single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIt template void -single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, +copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { auto __n = __in_end - __in_begin; diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index e0a079eaa3f..48fed4733f6 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -42,7 +42,7 @@ test(Predicate pred, Generator gen, KernelParam param) size_t* out_num = sycl::malloc_device(1, q); q.copy(in.data(), in_ptr, n).wait(); - oneapi::dpl::experimental::kt::gpu::single_pass_copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred, param); + oneapi::dpl::experimental::kt::gpu::copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred, param); Sequence kt_out(n); size_t num_selected = 0; From 8b504d359cb507e4f5f7b0d2a605bdcd41ac6494 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 11:59:06 -0400 Subject: [PATCH 095/134] sync with scan for asychronicity Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index eaa70c3c271..0e748e11261 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -616,9 +616,8 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, - __num_rng, __n, __pred) - .wait(); + return __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, + __num_rng, __n, __pred); } template {}( __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); - __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}( + sycl::event __prev_event = __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}( __queue, __fill_event, __in_rng, __out_rng, __num_rng, 
__n, __pred, __status_flags, __status_flags_size, - __status_vals_full, __status_vals_partial, __current_num_items, __current_num_wgs) - .wait(); - __device_mem_mgr.free(); + __status_vals_full, __status_vals_partial, __current_num_items, __current_num_wgs); + + // TODO: Currently, the following portion of code makes this entire function synchronous. + // Ideally, we should be able to use the asynchronous free below, but we have found that doing + // so introduces a large unexplainable slowdown. Once this slowdown has been identified and corrected, + // we should replace this code with the asynchronous version below. + if (0) + { + return __device_mem_mgr.async_free(__prev_event); + } + else + { + __prev_event.wait(); + __device_mem_mgr.free(); + return __prev_event; + } } } // namespace __impl @@ -848,7 +860,7 @@ copy_if_single_wg(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_ oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), + return __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), __pred, __param); } @@ -869,7 +881,7 @@ copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutI oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), __pred, + return __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), __pred, __param); } From 6f60f10d9c150cc57cffb4db640b15cd9606833b Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 12:02:14 -0400 Subject: [PATCH 096/134] sycl::event returns Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 0e748e11261..6d404ca6c03 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -605,7 +605,7 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, template -void +sycl::event single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate __pred, _KernelParam) { @@ -779,7 +779,7 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, template -void +sycl::event single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate __pred, _KernelParam) { @@ -844,7 +844,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ template -void +sycl::event copy_if_single_wg(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) @@ -866,7 +866,7 @@ copy_if_single_wg(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_ template -void +sycl::event copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator 
__out_begin, _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { From 0dfcd15e439da0803b0174e1d31f19f02e276452 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 16:19:03 -0400 Subject: [PATCH 097/134] naming and minor fixes Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 69 ++++++++++--------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 6d404ca6c03..c673bedf8b4 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -288,15 +288,15 @@ __lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlag } template struct __lookback_kernel_func { using _FlagStorageType = typename _FlagType::_FlagStorageType; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; - _InRng __in_rng; - _OutRng __out_rng; + _InRange __in_rng; + _OutRange __out_rng; _BinaryOp __binary_op; std::size_t __n; _StatusFlags __status_flags; @@ -384,17 +384,17 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event - operator()(sycl::queue __q, sycl::event __prev_event, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, + operator()(sycl::queue __q, sycl::event __prev_event, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items) const { using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = - __lookback_kernel_func<__data_per_workitem, __workgroup_size, _Type, _FlagType, std::decay_t<_InRng>, - std::decay_t<_OutRng>, std::decay_t<_BinaryOp>, std::decay_t<_StatusFlags>, + __lookback_kernel_func<__data_per_workitem, __workgroup_size, _Type, _FlagType, std::decay_t<_InRange>, + std::decay_t<_OutRange>, std::decay_t<_BinaryOp>, std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, std::decay_t<_LocalAccessorType>>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -495,17 +495,17 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } -template struct __copy_if_single_wg_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; using _SizeT = std::size_t; using _BinaryOp = std::plus<_SizeT>; - using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - _InRng __in_rng; - _OutRng __out_rng; + _InRange __in_rng; + _OutRange __out_rng; _NumRng __num_rng; _SizeT __n; _UnaryPredicate __pred; @@ -581,15 +581,15 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event - operator()(sycl::queue __q, _InRng&& __in_rng, _OutRng&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, + operator()(sycl::queue __q, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred) const { - using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + using _Type = 
oneapi::dpl::__internal::__value_t<_InRange>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; - using _KernelFunc = __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRng>, - std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, + using _KernelFunc = __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRange>, + std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, std::decay_t<_LocalAccessorType>>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -609,30 +609,33 @@ sycl::event single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate __pred, _KernelParam) { - using _KernelName = __copy_if_single_wg_kernel; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _KernelName = __copy_if_single_wg_kernel; + using _CopyIfSingleWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __copy_if_kernel<_KernelName, _Type>>; const ::std::size_t __n = __in_rng.size(); constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - return __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, + return __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, __num_rng, __n, __pred); } -template struct __copy_if_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; using _SizeT = std::size_t; using _BinaryOp = std::plus<_SizeT>; - using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - _InRng __in_rng; - _OutRng __out_rng; + _InRange __in_rng; + _OutRange __out_rng; _NumRng __num_rng; _SizeT __n; _UnaryPredicate __pred; @@ -744,20 +747,20 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template sycl::event - operator()(sycl::queue __q, _Event __fill_event, _InRng&& __in_rng, _OutRng&& __out_rng, + operator()(sycl::queue __q, _Event __fill_event, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, std::size_t __status_flags_size, _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items, std::size_t __current_num_wgs) const { - using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = - __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRng>, - std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, + __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRange>, + std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, std::decay_t<_LocalAccessorType>>; @@ -794,7 +797,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ __lookback_init_kernel<_KernelName, _SizeT, _BinaryOp>>; 
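    // Note (explanatory comment, not part of the original source): the two
    // __kernel_name_provider aliases above and below wrap the kernel template's
    // name parameter so that the lookback-initialization kernel and the main
    // copy_if kernel are submitted under distinct compile-time SYCL kernel names;
    // the provider machinery itself is oneDPL-internal and unchanged by this patch.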
using _CopyIfKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __copy_if_kernel<_KernelName, _Type, _BinaryOp>>; + __copy_if_kernel<_KernelName, _Type>>; const std::size_t __n = __in_rng.size(); @@ -885,13 +888,13 @@ copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutI __param); } -template +template sycl::event -inclusive_scan(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, +inclusive_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, _KernelParam __param = {}) { - auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRng>(__in_rng)); - auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__out_rng)); + auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRange>(__in_rng)); + auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRange>(__out_rng)); return __impl::__single_pass_scan(__queue, std::move(__in_view), std::move(__out_view), __binary_op, __param); } From 12db7225976742befda8911e0c579792f3192d0d Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 16:37:04 -0400 Subject: [PATCH 098/134] removing single_wg public api Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 53 +++++-------------- 1 file changed, 12 insertions(+), 41 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index c673bedf8b4..cbb5ea8d29f 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -603,25 +603,6 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, } }; -template -sycl::event -single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, - _NumSelectedRange __num_rng, _UnaryPredicate __pred, _KernelParam) -{ - - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _KernelName = __copy_if_single_wg_kernel; - using _CopyIfSingleWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __copy_if_kernel<_KernelName, _Type>>; - const ::std::size_t __n = __in_rng.size(); - - constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - - return __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, - __num_rng, __n, __pred); -} template >; + using _CopyIfSingleWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __copy_if_single_wg_kernel<_KernelName, _Type>>; + const std::size_t __n = __in_rng.size(); constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; @@ -809,6 +793,15 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ std::size_t __current_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); std::size_t __current_num_items = __current_num_wgs * __workgroup_size; + //If we fit in a single WG, use the single wg version + if (__current_num_wgs == 1) + { + return __copy_if_single_wg_submitter<__elems_per_workitem, __workgroup_size, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, + __num_rng, __n, __pred); + } + + + __scan_lookback_mem_mgr<_FlagType> __device_mem_mgr(__queue, __current_num_wgs); __device_mem_mgr.allocate(); @@ -845,28 +838,6 @@ 
single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } // namespace __impl -template -sycl::event -copy_if_single_wg(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, - _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate __pred, - _KernelParam __param = {}) -{ - auto __n = __in_end - __in_begin; - - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); - auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); - auto __buf2 = __keep2(__out_begin, __out_begin + __n); - - auto __keep_num = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); - auto __buf_num = __keep2(__num_begin, __num_begin + 1); - - return __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), - __pred, __param); -} - template sycl::event From 4850fcf3c0616ccbaa07a84c86623768da064a92 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 17:03:20 -0400 Subject: [PATCH 099/134] temporarily disable single wg version Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index cbb5ea8d29f..5aedaa88291 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -794,11 +794,11 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ std::size_t __current_num_items = __current_num_wgs * __workgroup_size; //If we fit in a single WG, use the single wg version - if (__current_num_wgs == 1) - { - return __copy_if_single_wg_submitter<__elems_per_workitem, __workgroup_size, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, - __num_rng, __n, __pred); - } + // if (__current_num_wgs == 1) + // { + // return __copy_if_single_wg_submitter<__elems_per_workitem, __workgroup_size, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, + // __num_rng, __n, __pred); + // } From a8ccbd104623c5b1061faf2ee82a9ed7b333cef1 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 30 May 2024 12:10:21 -0400 Subject: [PATCH 100/134] wait after call for async algs Signed-off-by: Dan Hoeflinger --- test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 48fed4733f6..a46b76a3be2 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -42,7 +42,7 @@ test(Predicate pred, Generator gen, KernelParam param) size_t* out_num = sycl::malloc_device(1, q); q.copy(in.data(), in_ptr, n).wait(); - oneapi::dpl::experimental::kt::gpu::copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred, param); + oneapi::dpl::experimental::kt::gpu::copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred, param).wait(); Sequence kt_out(n); size_t num_selected = 0; From 70ad4897f93c1b3481b0fede2d4f34523b665348 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 30 May 2024 12:18:33 -0400 Subject: [PATCH 101/134] reenable single wg Signed-off-by: Dan 
Hoeflinger --- .../oneapi/dpl/experimental/kt/single_pass_scan.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 5aedaa88291..359ac108382 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -794,13 +794,11 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ std::size_t __current_num_items = __current_num_wgs * __workgroup_size; //If we fit in a single WG, use the single wg version - // if (__current_num_wgs == 1) - // { - // return __copy_if_single_wg_submitter<__elems_per_workitem, __workgroup_size, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, - // __num_rng, __n, __pred); - // } - - + if (__current_num_wgs == 1) + { + return __copy_if_single_wg_submitter<__elems_per_workitem, __workgroup_size, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, + __num_rng, __n, __pred); + } __scan_lookback_mem_mgr<_FlagType> __device_mem_mgr(__queue, __current_num_wgs); __device_mem_mgr.allocate(); From e78b72c0c017a3ad431516f6c72e6dc277185717 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 30 May 2024 16:22:43 -0400 Subject: [PATCH 102/134] only need single phase for single wg Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 21 ++++++------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 359ac108382..e2bd3f358e1 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -496,7 +496,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } template + typename _NumRng, typename _UnaryPredicate> struct __copy_if_single_wg_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -509,7 +509,6 @@ struct __copy_if_single_wg_kernel_func _NumRng __num_rng; _SizeT __n; _UnaryPredicate __pred; - _TileValues __wg_copy_if_values; [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void operator()(const sycl::nd_item<1>& __item) const @@ -520,8 +519,7 @@ struct __copy_if_single_wg_kernel_func // Global load into local _SizeT __wg_count = 0; - // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - if (__elems_in_tile <= __n) + if (__elems_in_tile == __n) { #pragma unroll for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) @@ -532,7 +530,7 @@ struct __copy_if_single_wg_kernel_func _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); if (__satisfies_pred) - __wg_copy_if_values[__count] = __val; + __out_rng[__count] = __val; __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); } @@ -556,18 +554,13 @@ struct __copy_if_single_wg_kernel_func if (__i + __wg_local_id < __n) { if (__satisfies_pred) - __wg_copy_if_values[__count] = std::move(__val.__v); + __out_rng[__count] = std::move(__val.__v); __val.__v.~_Type(); } __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); } } - // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) - { - __out_rng[__i] = __wg_copy_if_values[__i]; - } if (__group.leader()) __num_rng[0] = 
__wg_count; } @@ -587,18 +580,16 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, _UnaryPredicate __pred) const { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRange>, std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, - _UnaryPredicate, std::decay_t<_LocalAccessorType>>; + _UnaryPredicate>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; return __q.submit([&](sycl::handler& __hdl) { - auto __tile_vals = _LocalAccessorType(sycl::range<1>{__elems_in_tile}, __hdl); oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); __hdl.parallel_for<_Name...>(sycl::nd_range<1>(__workgroup_size, __workgroup_size), - _KernelFunc{__in_rng, __out_rng, __num_rng, __n, __pred, __tile_vals}); + _KernelFunc{__in_rng, __out_rng, __num_rng, __n, __pred}); }); } }; From 2ca42871824e9bcf48a3998e35ef2347f8eacbaf Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 31 May 2024 09:50:26 -0400 Subject: [PATCH 103/134] reusing single workgroup copy_if from oneDPL main Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 46 ++++---- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 111 ++++++++++-------- .../dpcpp/parallel_backend_sycl_utils.h | 8 +- 3 files changed, 97 insertions(+), 68 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index e2bd3f358e1..29b9fb94ef7 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -386,8 +386,8 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ template sycl::event - operator()(sycl::queue __q, sycl::event __prev_event, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, - std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, + operator()(sycl::queue __q, sycl::event __prev_event, _InRange&& __in_rng, _OutRange&& __out_rng, + _BinaryOp __binary_op, std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items) const { @@ -576,13 +576,13 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, template sycl::event - operator()(sycl::queue __q, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, - _UnaryPredicate __pred) const + operator()(sycl::queue __q, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange&& __num_rng, + std::size_t __n, _UnaryPredicate __pred) const { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _KernelFunc = __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRange>, - std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, - _UnaryPredicate>; + using _KernelFunc = + __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRange>, + std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -594,7 +594,6 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, } }; - template @@ -719,8 +718,8 @@ struct 
__copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event operator()(sycl::queue __q, _Event __fill_event, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, @@ -768,14 +767,26 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __lookback_init_kernel<_KernelName, _SizeT, _BinaryOp>>; - using _CopyIfKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __copy_if_kernel<_KernelName, _Type>>; + using _CopyIfKernel = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__copy_if_kernel<_KernelName, _Type>>; using _CopyIfSingleWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __copy_if_single_wg_kernel<_KernelName, _Type>>; const std::size_t __n = __in_rng.size(); + // Next power of 2 greater than or equal to __n + auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(__n); + + //If we fit in a single WG SLM, use the single wg version from oneDPL main + if (oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__queue, __n, __n_uniform)) + { + return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( + oneapi::dpl::execution::__dpl::make_device_policy<_CopyIfSingleWgKernel>(__queue), __n, + std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), + std::forward<_NumSelectedRange>(__num_rng), __pred); + } + constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; constexpr std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; @@ -784,13 +795,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ std::size_t __current_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); std::size_t __current_num_items = __current_num_wgs * __workgroup_size; - //If we fit in a single WG, use the single wg version - if (__current_num_wgs == 1) - { - return __copy_if_single_wg_submitter<__elems_per_workitem, __workgroup_size, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, - __num_rng, __n, __pred); - } - __scan_lookback_mem_mgr<_FlagType> __device_mem_mgr(__queue, __current_num_wgs); __device_mem_mgr.allocate(); @@ -831,7 +835,7 @@ template sycl::event copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, - _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) + _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { auto __n = __in_end - __in_begin; @@ -845,7 +849,7 @@ copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutI auto __buf_num = __keep2(__num_begin, __num_begin + 1); return __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), __pred, - __param); + __param); } template diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 162fcf2c282..6edd2625080 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -574,11 +574,11 @@ template > { - template + template auto - operator()(const _Policy& __policy, 
_InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init, - _BinaryOperation __bin_op, _UnaryOp __unary_op) + operator()(const _Policy& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_copied_rng, + ::std::size_t __n, _InitType __init, _BinaryOperation __bin_op, _UnaryOp __unary_op) { using _ValueType = ::std::uint16_t; @@ -589,16 +589,13 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W constexpr ::std::uint32_t __elems_per_wg = _ElemsPerItem * _WGSize; - sycl::buffer<_Size> __res(sycl::range<1>(1)); - - auto __event = __policy.queue().submit([&](sycl::handler& __hdl) { - oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng); + return __policy.queue().submit([&](sycl::handler& __hdl) { + oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_copied_rng); // Local memory is split into two parts. The first half stores the result of applying the // predicate on each element of the input range. The second half stores the index of the output // range to copy elements of the input range. auto __lacc = __dpl_sycl::__local_accessor<_ValueType>(sycl::range<1>{__elems_per_wg * 2}, __hdl); - auto __res_acc = __res.template get_access(__hdl); __hdl.parallel_for<_ScanKernelName...>( sycl::nd_range<1>(_WGSize, _WGSize), [=](sycl::nd_item<1> __self_item) { @@ -656,11 +653,10 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W if (__item_id == 0) { // Add predicate of last element to account for the scan's exclusivity - __res_acc[0] = __lacc[__elems_per_wg + __n - 1] + __lacc[__n - 1]; + __num_copied_rng[0] = __lacc[__elems_per_wg + __n - 1] + __lacc[__n - 1]; } }); }); - return __future(__event, __res); } }; @@ -832,9 +828,11 @@ struct __invoke_single_group_copy_if // Specialization for devices that have a max work-group size of at least 1024 static constexpr ::std::uint16_t __targeted_wg_size = 1024; - template <::std::uint16_t _Size, typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred> + template <::std::uint16_t _Size, typename _ExecutionPolicy, typename _InRng, typename _OutRng, + typename _NumCopiedRng, typename _Pred> auto - operator()(_ExecutionPolicy&& __exec, ::std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred) + operator()(_ExecutionPolicy&& __exec, ::std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, + _NumCopiedRng&& __num_copied_rng, _Pred&& __pred) { constexpr ::std::uint16_t __wg_size = ::std::min(_Size, __targeted_wg_size); constexpr ::std::uint16_t __num_elems_per_item = ::oneapi::dpl::__internal::__dpl_ceiling_div(_Size, __wg_size); @@ -846,23 +844,23 @@ struct __invoke_single_group_copy_if if (__is_full_group) return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, true, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __scan_copy_single_wg_kernel<::std::integral_constant<::std::uint16_t, __wg_size>, - ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, - /* _IsFullGroup= */ std::true_type, _CustomName>> - >()( - __exec, ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n, _InitType{}, - _ReduceOp{}, ::std::forward<_Pred>(__pred)); + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __scan_copy_single_wg_kernel, + std::integral_constant, + /* _IsFullGroup= */ std::true_type, _CustomName>>>()( + __exec, std::forward<_InRng>(__in_rng), 
std::forward<_OutRng>(__out_rng), + std::forward<_NumCopiedRng>(__num_copied_rng), __n, _InitType{}, _ReduceOp{}, + std::forward<_Pred>(__pred)); else return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, false, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __scan_copy_single_wg_kernel<::std::integral_constant<::std::uint16_t, __wg_size>, - ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, - /* _IsFullGroup= */ std::false_type, _CustomName>> - >()( - __exec, ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n, _InitType{}, - _ReduceOp{}, ::std::forward<_Pred>(__pred)); + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __scan_copy_single_wg_kernel, + std::integral_constant, + /* _IsFullGroup= */ std::false_type, _CustomName>>>()( + __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), + std::forward<_NumCopiedRng>(__num_copied_rng), __n, _InitType{}, _ReduceOp{}, + std::forward<_Pred>(__pred)); } }; @@ -907,36 +905,57 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag __copy_by_mask_op); } +template +bool +__group_copy_if_fits_in_slm(const sycl::queue& __queue, _Size __n, std::size_t __n_uniform) +{ + using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>; + ::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__queue); + + // The kernel stores n 16 bit integers for the predicate and another n 16 bit integers for the offsets, + // so check "scan" for a 32 bit type. + return (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<::std::uint32_t>(__queue, __n, __n_uniform) && + __max_wg_size >= _SingleGroupInvoker::__targeted_wg_size); +} + +template +auto +__dispatch_small_copy_if(_ExecutionPolicy&& __exec, _Size __n, _InRng&& __in_rng, _OutRng&& __out_rng, + _NumCopiedRng&& __num_copied_rng, _Pred __pred) +{ + using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>; + + using _SizeBreakpoints = + std::integer_sequence; + + return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch( + _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), std::forward<_NumCopiedRng>(__num_copied_rng), __pred); +} + template auto __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred) { - using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>; - // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(static_cast<::std::make_unsigned_t<_Size>>(__n)); - // Pessimistically only use half of the memory to take into account memory used by compiled kernel - const ::std::size_t __max_slm_size = - __exec.queue().get_device().template get_info() / 2; - - // The kernel stores n integers for the predicate and another n integers for the offsets - const auto __req_slm_size = sizeof(::std::uint16_t) * __n_uniform * 2; - - constexpr ::std::uint16_t __single_group_upper_limit = 16384; + if (oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__exec.queue(), __n, __n_uniform)) + { + sycl::buffer<_Size> __res(sycl::range<1>(1)); + auto __res_iterator = oneapi::dpl::begin(__res); - ::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + 
auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, + decltype(__res_iterator)>(); + auto __res_rng = __keep(__res_iterator, __res_iterator + 1).all_view(); - if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size && - __max_wg_size >= _SingleGroupInvoker::__targeted_wg_size) - { - using _SizeBreakpoints = - ::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384>; + sycl::event __event = oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( + std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), std::move(__res_rng), __pred); - return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch( - _SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng), - ::std::forward<_OutRng>(__out_rng), __pred); + return __future(__event, __res); } else { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index e0b153e31e2..e29261991b3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -55,11 +55,17 @@ __device_info(const _ExecutionPolicy& __policy) } #endif +::std::size_t +__max_work_group_size(const sycl::queue& __queue) +{ + return __queue.get_device().template get_info(); +} + template ::std::size_t __max_work_group_size(const _ExecutionPolicy& __policy) { - return __policy.queue().get_device().template get_info(); + return oneapi::dpl::__internal::__max_work_group_size(__policy.queue()); } template From 7538ed6babf05f2cfa42ec7ba3a46d7426fa9070 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 31 May 2024 10:16:52 -0400 Subject: [PATCH 104/134] add option to opt out of compiling single wg Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/kernel_param.h | 3 ++- .../dpl/experimental/kt/single_pass_scan.h | 16 +++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/kernel_param.h b/include/oneapi/dpl/experimental/kt/kernel_param.h index b3ee36be189..bbed93e777c 100644 --- a/include/oneapi/dpl/experimental/kt/kernel_param.h +++ b/include/oneapi/dpl/experimental/kt/kernel_param.h @@ -18,12 +18,13 @@ namespace oneapi::dpl::experimental::kt { template + typename _KernelName = oneapi::dpl::execution::DefaultKernelName, typename _SingleWgOptOut = std::false_type> struct kernel_param { static constexpr std::uint16_t data_per_workitem = __data_per_work_item; static constexpr std::uint16_t workgroup_size = __work_group_size; using kernel_name = _KernelName; + using single_wg_opt_out = _SingleWgOptOut; }; } // namespace oneapi::dpl::experimental::kt diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 29b9fb94ef7..6d4d13ead6a 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -778,15 +778,17 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(__n); - //If we fit in a single WG SLM, use the single wg version from oneDPL main - if (oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__queue, 
__n, __n_uniform)) + if constexpr (std::negation_v) { - return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( - oneapi::dpl::execution::__dpl::make_device_policy<_CopyIfSingleWgKernel>(__queue), __n, - std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), - std::forward<_NumSelectedRange>(__num_rng), __pred); + //If we fit in a single WG SLM, use the single wg version from oneDPL main + if (oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__queue, __n, __n_uniform)) + { + return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( + oneapi::dpl::execution::__dpl::make_device_policy<_CopyIfSingleWgKernel>(__queue), __n, + std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), + std::forward<_NumSelectedRange>(__num_rng), __pred); + } } - constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; constexpr std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; From 293d7240e7aba7b9aaa6800af53d4b727382afe2 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 31 May 2024 10:51:08 -0400 Subject: [PATCH 105/134] adding opt out for single wg inclusive scan Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 6d4d13ead6a..87d4309c2f6 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -441,14 +441,18 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(__n); - // Perform a single-work group scan if the input is small - if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__queue, __n, __n_uniform)) + if constexpr (std::negation_v) { - return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::execution::__dpl::make_device_policy(__queue), - std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), __n, - oneapi::dpl::__internal::__no_op{}, unseq_backend::__no_init_value<_Type>{}, __binary_op, std::true_type{}); + // Perform a single-work group scan if the input is small + if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__queue, __n, __n_uniform)) + { + return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::execution::__dpl::make_device_policy(__queue), + std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), __n, + oneapi::dpl::__internal::__no_op{}, unseq_backend::__no_init_value<_Type>{}, __binary_op, + std::true_type{}); + } } constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; From 565ba3ba8d667f6a9450750a8d77863d058665ed Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 31 May 2024 15:23:16 -0400 Subject: [PATCH 106/134] remove single_wg kt, in favor of main oneDPL version Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 98 ------------------- 1 file changed, 98 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 87d4309c2f6..3e54ce98bab 100644 --- 
a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -499,104 +499,6 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } -template -struct __copy_if_single_wg_kernel_func -{ - static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; - using _SizeT = std::size_t; - using _BinaryOp = std::plus<_SizeT>; - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - - _InRange __in_rng; - _OutRange __out_rng; - _NumRng __num_rng; - _SizeT __n; - _UnaryPredicate __pred; - - [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void - operator()(const sycl::nd_item<1>& __item) const - { - auto __group = __item.get_group(); - auto __wg_local_id = __item.get_local_id(0); - - // Global load into local - _SizeT __wg_count = 0; - - if (__elems_in_tile == __n) - { -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) - { - _Type __val = __in_rng[__i + __wg_local_id]; - - _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__satisfies_pred) - __out_rng[__count] = __val; - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) - { - _SizeT __satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; - if (__i + __wg_local_id < __n) - { - new (&__val.__v) _Type(__in_rng[__i + __wg_local_id]); - - __satisfies_pred = __pred(__val.__v); - } - _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (__i + __wg_local_id < __n) - { - if (__satisfies_pred) - __out_rng[__count] = std::move(__val.__v); - __val.__v.~_Type(); - } - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); - } - } - - if (__group.leader()) - __num_rng[0] = __wg_count; - } -}; - -template -struct __copy_if_single_wg_submitter; - -template -struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, - oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> -{ - - template - sycl::event - operator()(sycl::queue __q, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange&& __num_rng, - std::size_t __n, _UnaryPredicate __pred) const - { - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _KernelFunc = - __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRange>, - std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate>; - - static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; - - return __q.submit([&](sycl::handler& __hdl) { - oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); - __hdl.parallel_for<_Name...>(sycl::nd_range<1>(__workgroup_size, __workgroup_size), - _KernelFunc{__in_rng, __out_rng, __num_rng, __n, __pred}); - }); - } -}; template Date: Tue, 4 Jun 2024 09:47:09 -0400 Subject: [PATCH 107/134] trying scalar version of copy_if Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 122 +++++++++++++++++- 1 file changed, 116 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h 
b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 3e54ce98bab..56fb5a0bb91 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -267,24 +267,24 @@ __lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlag _StatusValues __status_vals_full, _StatusValues __status_vals_partial, std::uint32_t __tile_id, _Type& __local_reduction, _Type& __prev_tile_reduction, _BinaryOp __binary_op) { - // The first sub-group will query the previous tiles to find a prefix - if (__subgroup.get_group_id() == 0) + // The last sub-group will query the previous tiles to find a prefix + if (__subgroup.get_group_id() == (__subgroup.get_group_range()[0] - 1)) { _FlagType __flag(__status_flags, __status_vals_full, __status_vals_partial, __tile_id); - if (__subgroup.get_local_id() == 0) + if (__subgroup.get_local_id() == __subgroup.get_local_range()[0] - 1) { __flag.set_partial(__local_reduction); } __prev_tile_reduction = __flag.cooperative_lookback(__subgroup, __binary_op); - if (__subgroup.get_local_id() == 0) + if (__subgroup.get_local_id() == __subgroup.get_local_range()[0] - 1) { __flag.set_full(__binary_op(__prev_tile_reduction, __local_reduction)); } } - __prev_tile_reduction = sycl::group_broadcast(__group, __prev_tile_reduction, 0); + __prev_tile_reduction = sycl::group_broadcast(__group, __prev_tile_reduction, __group.get_local_range()[0] - 1); } template +struct __copy_if_kernel_func_scalar +{ + static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; + using _SizeT = std::size_t; + using _BinaryOp = std::plus<_SizeT>; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _FlagStorageType = typename _FlagType::_FlagStorageType; + + _InRange __in_rng; + _OutRange __out_rng; + _NumRng __num_rng; + _SizeT __n; + _UnaryPredicate __pred; + _StatusFlags __status_flags; + std::size_t __status_flags_size; + _StatusValues __status_vals_full; + _StatusValues __status_vals_partial; + _TileValues __wg_copy_if_values; + std::size_t __current_num_wgs; + + [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void + operator()(const sycl::nd_item<1>& __item) const + { + auto __group = __item.get_group(); + auto __wg_local_id = __item.get_local_id(0); + auto __sg = __item.get_sub_group(); + + std::uint32_t __tile_id = 0; + + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (__group.leader()) + { + sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + __idx_atomic(__status_flags[__status_flags_size - 1]); + __tile_id = __idx_atomic.fetch_add(1); + } + + __tile_id = sycl::group_broadcast(__group, __tile_id, 0); + + std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; + + std::uint16_t __wi_count = 0; + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + + //TODO: check if it is better to check this at a subgroup or wg level rather than work item + if ((__wg_local_id + 1) * __data_per_workitem + __tile_id * __elems_in_tile <= __n) + { +#pragma unroll + for (size_t __i = 0; __i < __data_per_workitem; ++__i) + { + // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? 
+ // if load is done in a scalar fashion and provides the same performance, we + // can avoid the broadcast (I think) + // would need to loop over the elements per work item first accumulating into + // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to + // global memory needs to be loaded per work item per element, skipping copies + // when they were not saved. + _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; + + if (__pred(__val)) + { + __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem] = __val; + ++__wi_count; + } + } + + } + else + { + // Edge of input, have to handle memory bounds + for (size_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; ++__i) + { + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) + { + _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; + + if (__pred(__val)) + { + __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem] = __val; + ++__wi_count; + } + } + + } + } + _SizeT __wg_count = __wi_count; + __wg_count = sycl::exclusive_scan_over_group(__group, __wg_count, _BinaryOp{}); + + // Phase 2: Global scan across __wg_count + _SizeT __copied_elements = 0; + + __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, __tile_id, + __wg_count, __copied_elements, _BinaryOp{}); + + // Phase 3: copy values to global memory + for (int __i = 0; __i < __wi_count; ++__i) + { + __out_rng[__copied_elements + __wg_count + __i] = __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem]; + } + if (__tile_id == (__current_num_wgs - 1) && __wg_local_id == (__workgroup_size - 1)) + __num_rng[0] = __copied_elements + __wg_count + __wi_count; + } +}; + template struct __copy_if_submitter; @@ -636,7 +746,7 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = - __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRange>, + __copy_if_kernel_func_scalar<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRange>, std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, std::decay_t<_LocalAccessorType>>; From e49bc9f4140b26798a2db498291d086ef2e3c20b Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 09:54:43 -0400 Subject: [PATCH 108/134] fix Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/single_pass_scan.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 56fb5a0bb91..dfb633df854 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -551,7 +551,7 @@ struct __copy_if_kernel_func if ((__tile_id + 1) * __elems_in_tile <= __n) { #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) + for (std::size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) { // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? 
// if load is done in a scalar fashion and provides the same performance, we @@ -576,7 +576,7 @@ struct __copy_if_kernel_func // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) + for (std::size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) { _SizeT __satisfies_pred = 0; oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; @@ -607,7 +607,7 @@ struct __copy_if_kernel_func //TODO: explore above comment about scalar load // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) + for (std::size_t __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) { __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; } @@ -670,7 +670,7 @@ struct __copy_if_kernel_func_scalar if ((__wg_local_id + 1) * __data_per_workitem + __tile_id * __elems_in_tile <= __n) { #pragma unroll - for (size_t __i = 0; __i < __data_per_workitem; ++__i) + for (std::uint16_t __i = 0; __i < __data_per_workitem; ++__i) { // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? // if load is done in a scalar fashion and provides the same performance, we @@ -692,7 +692,7 @@ struct __copy_if_kernel_func_scalar else { // Edge of input, have to handle memory bounds - for (size_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; ++__i) + for (std::uint16_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; ++__i) { if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) { @@ -717,9 +717,9 @@ struct __copy_if_kernel_func_scalar __wg_count, __copied_elements, _BinaryOp{}); // Phase 3: copy values to global memory - for (int __i = 0; __i < __wi_count; ++__i) + for (std::uint16_t __i = 0; __i < __wi_count; ++__i) { - __out_rng[__copied_elements + __wg_count + __i] = __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem]; + __out_rng[__copied_elements + __wg_count + __i] = __wg_copy_if_values[__i + __wg_local_id * __data_per_workitem]; } if (__tile_id == (__current_num_wgs - 1) && __wg_local_id == (__workgroup_size - 1)) __num_rng[0] = __copied_elements + __wg_count + __wi_count; From 92438ee85f272d0bb47f25af3a6fc2596605d917 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 09:57:25 -0400 Subject: [PATCH 109/134] fix Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/single_pass_scan.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index dfb633df854..3f0da5a4407 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -694,17 +694,13 @@ struct __copy_if_kernel_func_scalar // Edge of input, have to handle memory bounds for (std::uint16_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; ++__i) { - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) - { - _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; + _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; - if (__pred(__val)) - { - __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem] = __val; - ++__wi_count; - } + if (__pred(__val)) + { + 
__wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem] = __val; + ++__wi_count; } - } } _SizeT __wg_count = __wi_count; From 5495da2cf81c14c9d6bd62436d53ec6ef0b492a8 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 10:05:24 -0400 Subject: [PATCH 110/134] full sum Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 3f0da5a4407..858df2990ad 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -710,7 +710,7 @@ struct __copy_if_kernel_func_scalar _SizeT __copied_elements = 0; __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, __tile_id, - __wg_count, __copied_elements, _BinaryOp{}); + __wg_count + __wi_count, __copied_elements, _BinaryOp{}); // Phase 3: copy values to global memory for (std::uint16_t __i = 0; __i < __wi_count; ++__i) From a7ca1b5695be9d8834bb4be93fe59c8527e614a3 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 10:07:05 -0400 Subject: [PATCH 111/134] switching arg to const ref Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 858df2990ad..2be524cfe24 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -265,7 +265,7 @@ template Date: Tue, 4 Jun 2024 07:34:52 -0700 Subject: [PATCH 112/134] branch by tile, not by workitem --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 2be524cfe24..4175d68725e 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -667,18 +667,11 @@ struct __copy_if_kernel_func_scalar // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values //TODO: check if it is better to check this at a subgroup or wg level rather than work item - if ((__wg_local_id + 1) * __data_per_workitem + __tile_id * __elems_in_tile <= __n) + if ((__tile_id + 1) * __elems_in_tile <= __n) { #pragma unroll for (std::uint16_t __i = 0; __i < __data_per_workitem; ++__i) { - // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? - // if load is done in a scalar fashion and provides the same performance, we - // can avoid the broadcast (I think) - // would need to loop over the elements per work item first accumulating into - // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to - // global memory needs to be loaded per work item per element, skipping copies - // when they were not saved. 
_Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; if (__pred(__val)) From f26aff0fa4bc4858bb748f9aa64e09e6caf5287d Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 11:48:58 -0400 Subject: [PATCH 113/134] removing unused block_strided version Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 122 +----------------- 1 file changed, 1 insertion(+), 121 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 4175d68725e..d70e0aca63d 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -499,7 +499,6 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } - template @@ -523,124 +522,6 @@ struct __copy_if_kernel_func _TileValues __wg_copy_if_values; std::size_t __current_num_wgs; - [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void - operator()(const sycl::nd_item<1>& __item) const - { - auto __group = __item.get_group(); - auto __wg_local_id = __item.get_local_id(0); - auto __sg = __item.get_sub_group(); - - std::uint32_t __tile_id = 0; - - // Obtain unique ID for this work-group that will be used in decoupled lookback - if (__group.leader()) - { - sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space> - __idx_atomic(__status_flags[__status_flags_size - 1]); - __tile_id = __idx_atomic.fetch_add(1); - } - - __tile_id = sycl::group_broadcast(__group, __tile_id, 0); - - std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; - - _SizeT __wg_count = 0; - - // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - if ((__tile_id + 1) * __elems_in_tile <= __n) - { -#pragma unroll - for (std::size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) - { - // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? - // if load is done in a scalar fashion and provides the same performance, we - // can avoid the broadcast (I think) - // would need to loop over the elements per work item first accumulating into - // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to - // global memory needs to be loaded per work item per element, skipping copies - // when they were not saved. 
- _Type __val = __in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]; - - _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__satisfies_pred) - __wg_copy_if_values[__count] = __val; - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls -#pragma unroll - for (std::size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) - { - _SizeT __satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) - { - new (&__val.__v) _Type(__in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]); - - __satisfies_pred = __pred(__val.__v); - } - _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) - { - if (__satisfies_pred) - __wg_copy_if_values[__count] = std::move(__val.__v); - __val.__v.~_Type(); - } - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); - } - } - - // Phase 2: Global scan across __wg_count - _SizeT __copied_elements = 0; - - __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, __tile_id, - __wg_count, __copied_elements, _BinaryOp{}); - - //TODO: explore above comment about scalar load - // Phase 3: copy values to global memory - for (std::size_t __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) - { - __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; - } - if (__tile_id == (__current_num_wgs - 1) && __group.leader()) - __num_rng[0] = __copied_elements + __wg_count; - } -}; - - - -template -struct __copy_if_kernel_func_scalar -{ - static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; - using _SizeT = std::size_t; - using _BinaryOp = std::plus<_SizeT>; - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _FlagStorageType = typename _FlagType::_FlagStorageType; - - _InRange __in_rng; - _OutRange __out_rng; - _NumRng __num_rng; - _SizeT __n; - _UnaryPredicate __pred; - _StatusFlags __status_flags; - std::size_t __status_flags_size; - _StatusValues __status_vals_full; - _StatusValues __status_vals_partial; - _TileValues __wg_copy_if_values; - std::size_t __current_num_wgs; - [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void operator()(const sycl::nd_item<1>& __item) const { @@ -666,7 +547,6 @@ struct __copy_if_kernel_func_scalar std::uint16_t __wi_count = 0; // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - //TODO: check if it is better to check this at a subgroup or wg level rather than work item if ((__tile_id + 1) * __elems_in_tile <= __n) { #pragma unroll @@ -735,7 +615,7 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = - __copy_if_kernel_func_scalar<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRange>, + __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRange>, std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, 
std::decay_t<_LocalAccessorType>>; From c0ab6512e0cfa36cb77ae7f0f0e18d28112afe4f Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 11:55:37 -0400 Subject: [PATCH 114/134] range API and formatting Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index d70e0aca63d..42f7fcc7f78 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -560,12 +560,12 @@ struct __copy_if_kernel_func ++__wi_count; } } - } else { // Edge of input, have to handle memory bounds - for (std::uint16_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; ++__i) + for (std::uint16_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; + ++__i) { _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; @@ -576,7 +576,7 @@ struct __copy_if_kernel_func } } } - _SizeT __wg_count = __wi_count; + _SizeT __wg_count = __wi_count; __wg_count = sycl::exclusive_scan_over_group(__group, __wg_count, _BinaryOp{}); // Phase 2: Global scan across __wg_count @@ -588,7 +588,8 @@ struct __copy_if_kernel_func // Phase 3: copy values to global memory for (std::uint16_t __i = 0; __i < __wi_count; ++__i) { - __out_rng[__copied_elements + __wg_count + __i] = __wg_copy_if_values[__i + __wg_local_id * __data_per_workitem]; + __out_rng[__copied_elements + __wg_count + __i] = + __wg_copy_if_values[__i + __wg_local_id * __data_per_workitem]; } if (__tile_id == (__current_num_wgs - 1) && __wg_local_id == (__workgroup_size - 1)) __num_rng[0] = __copied_elements + __wg_count + __wi_count; @@ -718,25 +719,39 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } // namespace __impl -template +sycl::event +copy_if(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumCopiedRange&& __num_rng, + _UnaryPredicate __pred, _KernelParam __param = {}) +{ + auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRange>(__in_rng)); + auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRange>(__out_rng)); + auto __num_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRange>(__num_rng)); + + return __impl::single_pass_copy_if_impl(__queue, std::move(__in_view), std::move(__out_view), std::move(__num_view), + __pred, __param); +} + +template sycl::event copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, - _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) + _NumCopiedIterator __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { auto __n = __in_end - __in_begin; auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); - auto __buf1 = __keep1(__in_begin, __in_end); + auto __buf_in = __keep1(__in_begin, __in_end); auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); - auto __buf2 = __keep2(__out_begin, __out_begin + __n); + auto __buf_out = __keep2(__out_begin, __out_begin + __n); auto __keep_num = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + 
oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumCopiedIterator>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - return __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), __pred, - __param); + return __impl::single_pass_copy_if_impl(__queue, __buf_in.all_view(), __buf_out.all_view(), __buf_num.all_view(), + __pred, __param); } template From 517c34163d532783eeea3777db4dee7642b0f52f Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 14:25:23 -0400 Subject: [PATCH 115/134] removing unnecessary stuff Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 13 +- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 1 - .../hetero/dpcpp/parallel_backend_sycl_scan.h | 723 ------------------ include/oneapi/dpl/pstl/utils.h | 8 - .../numeric/numeric.ops/scan_kt.pass.cpp | 66 -- 5 files changed, 2 insertions(+), 809 deletions(-) delete mode 100644 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h delete mode 100644 test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 42f7fcc7f78..466fe8548d9 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -39,12 +39,6 @@ namespace __impl template class __copy_if_kernel; -template -class __copy_if_single_wg_kernel; - -template -class __inclusive_scan_kernel; - template class __lookback_init_kernel; @@ -420,7 +414,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r using _FlagType = __scan_status_flag<_Type>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - using _KernelName = __inclusive_scan_kernel; + using _KernelName = typename _KernelParam::kernel_name; using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __lookback_init_kernel<_KernelName, _Type, _BinaryOp>>; using _LookbackKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -656,9 +650,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ using _CopyIfKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__copy_if_kernel<_KernelName, _Type>>; - using _CopyIfSingleWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __copy_if_single_wg_kernel<_KernelName, _Type>>; - const std::size_t __n = __in_rng.size(); // Next power of 2 greater than or equal to __n @@ -670,7 +661,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ if (oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__queue, __n, __n_uniform)) { return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( - oneapi::dpl::execution::__dpl::make_device_policy<_CopyIfSingleWgKernel>(__queue), __n, + oneapi::dpl::execution::__dpl::make_device_policy<_KernelName>(__queue), __n, std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), std::forward<_NumSelectedRange>(__num_rng), __pred); } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 6edd2625080..39318972cb4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -46,7 +46,6 @@ #endif #include 
"sycl_traits.h" //SYCL traits specialization for some oneDPL types. -#include "parallel_backend_sycl_scan.h" namespace oneapi { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h deleted file mode 100644 index 8752c4baf0e..00000000000 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ /dev/null @@ -1,723 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// This file incorporates work covered by the following copyright and permission -// notice: -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// -//===----------------------------------------------------------------------===// - -#ifndef _ONEDPL_parallel_backend_sycl_scan_H -#define _ONEDPL_parallel_backend_sycl_scan_H - -#include -#include - -namespace oneapi::dpl::experimental::kt -{ - -inline namespace igpu { - -constexpr ::std::size_t SUBGROUP_SIZE = 32; - -template typename LookbackScanMemory, - typename TileId> -struct ScanMemoryManager -{ - using _TileIdT = typename TileId::_TileIdT; - using _LookbackScanMemory = LookbackScanMemory; - using _FlagT = typename _LookbackScanMemory::_FlagT; - - ScanMemoryManager(sycl::queue q) : q{q} {}; - - ::std::uint8_t* - scan_memory_ptr() noexcept - { - return scan_memory_begin; - }; - - _TileIdT* - tile_id_ptr() noexcept - { - return tile_id_begin; - }; - - void - allocate(::std::size_t num_wgs) - { - ::std::size_t scan_memory_size = _LookbackScanMemory::get_memory_size(num_wgs); - constexpr ::std::size_t padded_tileid_size = TileId::get_padded_memory_size(); - constexpr ::std::size_t tileid_size = TileId::get_memory_size(); - - auto mem_size_bytes = scan_memory_size + padded_tileid_size; - - scratch = sycl::malloc_device<::std::uint8_t>(mem_size_bytes, q); - - scan_memory_begin = scratch; - - void* base_tileid_ptr = reinterpret_cast(scan_memory_begin + scan_memory_size); - size_t remainder = mem_size_bytes - scan_memory_size; - - tile_id_begin = reinterpret_cast<_TileIdT*>( - ::std::align(::std::alignment_of_v<_TileIdT>, tileid_size, base_tileid_ptr, remainder)); - } - - sycl::event - async_free(sycl::event dependency) - { - return q.submit( - [e = dependency, ptr = scratch, q_ = q](sycl::handler& hdl) - { - hdl.depends_on(e); - hdl.host_task([=]() { sycl::free(ptr, q_); }); - }); - } - - void - free() - { - sycl::free(scratch, q); - } - - private: - ::std::uint8_t* scratch = nullptr; - ::std::uint8_t* scan_memory_begin = nullptr; - _TileIdT* tile_id_begin = nullptr; - - sycl::queue q; -}; - -template -struct LookbackScanMemory; - -template -struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::false_type> -{ - using _FlagT = ::std::uint32_t; - using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::acq_rel, sycl::memory_scope::device, - sycl::access::address_space::global_space>; - - static constexpr _FlagT NOT_READY = 0; - static constexpr _FlagT PARTIAL_MASK = 1; - static constexpr _FlagT FULL_MASK = 2; - static constexpr _FlagT OUT_OF_BOUNDS = 4; - - static constexpr ::std::size_t padding = SUBGROUP_SIZE; - - // LookbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] 
- // Each section has num_wgs + padding elements - LookbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) - : num_elements(get_num_elements(num_wgs)), tile_values_begin(reinterpret_cast<_T*>(scan_memory_begin)), - flags_begin(get_flags_begin(scan_memory_begin, num_wgs)) - { - } - - void - set_partial(::std::size_t tile_id, _T val) - { - _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); - - tile_values_begin[tile_id + padding] = val; - atomic_flag.store(PARTIAL_MASK); - } - - void - set_full(::std::size_t tile_id, _T val) - { - _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); - - tile_values_begin[tile_id + padding + num_elements] = val; - atomic_flag.store(FULL_MASK); - } - - _AtomicFlagRefT - get_flag(::std::size_t tile_id) const - { - return _AtomicFlagRefT(*(flags_begin + tile_id + padding)); - } - - _T - get_value(::std::size_t tile_id, _FlagT flag) const - { - // full_value and partial_value are num_elements apart - return *(tile_values_begin + tile_id + padding + num_elements * is_full(flag)); - } - - static ::std::size_t - get_tile_values_bytes(::std::size_t num_elements) - { - return (2 * num_elements) * sizeof(_T); - } - - static ::std::size_t - get_flag_bytes(::std::size_t num_elements) - { - return num_elements * sizeof(_FlagT); - } - - static ::std::size_t - get_padded_flag_bytes(::std::size_t num_elements) - { - // sizeof(_FlagT) extra bytes for possible intenal alignment - return get_flag_bytes(num_elements) + sizeof(_FlagT); - } - - static _FlagT* - get_flags_begin(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) - { - // Aligned flags - ::std::size_t num_elements = get_num_elements(num_wgs); - ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); - void* base_flags = reinterpret_cast(scan_memory_begin + tile_values_bytes); - auto remainder = get_padded_flag_bytes(num_elements); // scan_memory_bytes - tile_values_bytes - return reinterpret_cast<_FlagT*>( - ::std::align(::std::alignment_of_v<_FlagT>, get_flag_bytes(num_elements), base_flags, remainder)); - } - - static ::std::size_t - get_memory_size(::std::size_t num_wgs) - { - ::std::size_t num_elements = get_num_elements(num_wgs); - // sizeof(_T) extra bytes are not needed because LookbackScanMemory is going at the beginning of the scratch - ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); - // Padding to provide room for aligment - ::std::size_t flag_bytes = get_padded_flag_bytes(num_elements); - - return tile_values_bytes + flag_bytes; - } - - static ::std::size_t - get_num_elements(::std::size_t num_wgs) - { - return padding + num_wgs; - } - - static bool - is_ready(_FlagT flag) - { - return flag != NOT_READY; - } - - static bool - is_full(_FlagT flag) - { - return flag == FULL_MASK; - } - - static bool - is_out_of_bounds(_FlagT flag) - { - return flag == OUT_OF_BOUNDS; - } - - private: - ::std::size_t num_elements; - _FlagT* flags_begin; - _T* tile_values_begin; -}; - -template -struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::true_type> -{ - using _FlagT = ::std::uint64_t; - using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space>; - - // Each flag is divided in 2 32bit values - // 32..63 status bits - // 00..31 value bits - // Example: status = full scanned value, int value = 15: - // 1000 0000 0000 0000 0000 0000 0000 0000 | 0000 0000 0000 0000 0000 0000 0000 1111 - - // Status values: - // 00xxxx - not computed - // 
01xxxx - partial - // 10xxxx - full - // 110000 - out of bounds - - static constexpr _FlagT NOT_READY = 0; - static constexpr _FlagT PARTIAL_MASK = 1l << (sizeof(_FlagT) * 8 - 2); - static constexpr _FlagT FULL_MASK = 1l << (sizeof(_FlagT) * 8 - 1); - static constexpr _FlagT OUT_OF_BOUNDS = PARTIAL_MASK | FULL_MASK; - - static constexpr _FlagT VALUE_MASK = (1l << sizeof(::std::uint32_t) * 8) - 1; // 32 bit mask to store value - - static constexpr ::std::size_t padding = SUBGROUP_SIZE; - - LookbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) - : num_elements(get_num_elements(num_wgs)), flags_begin(get_flags_begin(scan_memory_begin, num_wgs)) - { - } - - void - set_partial(::std::size_t tile_id, _T val) - { - _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); - - atomic_flag.store(PARTIAL_MASK | static_cast<::std::uint32_t>(val)); - } - - void - set_full(::std::size_t tile_id, _T val) - { - _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); - - atomic_flag.store(FULL_MASK | static_cast<::std::uint32_t>(val)); - } - - _AtomicFlagRefT - get_flag(::std::size_t tile_id) const - { - return _AtomicFlagRefT(*(flags_begin + tile_id + padding)); - } - - _T - get_value(::std::size_t, _FlagT flag) const - { - return static_cast<_T>(flag & VALUE_MASK); - } - - static _FlagT* - get_flags_begin(::std::uint8_t* scan_memory_begin, ::std::size_t) - { - return reinterpret_cast<_FlagT*>(scan_memory_begin); - } - - static ::std::size_t - get_memory_size(::std::size_t num_wgs) - { - ::std::size_t num_elements = get_num_elements(num_wgs); - return num_elements * sizeof(_FlagT); - } - - static ::std::size_t - get_num_elements(::std::size_t num_wgs) - { - return padding + num_wgs; - } - - static bool - is_ready(_FlagT flag) - { - // flag & OUT_OF_BOUNDS != NOT_READY means it has either partial or full value, or is out of bounds - return (flag & OUT_OF_BOUNDS) != NOT_READY; - } - - static bool - is_full(_FlagT flag) - { - return (flag & OUT_OF_BOUNDS) == FULL_MASK; - } - - static bool - is_out_of_bounds(_FlagT flag) - { - return (flag & OUT_OF_BOUNDS) == OUT_OF_BOUNDS; - } - - private: - ::std::size_t num_elements; - _FlagT* flags_begin; -}; - -struct TileId -{ - using _TileIdT = ::std::uint32_t; - using _AtomicTileRefT = sycl::atomic_ref<_TileIdT, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space>; - - TileId(_TileIdT* tileid_memory) : tile_counter(*(tileid_memory)) {} - - constexpr static ::std::size_t - get_padded_memory_size() - { - // extra sizeof(_TileIdT) for possible aligment issues - return sizeof(_TileIdT) + sizeof(_TileIdT); - } - - constexpr static ::std::size_t - get_memory_size() - { - // extra sizeof(_TileIdT) for possible aligment issues - return sizeof(_TileIdT); - } - - _TileIdT - fetch_inc() - { - return tile_counter.fetch_add(1); - } - - _AtomicTileRefT tile_counter; -}; - -struct cooperative_lookback -{ - - template typename LookbackScanMemory, typename UseAtomic64> - _T - operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, - LookbackScanMemory<_T, UseAtomic64> memory) - { - using _LookbackScanMemory = LookbackScanMemory<_T, UseAtomic64>; - using FlagT = typename _LookbackScanMemory::_FlagT; - - _T sum = 0; - constexpr int offset = -1; - int local_id = subgroup.get_local_id(); - - for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) - { - auto atomic_flag = memory.get_flag(tile - local_id); // - FlagT flag; - do - { - flag = atomic_flag.load(); 
- } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag) || - (tile - local_id < 0))); // Loop till all ready - - bool is_full = _LookbackScanMemory::is_full(flag); - auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); - auto lowest_item_with_full = is_full_ballot.find_low(); - - // TODO: Use identity_fn for out of bounds values - _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) - ? memory.get_value(tile - local_id, flag) - : _T{0}; - - // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) - sum = bin_op(sum, contribution); - // If we found a full value, we can stop looking at previous tiles. Otherwise, - // keep going through tiles until we either find a full tile or we've completely - // recomputed the prefix using partial values - if (is_full_ballot.any()) - break; - - } - sum = sycl::reduce_over_group(subgroup, sum, bin_op); - - return sum; - } -}; - -template -void -single_pass_scan_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) -{ - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - - static_assert(std::is_same_v<_Inclusive, ::std::true_type>, "Single-pass scan only available for inclusive scan"); - - const ::std::size_t n = __in_rng.size(); - - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; - constexpr ::std::size_t num_workitems = wgsize; - - auto event = __queue.submit([&](sycl::handler& hdl) { - auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); - - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for( - sycl::nd_range<1>(num_workitems, wgsize), [= - ](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - ::std::uint32_t local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - auto subgroup = item.get_sub_group(); - - constexpr std::uint32_t tile_id = 0; - constexpr std::uint32_t wg_begin = 0; - constexpr std::uint32_t wg_end = elems_in_tile; - - std::uint32_t wg_local_memory_size = elems_in_tile; - - auto out_begin = __out_rng.begin(); - _Type carry = 0; - - // Global load into local - if (wg_end > n) - wg_local_memory_size = n; - - //TODO: assumes default ctor produces identity w.r.t. 
__binary_op - // _Type my_reducer{}; - if (wg_end <= n) - { -#pragma unroll - for (std::uint32_t step = 0; step < elems_per_workitem; ++step) - { - ::std::uint32_t i = stride * step; - _Type in_val = __in_rng[i + local_id]; - // my_reducer = __binary_op(my_reducer, in_val); - _Type out = sycl::inclusive_scan_over_group(group, in_val, __binary_op, carry); - out_begin[i + local_id] = out; - carry = group_broadcast(group, out, stride - 1); - } - } - else - { -#pragma unroll - for (std::uint32_t step = 0; step < elems_per_workitem; ++step) - { - ::std::uint32_t i = stride * step; - _Type in_val; - - if (i + local_id < n) - { - in_val = __in_rng[i + local_id]; - // my_reducer = __binary_op(my_reducer, in_val); - } - _Type out = sycl::inclusive_scan_over_group(group, in_val, __binary_op, carry); - if (i + local_id < n) - { - out_begin[i + local_id] = out; - } - carry = group_broadcast(group, out, stride - 1); - } - } - }); - }); - - event.wait(); -} - -template -void -single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) -{ - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _TileIdT = TileId::_TileIdT; - using _LookbackScanMemory = LookbackScanMemory<_Type, _UseAtomic64>; - using _FlagT = typename _LookbackScanMemory::_FlagT; - - static_assert(std::is_same_v<_Inclusive, ::std::true_type>, "Single-pass scan only available for inclusive scan"); - - const ::std::size_t n = __in_rng.size(); - - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; - ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); - ::std::size_t num_workitems = num_wgs * wgsize; - - ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); - scratch.allocate(num_wgs); - - // Memory Structure: - // [Lookback Scan Memory, Tile Id Counter] - auto scan_memory_begin = scratch.scan_memory_ptr(); - auto status_flags_begin = _LookbackScanMemory::get_flags_begin(scan_memory_begin, num_wgs); - auto tile_id_begin = scratch.tile_id_ptr(); - - ::std::size_t num_elements = _LookbackScanMemory::get_num_elements(num_wgs); - // fill_num_wgs num_elements + 1 to also initialize tile_id_counter - ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); - - auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); - - auto event = __queue.submit([&](sycl::handler& hdl) { - auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); - hdl.depends_on(fill_event); - - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for( - sycl::nd_range<1>(num_workitems, wgsize), [= - ](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - ::std::uint32_t local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - auto subgroup = item.get_sub_group(); - - std::uint32_t tile_id; - if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) - { - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = 
dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - tile_id = tile_id_lacc[0]; - } - else - { - tile_id = group.get_group_linear_id(); - } - - // Global load into local - auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_next_offset = ((tile_id + 1) * elems_in_tile); - auto wg_local_memory_size = elems_in_tile; - - if (wg_next_offset > n) - wg_local_memory_size = n - wg_current_offset; - //TODO: assumes default ctor produces identity w.r.t. __binary_op - _Type my_reducer{}; - if (wg_next_offset <= n) - { -#pragma unroll - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - { - _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; - my_reducer = __binary_op(my_reducer, in_val); - tile_vals[local_id + stride * i] = in_val; - } - } - else - { -#pragma unroll - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - { - if (wg_current_offset + local_id + stride * i < n) - { - _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; - my_reducer = __binary_op(my_reducer, in_val); - tile_vals[local_id + stride * i] = in_val; - } - } - } - - auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); - - auto in_begin = tile_vals.template get_multi_ptr().get(); - auto out_begin = __out_rng.begin() + wg_current_offset; - - _Type prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (subgroup.get_group_id() == 0) - { - _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); - } - - _Type carry = sycl::group_broadcast(group, prev_sum, 0); -// TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL -#pragma unroll - for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) - { - ::std::uint32_t i = stride * step; - _Type x; - if (i + local_id < wg_local_memory_size) - { - x = in_begin[i + local_id]; - } - _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); - if (i + local_id < wg_local_memory_size) - { - out_begin[i + local_id] = out; - } - carry = group_broadcast(group, out, stride - 1); - } - }); - }); - - scratch.async_free(event); - - event.wait(); -} - -template -void -single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, - _BinaryOp __binary_op) -{ - auto __n = __in_end - __in_begin; - - auto __keep1 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); - auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); - auto __buf2 = __keep2(__out_begin, __out_begin + __n); - - // Avoid aspect query overhead for sizeof(Types) > 32 bits - if constexpr (sizeof(typename std::iterator_traits<_InIterator>::value_type) <= sizeof(std::uint32_t)) - { - if (__queue.get_device().has(sycl::aspect::atomic64)) - { - single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::true_type, - /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), - __binary_op); - } - else - { - single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::false_type, - /* UseDynamicTileID */ 
std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), - __binary_op); - } - } - else - { - single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::false_type, - /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), - __binary_op); - } -} - -template -void -single_pass_single_wg_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, - _OutIterator __out_begin, _BinaryOp __binary_op) -{ - auto __n = __in_end - __in_begin; - - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); - auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); - auto __buf2 = __keep2(__out_begin, __out_begin + __n); - - // Avoid aspect query overhead for sizeof(Types) > 32 bits - single_pass_scan_impl_single_wg<_KernelParam, /* Inclusive */ std::true_type>(__queue, __buf1.all_view(), - __buf2.all_view(), __binary_op); -} - -template -void -single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, - _BinaryOp __binary_op) -{ - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; - auto __n = __in_end - __in_begin; - - if (__n <= elems_in_tile) - { - single_pass_single_wg_inclusive_scan<_KernelParam, /* Inclusive */ std::true_type>( - __queue, __in_begin, __in_end, __out_begin, __binary_op); - } - else - { - single_pass_inclusive_scan<_KernelParam, /* Inclusive */ std::true_type>(__queue, __in_begin, __in_end, - __out_begin, __binary_op); - } -} - -} // inline namespace igpu - -} // namespace oneapi::dpl::experimental::kt - -#endif /* _ONEDPL_parallel_backend_sycl_scan_H */ diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index c68e74e6ef7..e8bbde63c04 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -765,14 +765,6 @@ struct __is_iterator_type<_T, std::void_t::dif template static constexpr bool __is_iterator_type_v = __is_iterator_type<_T>::value; -//For use to lazily create objects values of type _Tp without requiring a default constructibility of _Tp -template -union __lazy_ctor_storage -{ - _Tp __v; - __lazy_ctor_storage() {} -}; - } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp deleted file mode 100644 index b3407581f37..00000000000 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ /dev/null @@ -1,66 +0,0 @@ -// -*- C++ -*- -//===-- scan.pass.cpp -----------------------------------------------------===// -// -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// This file incorporates work covered by the following copyright and permission -// notice: -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// -//===----------------------------------------------------------------------===// - -#include "support/test_config.h" - -#include _PSTL_TEST_HEADER(execution) -#include _PSTL_TEST_HEADER(numeric) - -int -main() -{ - bool all_passed = true; - sycl::queue q; - - for (int logn : {4, 8, 11, 16, 19, 21}) - { - std::cout << "Testing 2^" << logn << std::endl; - int n = 1 << logn; - std::vector v(n, 1); - int* in_ptr = sycl::malloc_device(n, q); - int* out_ptr = sycl::malloc_device(n, q); - - q.copy(v.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param<8, 128, class ScanKernel>; - oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - - std::vector tmp(n, 0); - q.copy(out_ptr, tmp.data(), n); - q.wait(); - - std::inclusive_scan(v.begin(), v.end(), v.begin()); - - bool passed = true; - for (size_t i = 0; i < n; ++i) - { - if (tmp[i] != v[i]) - { - passed = false; - std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; - } - } - - if (passed) - std::cout << " passed" << std::endl; - else - std::cout << " failed" << std::endl; - - all_passed &= passed; - sycl::free(in_ptr, q); - sycl::free(out_ptr, q); - } - - return !all_passed; -} From 427a5f462ac5a97bc82e84bf7a7893bbe4031410 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 14:31:11 -0400 Subject: [PATCH 116/134] naming consistency Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 72 +++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 466fe8548d9..bd7cd8a9cfb 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -282,15 +282,15 @@ __lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlag } template struct __lookback_kernel_func { using _FlagStorageType = typename _FlagType::_FlagStorageType; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; - _InRange __in_rng; - _OutRange __out_rng; + _InRng __in_rng; + _OutRng __out_rng; _BinaryOp __binary_op; std::size_t __n; _StatusFlags __status_flags; @@ -378,17 +378,17 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event - operator()(sycl::queue __q, sycl::event __prev_event, _InRange&& __in_rng, _OutRange&& __out_rng, + operator()(sycl::queue __q, sycl::event __prev_event, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items) const { using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = - __lookback_kernel_func<__data_per_workitem, __workgroup_size, _Type, _FlagType, std::decay_t<_InRange>, - std::decay_t<_OutRange>, std::decay_t<_BinaryOp>, std::decay_t<_StatusFlags>, + __lookback_kernel_func<__data_per_workitem, __workgroup_size, _Type, _FlagType, std::decay_t<_InRng>, + std::decay_t<_OutRng>, std::decay_t<_BinaryOp>, std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, std::decay_t<_LocalAccessorType>>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ 
-406,11 +406,11 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ } }; -template +template sycl::event -__single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, _KernelParam) +__single_pass_scan(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, _KernelParam) { - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _FlagType = __scan_status_flag<_Type>; using _FlagStorageType = typename _FlagType::_FlagStorageType; @@ -443,7 +443,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::execution::__dpl::make_device_policy(__queue), - std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), __n, + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, oneapi::dpl::__internal::__no_op{}, unseq_backend::__no_init_value<_Type>{}, __binary_op, std::true_type{}); } @@ -493,19 +493,19 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } -template struct __copy_if_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; using _SizeT = std::size_t; using _BinaryOp = std::plus<_SizeT>; - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - _InRange __in_rng; - _OutRange __out_rng; + _InRng __in_rng; + _OutRng __out_rng; _NumRng __num_rng; _SizeT __n; _UnaryPredicate __pred; @@ -598,20 +598,20 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template sycl::event - operator()(sycl::queue __q, _Event __fill_event, _InRange&& __in_rng, _OutRange&& __out_rng, - _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, + operator()(sycl::queue __q, _Event __fill_event, _InRng&& __in_rng, _OutRng&& __out_rng, + _NumCopiedRng&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, std::size_t __status_flags_size, _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items, std::size_t __current_num_wgs) const { - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = - __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRange>, - std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, + __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRng>, + std::decay_t<_OutRng>, std::decay_t<_NumCopiedRng>, _UnaryPredicate, std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, std::decay_t<_LocalAccessorType>>; @@ -631,15 +631,15 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, } }; -template sycl::event -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, +single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng 
__num_rng, _UnaryPredicate __pred, _KernelParam) { using _SizeT = uint64_t; using _KernelName = __copy_if_kernel; - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _FlagType = __scan_status_flag<_SizeT>; using _BinaryOp = std::plus<_SizeT>; @@ -662,8 +662,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ { return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( oneapi::dpl::execution::__dpl::make_device_policy<_KernelName>(__queue), __n, - std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), - std::forward<_NumSelectedRange>(__num_rng), __pred); + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), + std::forward<_NumCopiedRng>(__num_rng), __pred); } } constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; @@ -710,15 +710,15 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } // namespace __impl -template sycl::event -copy_if(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumCopiedRange&& __num_rng, +copy_if(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_rng, _UnaryPredicate __pred, _KernelParam __param = {}) { - auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRange>(__in_rng)); - auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRange>(__out_rng)); - auto __num_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRange>(__num_rng)); + auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRng>(__in_rng)); + auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__out_rng)); + auto __num_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__num_rng)); return __impl::single_pass_copy_if_impl(__queue, std::move(__in_view), std::move(__out_view), std::move(__num_view), __pred, __param); @@ -745,13 +745,13 @@ copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutI __pred, __param); } -template +template sycl::event -inclusive_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, +inclusive_scan(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, _KernelParam __param = {}) { - auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRange>(__in_rng)); - auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRange>(__out_rng)); + auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRng>(__in_rng)); + auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__out_rng)); return __impl::__single_pass_scan(__queue, std::move(__in_view), std::move(__out_view), __binary_op, __param); } From e9091e1c85d3e71cbd25572cae58a79f4f9ca3b3 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 14:33:34 -0400 Subject: [PATCH 117/134] formatting Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index bd7cd8a9cfb..b735f1e77fe 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -380,8 +380,8 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ template sycl::event - 
operator()(sycl::queue __q, sycl::event __prev_event, _InRng&& __in_rng, _OutRng&& __out_rng, - _BinaryOp __binary_op, std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, + operator()(sycl::queue __q, sycl::event __prev_event, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, + std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items) const { @@ -598,14 +598,13 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event - operator()(sycl::queue __q, _Event __fill_event, _InRng&& __in_rng, _OutRng&& __out_rng, - _NumCopiedRng&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, - std::size_t __status_flags_size, _StatusValues&& __status_vals_full, - _StatusValues&& __status_vals_partial, std::size_t __current_num_items, - std::size_t __current_num_wgs) const + operator()(sycl::queue __q, _Event __fill_event, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_rng, + std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, std::size_t __status_flags_size, + _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, + std::size_t __current_num_items, std::size_t __current_num_wgs) const { using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; @@ -631,8 +630,7 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, } }; -template +template sycl::event single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng __num_rng, _UnaryPredicate __pred, _KernelParam) @@ -710,11 +708,10 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out } // namespace __impl -template +template sycl::event -copy_if(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_rng, - _UnaryPredicate __pred, _KernelParam __param = {}) +copy_if(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_rng, _UnaryPredicate __pred, + _KernelParam __param = {}) { auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRng>(__in_rng)); auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__out_rng)); From 7315d2bc4720b7b8d4aebe1ffb97d0d20fe95fae Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 14:34:57 -0400 Subject: [PATCH 118/134] reverting overreach Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index b735f1e77fe..5204f5d4f7f 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -406,11 +406,11 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ } }; -template +template sycl::event -__single_pass_scan(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, _KernelParam) +__single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, _KernelParam) { - using _Type = 
oneapi::dpl::__internal::__value_t<_InRng>; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _FlagType = __scan_status_flag<_Type>; using _FlagStorageType = typename _FlagType::_FlagStorageType; From 0983043aba315f0743982f0fe9c2f492afd6fdb5 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 5 Jun 2024 14:03:32 -0400 Subject: [PATCH 119/134] upgrading tests to match scan, cmake Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 4 +- test/kt/CMakeLists.txt | 47 ++-- test/kt/single_pass_copy_if.cpp | 266 ++++++++++++++++++ test/kt/single_pass_scan.cpp | 4 +- 4 files changed, 298 insertions(+), 23 deletions(-) create mode 100644 test/kt/single_pass_copy_if.cpp diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 5204f5d4f7f..bc04d1da4dd 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -443,7 +443,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::execution::__dpl::make_device_policy(__queue), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, + std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), __n, oneapi::dpl::__internal::__no_op{}, unseq_backend::__no_init_value<_Type>{}, __binary_op, std::true_type{}); } @@ -715,7 +715,7 @@ copy_if(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedR { auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRng>(__in_rng)); auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__out_rng)); - auto __num_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__num_rng)); + auto __num_view = oneapi::dpl::__ranges::views::all(std::forward<_NumCopiedRng>(__num_rng)); return __impl::single_pass_copy_if_impl(__queue, std::move(__in_view), std::move(__out_view), std::move(__num_view), __pred, __param); diff --git a/test/kt/CMakeLists.txt b/test/kt/CMakeLists.txt index 3e312108819..3159cfdae7e 100644 --- a/test/kt/CMakeLists.txt +++ b/test/kt/CMakeLists.txt @@ -130,50 +130,57 @@ if (ONEDPL_TEST_ENABLE_KT_ESIMD) _generate_esimd_sort_test("esimd_radix_sort" "256" "32" "double" "" 1000) # segfault endif() -function (_generate_gpu_scan_test _data_per_work_item _work_group_size _type) - - if ((NOT TARGET build-scan-kt-tests) AND (NOT TARGET run-scan-kt-tests)) - add_custom_target(build-scan-kt-tests COMMENT "Build all scan kernel template tests") - add_custom_target(run-scan-kt-tests - COMMAND "${CMAKE_CTEST_COMMAND}" -R "^run-scan-kt-tests$" --output-on-failure --no-label-summary - DEPENDS build-scan-kt-tests - COMMENT "Build and run all scan kernel template tests") +function (_generate_gpu_single_pass_test _alg _data_per_work_item _work_group_size _type _single_wg_optout) + + if ((NOT TARGET "build-${_alg}-kt-tests") AND (NOT TARGET "run-${_alg}-kt-tests")) + add_custom_target("build-${_alg}-kt-tests" COMMENT "Build all ${_alg} kernel template tests") + add_custom_target("run-${_alg}-kt-tests" + COMMAND "${CMAKE_CTEST_COMMAND}" -R "^run-${_alg}-kt-tests$" --output-on-failure --no-label-summary + DEPENDS "build-${_alg}-kt-tests" + COMMENT "Build and run all ${_alg} kernel template tests") endif() string(REPLACE "_t" "" _type_short ${_type}) - set(_target_name 
"single_pass_scan_dpwi${_data_per_work_item}_wgs${_work_group_size}_${_type_short}") - set(_test_path "single_pass_scan.cpp") + set(_target_name "single_pass_${_alg}_dpwi${_data_per_work_item}_wgs${_work_group_size}_${_type_short}_${_single_wg_optout}") + set(_test_path "single_pass_${_alg}.cpp") #_generate_test_randomly(${_target_name} ${_test_path} ${_probability_permille}) _generate_test(${_target_name} ${_test_path}) if(TARGET ${_target_name}) - add_dependencies(build-scan-kt-tests ${_target_name}) - add_dependencies(run-scan-kt-tests ${_target_name}) + add_dependencies("build-${_alg}-kt-tests" ${_target_name}) + add_dependencies("run-${_alg}-kt-tests" ${_target_name}) target_compile_definitions(${_target_name} PRIVATE TEST_DATA_PER_WORK_ITEM=${_data_per_work_item}) target_compile_definitions(${_target_name} PRIVATE TEST_WORK_GROUP_SIZE=${_work_group_size}) + target_compile_definitions(${_target_name} PRIVATE TEST_SINGLE_WG_OPTOUT=${_single_wg_optout}) target_compile_definitions(${_target_name} PRIVATE TEST_TYPE=${_type}) endif() endfunction() -function(_generate_gpu_scan_tests) +function(_generate_gpu_single_pass_tests) + set(_alg_all "scan" "copy_if") set(_data_per_work_item_all "1" "2" "4" "8" "16" "32") set(_work_group_size_all "64" "128" "256" "512" "1024") set(_type_all "uint32_t" "int32_t" "float" "int64_t" "uint64_t" "double") - foreach (_data_per_work_item ${_data_per_work_item_all}) - foreach (_work_group_size ${_work_group_size_all}) - foreach (_type ${_type_all}) - _generate_gpu_scan_test(${_data_per_work_item} ${_work_group_size} ${_type}) + foreach (_alg ${_alg_all}) + foreach (_data_per_work_item ${_data_per_work_item_all}) + foreach (_work_group_size ${_work_group_size_all}) + foreach (_type ${_type_all}) + _generate_gpu_single_pass_test(${_alg} ${_data_per_work_item} ${_work_group_size} ${_type} "false") + endforeach() endforeach() endforeach() + # to not double the number of tests, check single wg output with a single test per alg + _generate_gpu_single_pass_test(${_alg} "8" "512" "float" "true") + + _generate_test("single_pass_${_alg}" "single_pass_${_alg}.cpp") + target_compile_definitions("single_pass_${_alg}" PRIVATE TEST_DATA_PER_WORK_ITEM=8 TEST_WORK_GROUP_SIZE=256 TEST_TYPE=uint32_t TEST_SINGLE_WG_OPTOUT=false) endforeach() - _generate_test("single_pass_scan" "single_pass_scan.cpp") - target_compile_definitions("single_pass_scan" PRIVATE TEST_DATA_PER_WORK_ITEM=8 TEST_WORK_GROUP_SIZE=256 TEST_TYPE=uint32_t) endfunction() if (ONEDPL_TEST_ENABLE_KT_SYCL) - _generate_gpu_scan_tests() + _generate_gpu_single_pass_tests() endif() diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp new file mode 100644 index 00000000000..86449355ac2 --- /dev/null +++ b/test/kt/single_pass_copy_if.cpp @@ -0,0 +1,266 @@ +// -*- C++ -*- +//===-- single_pass_copy_if.cpp -------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#include "../support/test_config.h" + +#include + +#if LOG_TEST_INFO +# include +#endif + +#if _ENABLE_RANGES_TESTING +# include +#endif + +#include "../support/utils.h" +#include "../support/sycl_alloc_utils.h" +#include "../support/scan_serial_impl.h" + +#include "esimd_radix_sort_utils.h" + +#include +#include +#include +#include +#include +#include + +inline const std::vector copy_if_sizes = { + 1, 6, 16, 43, 256, 316, 2048, + 5072, 8192, 14001, 1 << 14, (1 << 14) + 1, 50000, 67543, + 100'000, 1 << 17, 179'581, 250'000, 1 << 18, (1 << 18) + 1, 500'000, + 888'235, 1'000'000, 1 << 20, 10'000'000}; + +template +struct __less_than_val +{ + const T __val; + __less_than_val() : __val{std::is_signed_v ? 0 : std::numeric_limits::max()/T{2}} + { + } + __less_than_val(const T& __v) : __val{__v} + { + } + bool + operator()(const T& __v) const + { + return __v < __val; + } +}; + +template +auto +generate_copy_if_data(T* input, std::size_t size, std::uint32_t seed) +{ + // Integer numbers are generated even for floating point types in order to avoid rounding errors, + // and simplify the final check + using substitute_t = std::conditional_t, std::int64_t, std::uint64_t>; + + std::default_random_engine gen{seed}; + std::uniform_int_distribution dist(std::numeric_limits::lowest(), std::numeric_limits::max()); + std::generate(input, input + size, [&] { return dist(gen); }); +} + +#if _ENABLE_RANGES_TESTING +template +void +test_all_view(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) +{ +# if LOG_TEST_INFO + std::cout << "\ttest_all_view(" << size << ") : " << TypeInfo().name() << std::endl; +# endif + std::vector input(size); + generate_copy_if_data(input.data(), size, 42); + std::vector ref(input); + std::vector out(size); + sycl::buffer buf_out(input.size()); + std::size_t num_copied = 0; + sycl::buffer buf_num_copied(&num_copied, 1); + auto out_end = std::copy_if(std::begin(ref), std::end(ref), std::begin(out), pred); + std::size_t num_copied_ref = out_end - std::begin(out); + { + sycl::buffer buf(input.data(), input.size()); + + oneapi::dpl::experimental::ranges::all_view view(buf); + oneapi::dpl::experimental::ranges::all_view view_out(buf_out); + oneapi::dpl::experimental::ranges::all_view view_num_copied(buf_num_copied); + oneapi::dpl::experimental::kt::gpu::copy_if(q, view, view_out, view_num_copied, pred, param).wait(); + } + + auto acc = buf_out.get_host_access(); + auto num_copied_acc = buf_num_copied.get_host_access(); + + std::string msg1 = "wrong num copied with all_view, n: " + std::to_string(size); + EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); + std::string msg2 = "wrong results with all_view, n: " + std::to_string(size); + EXPECT_EQ_RANGES(ref, acc, msg2.c_str()); +} + +template +void +test_buffer(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) +{ +# if LOG_TEST_INFO + std::cout << "\ttest_buffer(" << size << ") : " << TypeInfo().name() << std::endl; +# endif + std::vector input(size); + generate_copy_if_data(input.data(), size, 42); + std::vector ref(input); + std::vector out_ref(size); + sycl::buffer buf_out(size); + std::size_t num_copied = 0; + sycl::buffer buf_num_copied(&num_copied, 1); + auto out_end = std::copy_if(std::begin(ref), std::end(ref), std::begin(out_ref), pred); + std::size_t num_copied_ref = out_end - std::begin(out_ref); + { + sycl::buffer buf(input.data(), input.size()); + + 
oneapi::dpl::experimental::kt::gpu::copy_if(q, buf, buf_out, buf_num_copied, pred, param).wait(); + } + + auto acc = buf_out.get_host_access(); + auto num_copied_acc = buf_num_copied.get_host_access(); + + std::string msg1 = "wrong num copied with buffer, n: " + std::to_string(size); + EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); + std::string msg2 = "wrong results with buffer, n: " + std::to_string(size); + EXPECT_EQ_RANGES(ref, acc, msg2.c_str()); + +} +#endif + +template +void +test_usm(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) +{ +#if LOG_TEST_INFO + std::cout << "\t\ttest_usm<" << TypeInfo().name() << ", " << USMAllocPresentation().name<_alloc_type>() << ">(" + << size << ");" << std::endl; +#endif + std::vector expected(size); + generate_copy_if_data(expected.data(), size, 42); + std::vector out_ref(size); + + TestUtils::usm_data_transfer<_alloc_type, T> dt_input(q, expected.begin(), expected.end()); + TestUtils::usm_data_transfer<_alloc_type, T> dt_output(q, size); + TestUtils::usm_data_transfer<_alloc_type, std::size_t> dt_num_copied(q, 1); + + std::size_t num_copied = 0; + auto out_end = std::copy_if(std::begin(expected), std::end(expected), std::begin(out_ref), pred); + std::size_t num_copied_ref = out_end - std::begin(out_ref); + + oneapi::dpl::experimental::kt::gpu::copy_if(q, dt_input.get_data(), dt_input.get_data() + size, + dt_output.get_data(), dt_num_copied.get_data(), pred, param) + .wait(); + + std::vector actual(size); + dt_output.retrieve_data(actual.begin()); + std::vector num_copied_host(1); + dt_num_copied.retrieve_data(num_copied_host.begin()); + + std::string msg1 = "wrong num copied with USM, n: " + std::to_string(size); + EXPECT_EQ(num_copied_ref, num_copied_host[0], msg1.c_str()); + std::string msg2 = "wrong results with USM, n: " + std::to_string(size); + EXPECT_EQ_N(expected.begin(), actual.begin(), size, msg2.c_str()); +} + +/////////////////// + +template +void +test_sycl_iterators(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) +{ +#if LOG_TEST_INFO + std::cout << "\t\ttest_sycl_iterators<" << TypeInfo().name() << ">(" << size << ");" << std::endl; +#endif + std::vector input(size); + std::vector output(size); + generate_copy_if_data(input.data(), size, 42); + std::vector ref(input); + std::vector out_ref(size); + std::size_t num_copied = 0; + auto out_end = std::copy_if(std::begin(ref), std::end(ref), std::begin(out_ref), pred); + std::size_t num_copied_ref = out_end - std::begin(out_ref); + { + sycl::buffer buf(input.data(), input.size()); + sycl::buffer buf_out(output.data(), output.size()); + sycl::buffer buf_num(&num_copied, 1); + oneapi::dpl::experimental::kt::gpu::copy_if(q, oneapi::dpl::begin(buf), oneapi::dpl::end(buf), + oneapi::dpl::begin(buf_out), oneapi::dpl::begin(buf_num), pred, + param) + .wait(); + } + + std::string msg1 = "wrong num copied with oneapi::dpl::begin/end, n: " + std::to_string(size); + EXPECT_EQ(num_copied_ref, num_copied, msg1.c_str()); + std::string msg2 = "wrong results with oneapi::dpl::begin/end, n: " + std::to_string(size); + EXPECT_EQ_RANGES(ref, output, msg2.c_str()); +} + +template +void +test_general_cases(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) +{ + test_usm(q, size, pred, TestUtils::get_new_kernel_params<0>(param)); + test_usm(q, size, pred, TestUtils::get_new_kernel_params<1>(param)); + test_sycl_iterators(q, size, pred, TestUtils::get_new_kernel_params<2>(param)); +#if _ENABLE_RANGES_TESTING + test_all_view(q, size, pred, 
TestUtils::get_new_kernel_params<3>(param)); + test_buffer(q, size, pred, TestUtils::get_new_kernel_params<4>(param)); +#endif +} + +template +void +test_all_cases(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) +{ + test_general_cases(q, size, pred, TestUtils::get_new_kernel_params<0>(param)); + +} + +int +main() +{ +#if LOG_TEST_INFO + std::cout << "TEST_DATA_PER_WORK_ITEM : " << TEST_DATA_PER_WORK_ITEM << "\n" + << "TEST_WORK_GROUP_SIZE : " << TEST_WORK_GROUP_SIZE << "\n" + << "TEST_SINGLE_WG_OPTOUT : " << TEST_SINGLE_WG_OPTOUT << "\n" + << "TEST_TYPE : " << TypeInfo().name() << std::endl; +#endif + + constexpr oneapi::dpl::experimental::kt::kernel_param> params; + auto q = TestUtils::get_test_queue(); + bool run_test = can_run_test(q, params); + + auto __predicate = __less_than_val{}; + if (run_test) + { + + try + { + for (auto size : copy_if_sizes) + test_all_cases(q, size, __predicate, params); + } + catch (const std::exception& exc) + { + std::cerr << "Exception: " << exc.what() << std::endl; + return EXIT_FAILURE; + } + } + + return TestUtils::done(run_test); +} diff --git a/test/kt/single_pass_scan.cpp b/test/kt/single_pass_scan.cpp index 860db88d2b3..a92b451cdcf 100644 --- a/test/kt/single_pass_scan.cpp +++ b/test/kt/single_pass_scan.cpp @@ -206,10 +206,12 @@ main() #if LOG_TEST_INFO std::cout << "TEST_DATA_PER_WORK_ITEM : " << TEST_DATA_PER_WORK_ITEM << "\n" << "TEST_WORK_GROUP_SIZE : " << TEST_WORK_GROUP_SIZE << "\n" + << "TEST_SINGLE_WG_OPTOUT : " << TEST_SINGLE_WG_OPTOUT << "\n" << "TEST_TYPE : " << TypeInfo().name() << std::endl; #endif - constexpr oneapi::dpl::experimental::kt::kernel_param params; + constexpr oneapi::dpl::experimental::kt::kernel_param> params; auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); From e09ccaebf7957ea22265482a8e0b437fc875abcd Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 5 Jun 2024 14:34:34 -0400 Subject: [PATCH 120/134] test bugfix Signed-off-by: Dan Hoeflinger --- test/kt/single_pass_copy_if.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 86449355ac2..2fdd04e0c32 100644 --- a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -85,12 +85,12 @@ test_all_view(sycl::queue q, std::size_t size, Predicate pred, KernelParam param std::vector input(size); generate_copy_if_data(input.data(), size, 42); std::vector ref(input); - std::vector out(size); + std::vector out_ref(size); sycl::buffer buf_out(input.size()); std::size_t num_copied = 0; sycl::buffer buf_num_copied(&num_copied, 1); - auto out_end = std::copy_if(std::begin(ref), std::end(ref), std::begin(out), pred); - std::size_t num_copied_ref = out_end - std::begin(out); + auto out_end = std::copy_if(std::begin(ref), std::end(ref), std::begin(out_ref), pred); + std::size_t num_copied_ref = out_end - std::begin(out_ref); { sycl::buffer buf(input.data(), input.size()); @@ -106,7 +106,7 @@ test_all_view(sycl::queue q, std::size_t size, Predicate pred, KernelParam param std::string msg1 = "wrong num copied with all_view, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); std::string msg2 = "wrong results with all_view, n: " + std::to_string(size); - EXPECT_EQ_RANGES(ref, acc, msg2.c_str()); + EXPECT_EQ_RANGES(out_ref, acc, msg2.c_str()); } template @@ -137,7 +137,7 @@ test_buffer(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) std::string msg1 = 
"wrong num copied with buffer, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); std::string msg2 = "wrong results with buffer, n: " + std::to_string(size); - EXPECT_EQ_RANGES(ref, acc, msg2.c_str()); + EXPECT_EQ_RANGES(out_ref, acc, msg2.c_str()); } #endif @@ -150,16 +150,16 @@ test_usm(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) std::cout << "\t\ttest_usm<" << TypeInfo().name() << ", " << USMAllocPresentation().name<_alloc_type>() << ">(" << size << ");" << std::endl; #endif - std::vector expected(size); - generate_copy_if_data(expected.data(), size, 42); + std::vector in_ref(size); + generate_copy_if_data(in_ref.data(), size, 42); std::vector out_ref(size); - TestUtils::usm_data_transfer<_alloc_type, T> dt_input(q, expected.begin(), expected.end()); + TestUtils::usm_data_transfer<_alloc_type, T> dt_input(q, in_ref.begin(), in_ref.end()); TestUtils::usm_data_transfer<_alloc_type, T> dt_output(q, size); TestUtils::usm_data_transfer<_alloc_type, std::size_t> dt_num_copied(q, 1); std::size_t num_copied = 0; - auto out_end = std::copy_if(std::begin(expected), std::end(expected), std::begin(out_ref), pred); + auto out_end = std::copy_if(std::begin(in_ref), std::end(in_ref), std::begin(out_ref), pred); std::size_t num_copied_ref = out_end - std::begin(out_ref); oneapi::dpl::experimental::kt::gpu::copy_if(q, dt_input.get_data(), dt_input.get_data() + size, @@ -174,7 +174,7 @@ test_usm(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) std::string msg1 = "wrong num copied with USM, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_host[0], msg1.c_str()); std::string msg2 = "wrong results with USM, n: " + std::to_string(size); - EXPECT_EQ_N(expected.begin(), actual.begin(), size, msg2.c_str()); + EXPECT_EQ_N(out_ref.begin(), actual.begin(), size, msg2.c_str()); } /////////////////// @@ -207,7 +207,7 @@ test_sycl_iterators(sycl::queue q, std::size_t size, Predicate pred, KernelParam std::string msg1 = "wrong num copied with oneapi::dpl::begin/end, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied, msg1.c_str()); std::string msg2 = "wrong results with oneapi::dpl::begin/end, n: " + std::to_string(size); - EXPECT_EQ_RANGES(ref, output, msg2.c_str()); + EXPECT_EQ_RANGES(out_ref, output, msg2.c_str()); } template From 96acd30ca2456ff589e6d9a8d78aa4bd1d2f2472 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 5 Jun 2024 16:57:50 -0400 Subject: [PATCH 121/134] bugfix for non-full case Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index bc04d1da4dd..85192b3bd92 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -558,8 +558,8 @@ struct __copy_if_kernel_func else { // Edge of input, have to handle memory bounds - for (std::uint16_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; - ++__i) + std::uint16_t __end = std::min(std::size_t{__data_per_workitem}, __n - __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id); + for (std::uint16_t __i = 0; __i < __end; ++__i) { _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; From f15c759fb197df30ce228ee911425a08f48d97e9 Mon Sep 17 00:00:00 2001 From: Dan 
Hoeflinger Date: Wed, 5 Jun 2024 14:05:29 -0700 Subject: [PATCH 122/134] fix range to check --- test/kt/single_pass_copy_if.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 2fdd04e0c32..857343c9ef7 100644 --- a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -106,7 +106,7 @@ test_all_view(sycl::queue q, std::size_t size, Predicate pred, KernelParam param std::string msg1 = "wrong num copied with all_view, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); std::string msg2 = "wrong results with all_view, n: " + std::to_string(size); - EXPECT_EQ_RANGES(out_ref, acc, msg2.c_str()); + EXPECT_EQ_N(out_ref.begin(), acc.begin(), num_copied_ref, msg2.c_str()); } template @@ -137,7 +137,7 @@ test_buffer(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) std::string msg1 = "wrong num copied with buffer, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); std::string msg2 = "wrong results with buffer, n: " + std::to_string(size); - EXPECT_EQ_RANGES(out_ref, acc, msg2.c_str()); + EXPECT_EQ_N(out_ref.begin(), acc.begin(), num_copied_ref, msg2.c_str()); } #endif @@ -174,11 +174,9 @@ test_usm(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) std::string msg1 = "wrong num copied with USM, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_host[0], msg1.c_str()); std::string msg2 = "wrong results with USM, n: " + std::to_string(size); - EXPECT_EQ_N(out_ref.begin(), actual.begin(), size, msg2.c_str()); + EXPECT_EQ_N(out_ref.begin(), actual.begin(), num_copied_ref, msg2.c_str()); } -/////////////////// - template void test_sycl_iterators(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) @@ -207,7 +205,7 @@ test_sycl_iterators(sycl::queue q, std::size_t size, Predicate pred, KernelParam std::string msg1 = "wrong num copied with oneapi::dpl::begin/end, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied, msg1.c_str()); std::string msg2 = "wrong results with oneapi::dpl::begin/end, n: " + std::to_string(size); - EXPECT_EQ_RANGES(out_ref, output, msg2.c_str()); + EXPECT_EQ_N(out_ref.begin(), output.begin(), num_copied_ref, msg2.c_str()); } template From b195e4d5d5e6c3eb17688f5a3eb87169f8abefff Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 5 Jun 2024 17:26:13 -0400 Subject: [PATCH 123/134] adjust data generation Signed-off-by: Dan Hoeflinger --- test/kt/single_pass_copy_if.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 857343c9ef7..88311e741ad 100644 --- a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -48,12 +48,6 @@ template struct __less_than_val { const T __val; - __less_than_val() : __val{std::is_signed_v ? 0 : std::numeric_limits::max()/T{2}} - { - } - __less_than_val(const T& __v) : __val{__v} - { - } bool operator()(const T& __v) const { @@ -70,7 +64,9 @@ generate_copy_if_data(T* input, std::size_t size, std::uint32_t seed) using substitute_t = std::conditional_t, std::int64_t, std::uint64_t>; std::default_random_engine gen{seed}; - std::uniform_int_distribution dist(std::numeric_limits::lowest(), std::numeric_limits::max()); + substitute_t start = std::is_signed_v ? -10 : 0; + substitute_t end = std::is_signed_v ? 
10 : 20; + std::uniform_int_distribution dist(start, end); std::generate(input, input + size, [&] { return dist(gen); }); } @@ -244,7 +240,7 @@ main() auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); - auto __predicate = __less_than_val{}; + auto __predicate = __less_than_val{std::is_signed_v ? TEST_TYPE{0} : TEST_TYPE{10}}; if (run_test) { From 4528cbbe54c3a53d3e43e8c1d19c7e49ac6446fb Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 5 Jun 2024 17:33:50 -0400 Subject: [PATCH 124/134] better fix for non-full case Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 85192b3bd92..7848fc1fd8f 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -558,15 +558,18 @@ struct __copy_if_kernel_func else { // Edge of input, have to handle memory bounds - std::uint16_t __end = std::min(std::size_t{__data_per_workitem}, __n - __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id); - for (std::uint16_t __i = 0; __i < __end; ++__i) +#pragma unroll + for (std::uint16_t __i = 0; __i < __data_per_workitem; ++__i) { - _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; - - if (__pred(__val)) + if (__i + (__wg_local_id) * __data_per_workitem + __elems_in_tile * __tile_id < __n) { - __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem] = __val; - ++__wi_count; + _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; + + if (__pred(__val)) + { + __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem] = __val; + ++__wi_count; + } } } } From 831f9c9860a847f35c635248f4f0ede5fc1added Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 5 Jun 2024 17:46:06 -0400 Subject: [PATCH 125/134] removing old test Signed-off-by: Dan Hoeflinger --- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 104 ------------------ 1 file changed, 104 deletions(-) delete mode 100644 test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp deleted file mode 100644 index a46b76a3be2..00000000000 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// -*- C++ -*- -//===-- scan.pass.cpp -----------------------------------------------------===// -// -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// This file incorporates work covered by the following copyright and permission -// notice: -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// -//===----------------------------------------------------------------------===// - -#include "support/test_config.h" -#include "support/utils.h" - -#include -#include _PSTL_TEST_HEADER(execution) -#include _PSTL_TEST_HEADER(numeric) - -using namespace TestUtils; - -template -bool -test(Predicate pred, Generator gen, KernelParam param) -{ - bool all_passed = true; - sycl::queue q; - - for (int logn : {4, 8, 10, 12, 14, 15, 18}) - { - int n = 1 << logn; - - Sequence in(n, [&](size_t k) -> T { return gen(n ^ k); }); - - Sequence std_out(n); - - T* in_ptr = sycl::malloc_device(n, q); - T* out_ptr = sycl::malloc_device(n, q); - size_t* out_num = sycl::malloc_device(1, q); - - q.copy(in.data(), in_ptr, n).wait(); - oneapi::dpl::experimental::kt::gpu::copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred, param).wait(); - - Sequence kt_out(n); - size_t num_selected = 0; - q.copy(out_ptr, kt_out.data(), n); - q.copy(out_num, &num_selected, 1); - q.wait(); - - auto std_out_end = std::copy_if(in.begin(), in.end(), std_out.begin(), pred); - - bool passed = true; - if (num_selected != (std_out_end - std_out.begin())) - { - passed = false; - std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected - << "\n"; - } - - for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) - { - if (kt_out[i] != std_out[i]) - { - passed = false; - std::cout << "expected " << i << ' ' << std_out[i] << ' ' << kt_out[i] << '\n'; - } - } - - if (passed) - std::cout << " passed" << std::endl; - else - std::cout << " failed" << std::endl; - - all_passed &= passed; - sycl::free(in_ptr, q); - sycl::free(out_ptr, q); - sycl::free(out_num, q); - } - - return all_passed; -} - -int -main() -{ - bool all_passed = true; - constexpr int n_elements_per_workitem = 8; - - auto param = oneapi::dpl::experimental::kt::kernel_param{}; - all_passed &= - test([](const float64_t& x) { return x * x <= 1024; }, - [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }, - TestUtils::get_new_kernel_params<0>(param)); - all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }, - TestUtils::get_new_kernel_params<1>(param)); - all_passed &= test([](const std::int32_t& x) { return x != 42; }, - [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 
std::int32_t(j + 1) : 42; }, - TestUtils::get_new_kernel_params<2>(param)); - - return all_passed; -} From 7be4594ec4fbaaffbc12732f4325113f944f91c4 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 6 Jun 2024 09:04:58 -0400 Subject: [PATCH 126/134] undo change to unroll version check Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/onedpl_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h index fff5f2405b5..450cae9a347 100644 --- a/include/oneapi/dpl/pstl/onedpl_config.h +++ b/include/oneapi/dpl/pstl/onedpl_config.h @@ -123,7 +123,7 @@ // Enable loop unrolling pragmas where supported #if (__INTEL_LLVM_COMPILER || __INTEL_COMPILER || \ (!defined(__INTEL_LLVM_COMPILER) && !defined(__INTEL_COMPILER) && \ - ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 40000)))) + ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 30700)))) # define _ONEDPL_PRAGMA_UNROLL _ONEDPL_PRAGMA(unroll) #else //no pragma unroll # define _ONEDPL_PRAGMA_UNROLL From 68c258d67129e828ea088863a8fb8ccc3dfb04b0 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 6 Jun 2024 09:16:51 -0400 Subject: [PATCH 127/134] formatting Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/single_pass_scan.h | 2 +- test/kt/single_pass_copy_if.cpp | 15 ++++++++------- test/kt/single_pass_scan.cpp | 6 ++++-- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 7848fc1fd8f..66481abb1c8 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -561,7 +561,7 @@ struct __copy_if_kernel_func #pragma unroll for (std::uint16_t __i = 0; __i < __data_per_workitem; ++__i) { - if (__i + (__wg_local_id) * __data_per_workitem + __elems_in_tile * __tile_id < __n) + if (__i + (__wg_local_id)*__data_per_workitem + __elems_in_tile * __tile_id < __n) { _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 88311e741ad..9896e394534 100644 --- a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -92,7 +92,8 @@ test_all_view(sycl::queue q, std::size_t size, Predicate pred, KernelParam param oneapi::dpl::experimental::ranges::all_view view(buf); oneapi::dpl::experimental::ranges::all_view view_out(buf_out); - oneapi::dpl::experimental::ranges::all_view view_num_copied(buf_num_copied); + oneapi::dpl::experimental::ranges::all_view view_num_copied( + buf_num_copied); oneapi::dpl::experimental::kt::gpu::copy_if(q, view, view_out, view_num_copied, pred, param).wait(); } @@ -134,7 +135,6 @@ test_buffer(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); std::string msg2 = "wrong results with buffer, n: " + std::to_string(size); EXPECT_EQ_N(out_ref.begin(), acc.begin(), num_copied_ref, msg2.c_str()); - } #endif @@ -159,14 +159,14 @@ test_usm(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) std::size_t num_copied_ref = out_end - std::begin(out_ref); oneapi::dpl::experimental::kt::gpu::copy_if(q, dt_input.get_data(), dt_input.get_data() + size, - dt_output.get_data(), dt_num_copied.get_data(), pred, param) + dt_output.get_data(), dt_num_copied.get_data(), pred, param) .wait(); 
std::vector actual(size); dt_output.retrieve_data(actual.begin()); std::vector num_copied_host(1); dt_num_copied.retrieve_data(num_copied_host.begin()); - + std::string msg1 = "wrong num copied with USM, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_host[0], msg1.c_str()); std::string msg2 = "wrong results with USM, n: " + std::to_string(size); @@ -222,7 +222,6 @@ void test_all_cases(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) { test_general_cases(q, size, pred, TestUtils::get_new_kernel_params<0>(param)); - } int @@ -235,8 +234,10 @@ main() << "TEST_TYPE : " << TypeInfo().name() << std::endl; #endif - constexpr oneapi::dpl::experimental::kt::kernel_param> params; + constexpr oneapi::dpl::experimental::kt::kernel_param< + TEST_DATA_PER_WORK_ITEM, TEST_WORK_GROUP_SIZE, + /*opt_out_single_wg=*/std::bool_constant> + params; auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); diff --git a/test/kt/single_pass_scan.cpp b/test/kt/single_pass_scan.cpp index a92b451cdcf..06c8a545748 100644 --- a/test/kt/single_pass_scan.cpp +++ b/test/kt/single_pass_scan.cpp @@ -210,8 +210,10 @@ main() << "TEST_TYPE : " << TypeInfo().name() << std::endl; #endif - constexpr oneapi::dpl::experimental::kt::kernel_param> params; + constexpr oneapi::dpl::experimental::kt::kernel_param< + TEST_DATA_PER_WORK_ITEM, TEST_WORK_GROUP_SIZE, + /*opt_out_single_wg=*/std::bool_constant> + params; auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); From 347bcf5d44b397742d42ff2e0f6941b7a4ad0434 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 6 Jun 2024 16:48:36 -0400 Subject: [PATCH 128/134] allowing alg to dictate active threads Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 66481abb1c8..c1eddba4a0d 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -259,26 +259,27 @@ template (__group, __subgroup, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, __local_reduction, __prev_tile_reduction, __binary_op); + __tile_id, __local_reduction, __prev_tile_reduction, __is_active_sg, __is_active_wi, __active_wg_wi_id, __binary_op); sycl::joint_inclusive_scan(__group, __tile_vals_ptr, __tile_vals_ptr + __wg_local_memory_size, __out_begin, __binary_op, __prev_tile_reduction); @@ -523,6 +528,12 @@ struct __copy_if_kernel_func auto __wg_local_id = __item.get_local_id(0); auto __sg = __item.get_sub_group(); + constexpr std::uint16_t __active_sg_id = __workgroup_size / SUBGROUP_SIZE - 1; + constexpr std::uint16_t __active_sg_wi_id = SUBGROUP_SIZE - 1; + bool __is_active_sg = (__subgroup.get_group_id() == __active_sg_id); + bool __is_active_wi = (__subgroup.get_local_id() == __active_sg_wi_id); + constexpr std::uint16_t __active_wg_wi_id = __workgroup_size - 1; + std::uint32_t __tile_id = 0; // Obtain unique ID for this work-group that will be used in decoupled lookback @@ -580,7 +591,8 @@ struct __copy_if_kernel_func _SizeT __copied_elements = 0; __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, __tile_id, - __wg_count + __wi_count, __copied_elements, _BinaryOp{}); + __wg_count + __wi_count, __copied_elements, __is_active_sg, __is_active_wi, + __active_wg_wi_id, 
_BinaryOp{}); // Phase 3: copy values to global memory for (std::uint16_t __i = 0; __i < __wi_count; ++__i) From 62f58e224fc101454fb4a67bb1f781d7fe64f176 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 6 Jun 2024 13:57:50 -0700 Subject: [PATCH 129/134] bugfix for indexes --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index c1eddba4a0d..57014969de0 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -306,8 +306,8 @@ struct __lookback_kernel_func auto __group = __item.get_group(); auto __subgroup = __item.get_sub_group(); auto __local_id = __item.get_local_id(0); - constexpr bool __is_active_sg = (__subgroup.get_group_id() == 0); - constexpr bool __is_active_wi = (__subgroup.get_local_id() == 0); + bool __is_active_sg = (__subgroup.get_group_id() == 0); + bool __is_active_wi = (__subgroup.get_local_id() == 0); constexpr std::uint16_t __active_wg_wi_id = 0; @@ -530,8 +530,8 @@ struct __copy_if_kernel_func constexpr std::uint16_t __active_sg_id = __workgroup_size / SUBGROUP_SIZE - 1; constexpr std::uint16_t __active_sg_wi_id = SUBGROUP_SIZE - 1; - bool __is_active_sg = (__subgroup.get_group_id() == __active_sg_id); - bool __is_active_wi = (__subgroup.get_local_id() == __active_sg_wi_id); + bool __is_active_sg = (__sg.get_group_id() == __active_sg_id); + bool __is_active_wi = (__sg.get_local_id() == __active_sg_wi_id); constexpr std::uint16_t __active_wg_wi_id = __workgroup_size - 1; std::uint32_t __tile_id = 0; From 5c4bd74213db66f153a7f608154feaba151987d3 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 6 Jun 2024 17:13:09 -0400 Subject: [PATCH 130/134] clang format Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 57014969de0..ea756308fd9 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -259,8 +259,8 @@ template (__group, __subgroup, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, __local_reduction, __prev_tile_reduction, __is_active_sg, __is_active_wi, __active_wg_wi_id, __binary_op); + __tile_id, __local_reduction, __prev_tile_reduction, __is_active_sg, __is_active_wi, + __active_wg_wi_id, __binary_op); sycl::joint_inclusive_scan(__group, __tile_vals_ptr, __tile_vals_ptr + __wg_local_memory_size, __out_begin, __binary_op, __prev_tile_reduction); From 98324ed68c422f8914ddf467f51930dce2aa42b3 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 18 Jun 2024 10:53:42 -0400 Subject: [PATCH 131/134] address reviewer comments Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/single_pass_scan.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index ea756308fd9..24ee071ee3d 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -647,7 +647,7 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, template sycl::event 
-single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng __num_rng, +single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_rng, _UnaryPredicate __pred, _KernelParam) { using _SizeT = uint64_t; @@ -702,8 +702,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); sycl::event __prev_event = __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}( - __queue, __fill_event, __in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, __status_flags_size, - __status_vals_full, __status_vals_partial, __current_num_items, __current_num_wgs); + __queue, __fill_event, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), + std::forward<_NumCopiedRng>(__num_rng), __n, __pred, __status_flags, __status_flags_size, __status_vals_full, + __status_vals_partial, __current_num_items, __current_num_wgs); // TODO: Currently, the following portion of code makes this entire function synchronous. // Ideally, we should be able to use the asynchronous free below, but we have found that doing @@ -728,9 +729,9 @@ sycl::event copy_if(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_rng, _UnaryPredicate __pred, _KernelParam __param = {}) { - auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRng>(__in_rng)); - auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__out_rng)); - auto __num_view = oneapi::dpl::__ranges::views::all(std::forward<_NumCopiedRng>(__num_rng)); + auto __in_view = oneapi::dpl::__ranges::views::all_read(std::forward<_InRng>(__in_rng)); + auto __out_view = oneapi::dpl::__ranges::views::all_write(std::forward<_OutRng>(__out_rng)); + auto __num_view = oneapi::dpl::__ranges::views::all_write(std::forward<_NumCopiedRng>(__num_rng)); return __impl::single_pass_copy_if_impl(__queue, std::move(__in_view), std::move(__out_view), std::move(__num_view), __pred, __param); From 4377c2de470d969cf2c3bc5da6a6ca0b9237ad00 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 18 Jun 2024 11:39:04 -0400 Subject: [PATCH 132/134] simplify data generation and cutoff calculation. Signed-off-by: Dan Hoeflinger --- test/kt/single_pass_copy_if.cpp | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 9896e394534..327e15563d5 100644 --- a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -61,13 +61,18 @@ generate_copy_if_data(T* input, std::size_t size, std::uint32_t seed) { // Integer numbers are generated even for floating point types in order to avoid rounding errors, // and simplify the final check - using substitute_t = std::conditional_t, std::int64_t, std::uint64_t>; - std::default_random_engine gen{seed}; - substitute_t start = std::is_signed_v ? -10 : 0; - substitute_t end = std::is_signed_v ? 
10 : 20; - std::uniform_int_distribution dist(start, end); - std::generate(input, input + size, [&] { return dist(gen); }); + + if constexpr (std::is_integral_v) + { + std::uniform_int_distribution dist(std::numeric_limits::min(), std::numeric_limits::max()); + std::generate(input, input + size, [&] { return dist(gen); }); + } + else + { + std::uniform_real_distribution dist(std::numeric_limits::min(), std::numeric_limits::max()); + std::generate(input, input + size, [&] { return dist(gen); }); + } } #if _ENABLE_RANGES_TESTING @@ -87,15 +92,12 @@ test_all_view(sycl::queue q, std::size_t size, Predicate pred, KernelParam param sycl::buffer buf_num_copied(&num_copied, 1); auto out_end = std::copy_if(std::begin(ref), std::end(ref), std::begin(out_ref), pred); std::size_t num_copied_ref = out_end - std::begin(out_ref); - { - sycl::buffer buf(input.data(), input.size()); + sycl::buffer buf(input.data(), input.size()); - oneapi::dpl::experimental::ranges::all_view view(buf); - oneapi::dpl::experimental::ranges::all_view view_out(buf_out); - oneapi::dpl::experimental::ranges::all_view view_num_copied( - buf_num_copied); - oneapi::dpl::experimental::kt::gpu::copy_if(q, view, view_out, view_num_copied, pred, param).wait(); - } + oneapi::dpl::experimental::ranges::all_view view(buf); + oneapi::dpl::experimental::ranges::all_view view_out(buf_out); + oneapi::dpl::experimental::ranges::all_view view_num_copied(buf_num_copied); + oneapi::dpl::experimental::kt::gpu::copy_if(q, view, view_out, view_num_copied, pred, param).wait(); auto acc = buf_out.get_host_access(); auto num_copied_acc = buf_num_copied.get_host_access(); @@ -241,7 +243,8 @@ main() auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); - auto __predicate = __less_than_val{std::is_signed_v ? TEST_TYPE{0} : TEST_TYPE{10}}; + TEST_TYPE cutoff = std::is_signed_v ? 
TEST_TYPE{0} : std::numeric_limits::max() / 2; + auto __predicate = __less_than_val{cutoff}; if (run_test) { From 496ed4d3eef3826873c453861d622901fee6072b Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 18 Jun 2024 11:57:36 -0400 Subject: [PATCH 133/134] strip out single workgroup opt out Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/kernel_param.h | 3 +- .../dpl/experimental/kt/single_pass_scan.h | 33 ++++++++----------- test/kt/CMakeLists.txt | 11 +++---- test/kt/single_pass_copy_if.cpp | 6 +--- test/kt/single_pass_scan.cpp | 6 +--- 5 files changed, 20 insertions(+), 39 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/kernel_param.h b/include/oneapi/dpl/experimental/kt/kernel_param.h index bbed93e777c..b3ee36be189 100644 --- a/include/oneapi/dpl/experimental/kt/kernel_param.h +++ b/include/oneapi/dpl/experimental/kt/kernel_param.h @@ -18,13 +18,12 @@ namespace oneapi::dpl::experimental::kt { template + typename _KernelName = oneapi::dpl::execution::DefaultKernelName> struct kernel_param { static constexpr std::uint16_t data_per_workitem = __data_per_work_item; static constexpr std::uint16_t workgroup_size = __work_group_size; using kernel_name = _KernelName; - using single_wg_opt_out = _SingleWgOptOut; }; } // namespace oneapi::dpl::experimental::kt diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 24ee071ee3d..48e860b3880 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -440,18 +440,14 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(__n); - if constexpr (std::negation_v) + // Perform a single-work group scan if the input is small + if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__queue, __n, __n_uniform)) { - // Perform a single-work group scan if the input is small - if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__queue, __n, __n_uniform)) - { - return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::execution::__dpl::make_device_policy(__queue), - std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), __n, - oneapi::dpl::__internal::__no_op{}, unseq_backend::__no_init_value<_Type>{}, __binary_op, - std::true_type{}); - } + return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::execution::__dpl::make_device_policy(__queue), + std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), __n, + oneapi::dpl::__internal::__no_op{}, unseq_backend::__no_init_value<_Type>{}, __binary_op, std::true_type{}); } constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; @@ -668,16 +664,13 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(__n); - if constexpr (std::negation_v) + //If we fit in a single WG SLM, use the single wg version from oneDPL main + if (oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__queue, __n, __n_uniform)) { - //If we fit in a single WG SLM, use the single wg version from oneDPL main - if 
(oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__queue, __n, __n_uniform)) - { - return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( - oneapi::dpl::execution::__dpl::make_device_policy<_KernelName>(__queue), __n, - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), - std::forward<_NumCopiedRng>(__num_rng), __pred); - } + return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( + oneapi::dpl::execution::__dpl::make_device_policy<_KernelName>(__queue), __n, + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), std::forward<_NumCopiedRng>(__num_rng), + __pred); } constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; constexpr std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; diff --git a/test/kt/CMakeLists.txt b/test/kt/CMakeLists.txt index 3159cfdae7e..cd2c600e1c5 100644 --- a/test/kt/CMakeLists.txt +++ b/test/kt/CMakeLists.txt @@ -130,7 +130,7 @@ if (ONEDPL_TEST_ENABLE_KT_ESIMD) _generate_esimd_sort_test("esimd_radix_sort" "256" "32" "double" "" 1000) # segfault endif() -function (_generate_gpu_single_pass_test _alg _data_per_work_item _work_group_size _type _single_wg_optout) +function (_generate_gpu_single_pass_test _alg _data_per_work_item _work_group_size _type) if ((NOT TARGET "build-${_alg}-kt-tests") AND (NOT TARGET "run-${_alg}-kt-tests")) add_custom_target("build-${_alg}-kt-tests" COMMENT "Build all ${_alg} kernel template tests") @@ -141,7 +141,7 @@ function (_generate_gpu_single_pass_test _alg _data_per_work_item _work_group_si endif() string(REPLACE "_t" "" _type_short ${_type}) - set(_target_name "single_pass_${_alg}_dpwi${_data_per_work_item}_wgs${_work_group_size}_${_type_short}_${_single_wg_optout}") + set(_target_name "single_pass_${_alg}_dpwi${_data_per_work_item}_wgs${_work_group_size}_${_type_short}") set(_test_path "single_pass_${_alg}.cpp") #_generate_test_randomly(${_target_name} ${_test_path} ${_probability_permille}) @@ -152,7 +152,6 @@ function (_generate_gpu_single_pass_test _alg _data_per_work_item _work_group_si target_compile_definitions(${_target_name} PRIVATE TEST_DATA_PER_WORK_ITEM=${_data_per_work_item}) target_compile_definitions(${_target_name} PRIVATE TEST_WORK_GROUP_SIZE=${_work_group_size}) - target_compile_definitions(${_target_name} PRIVATE TEST_SINGLE_WG_OPTOUT=${_single_wg_optout}) target_compile_definitions(${_target_name} PRIVATE TEST_TYPE=${_type}) endif() @@ -168,15 +167,13 @@ function(_generate_gpu_single_pass_tests) foreach (_data_per_work_item ${_data_per_work_item_all}) foreach (_work_group_size ${_work_group_size_all}) foreach (_type ${_type_all}) - _generate_gpu_single_pass_test(${_alg} ${_data_per_work_item} ${_work_group_size} ${_type} "false") + _generate_gpu_single_pass_test(${_alg} ${_data_per_work_item} ${_work_group_size} ${_type}) endforeach() endforeach() endforeach() - # to not double the number of tests, check single wg output with a single test per alg - _generate_gpu_single_pass_test(${_alg} "8" "512" "float" "true") _generate_test("single_pass_${_alg}" "single_pass_${_alg}.cpp") - target_compile_definitions("single_pass_${_alg}" PRIVATE TEST_DATA_PER_WORK_ITEM=8 TEST_WORK_GROUP_SIZE=256 TEST_TYPE=uint32_t TEST_SINGLE_WG_OPTOUT=false) + target_compile_definitions("single_pass_${_alg}" PRIVATE TEST_DATA_PER_WORK_ITEM=8 TEST_WORK_GROUP_SIZE=256 TEST_TYPE=uint32_t) endforeach() endfunction() diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 327e15563d5..66a373643a1 100644 --- 
a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -232,14 +232,10 @@ main() #if LOG_TEST_INFO std::cout << "TEST_DATA_PER_WORK_ITEM : " << TEST_DATA_PER_WORK_ITEM << "\n" << "TEST_WORK_GROUP_SIZE : " << TEST_WORK_GROUP_SIZE << "\n" - << "TEST_SINGLE_WG_OPTOUT : " << TEST_SINGLE_WG_OPTOUT << "\n" << "TEST_TYPE : " << TypeInfo().name() << std::endl; #endif - constexpr oneapi::dpl::experimental::kt::kernel_param< - TEST_DATA_PER_WORK_ITEM, TEST_WORK_GROUP_SIZE, - /*opt_out_single_wg=*/std::bool_constant> - params; + constexpr oneapi::dpl::experimental::kt::kernel_param params; auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); diff --git a/test/kt/single_pass_scan.cpp b/test/kt/single_pass_scan.cpp index 06c8a545748..860db88d2b3 100644 --- a/test/kt/single_pass_scan.cpp +++ b/test/kt/single_pass_scan.cpp @@ -206,14 +206,10 @@ main() #if LOG_TEST_INFO std::cout << "TEST_DATA_PER_WORK_ITEM : " << TEST_DATA_PER_WORK_ITEM << "\n" << "TEST_WORK_GROUP_SIZE : " << TEST_WORK_GROUP_SIZE << "\n" - << "TEST_SINGLE_WG_OPTOUT : " << TEST_SINGLE_WG_OPTOUT << "\n" << "TEST_TYPE : " << TypeInfo().name() << std::endl; #endif - constexpr oneapi::dpl::experimental::kt::kernel_param< - TEST_DATA_PER_WORK_ITEM, TEST_WORK_GROUP_SIZE, - /*opt_out_single_wg=*/std::bool_constant> - params; + constexpr oneapi::dpl::experimental::kt::kernel_param params; auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); From 2d9e8a76c6f2f1169dea87ebce4a9f71bce7456f Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 18 Jun 2024 14:17:18 -0400 Subject: [PATCH 134/134] minimal data generation changes Signed-off-by: Dan Hoeflinger --- test/kt/single_pass_copy_if.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 66a373643a1..86b3cc46820 100644 --- a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -59,18 +59,16 @@ template auto generate_copy_if_data(T* input, std::size_t size, std::uint32_t seed) { - // Integer numbers are generated even for floating point types in order to avoid rounding errors, - // and simplify the final check std::default_random_engine gen{seed}; if constexpr (std::is_integral_v) { - std::uniform_int_distribution dist(std::numeric_limits::min(), std::numeric_limits::max()); + std::uniform_int_distribution dist(std::numeric_limits::lowest(), std::numeric_limits::max()); std::generate(input, input + size, [&] { return dist(gen); }); } else { - std::uniform_real_distribution dist(std::numeric_limits::min(), std::numeric_limits::max()); + std::uniform_real_distribution dist(std::numeric_limits::lowest(), std::numeric_limits::max()); std::generate(input, input + size, [&] { return dist(gen); }); } }
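
For context on how the interface exercised by these tests is meant to be driven, here is a minimal host-side sketch. The namespace, the kernel_param<data_per_workitem, workgroup_size> signature, the (first, last, out, num_copied, pred, param) argument order, and the 8/256 tuning values follow the diffs and CMake defaults above; the standalone include path, the USM allocations via sycl::malloc_shared, the int/std::size_t element types, and the is_even predicate are illustrative assumptions rather than part of this series.

#include <cstddef>
#include <iostream>
#include <sycl/sycl.hpp>
// Path of the header modified throughout this series; assumed to be usable standalone.
#include <oneapi/dpl/experimental/kt/single_pass_scan.h>

namespace kt = oneapi::dpl::experimental::kt;

int main()
{
    sycl::queue q;
    constexpr std::size_t n = 1 << 20;

    // USM shared allocations standing in for the usm_data_transfer helper used by the tests.
    int* in = sycl::malloc_shared<int>(n, q);
    int* out = sycl::malloc_shared<int>(n, q);
    std::size_t* num_copied = sycl::malloc_shared<std::size_t>(1, q); // count element type is an assumption

    for (std::size_t i = 0; i < n; ++i)
        in[i] = static_cast<int>(i);

    // kernel_param<data_per_workitem, workgroup_size[, kernel_name]> as defined in kernel_param.h after patch 133.
    constexpr kt::kernel_param<8, 256> param;

    // Illustrative predicate; the tests above use a __less_than_val functor instead.
    auto is_even = [](int x) { return x % 2 == 0; };

    // Same call shape as the USM test path: copy_if returns a sycl::event.
    kt::gpu::copy_if(q, in, in + n, out, num_copied, is_even, param).wait();

    std::cout << "copied " << *num_copied << " of " << n << " elements\n";

    sycl::free(in, q);
    sycl::free(out, q);
    sycl::free(num_copied, q);
    return 0;
}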