From d8a491f28ad96be12824650eebad7096e4033dbc Mon Sep 17 00:00:00 2001
From: Stephen Nicholas Swatman <stephen.nicholas.swatman@cern.ch>
Date: Wed, 18 Sep 2024 17:49:09 +0200
Subject: [PATCH] Add sanity checks for predicates on vectors

This commit adds four new sanity checks, `true_for_all`,
`false_for_all`, `true_for_any`, and `false_for_any` which basically do
what they say on the tin.
---
 cmake/traccc-compiler-options-cuda.cmake |   3 +
 device/cuda/src/sanity/predicate.cuh     | 129 ++++++++++++++++++++
 device/sycl/src/sanity/predicate.hpp     | 130 ++++++++++++++++++++
 tests/cuda/CMakeLists.txt                |   1 +
 tests/cuda/test_sanity_predicate.cu      | 146 ++++++++++++++++++++++
 tests/sycl/CMakeLists.txt                |   1 +
 tests/sycl/test_sanity_predicate.sycl    | 147 +++++++++++++++++++++++
 7 files changed, 557 insertions(+)
 create mode 100644 device/cuda/src/sanity/predicate.cuh
 create mode 100644 device/sycl/src/sanity/predicate.hpp
 create mode 100644 tests/cuda/test_sanity_predicate.cu
 create mode 100644 tests/sycl/test_sanity_predicate.sycl
diff --git a/cmake/traccc-compiler-options-cuda.cmake b/cmake/traccc-compiler-options-cuda.cmake
index 42cf2ade48..49166cec64 100644
--- a/cmake/traccc-compiler-options-cuda.cmake
+++ b/cmake/traccc-compiler-options-cuda.cmake
@@ -27,6 +27,9 @@ set( CMAKE_CUDA_ARCHITECTURES "52" CACHE STRING
 # not marked with __device__.
 traccc_add_flag( CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr" )
 
+# Allow the use of lambdas with __device__ specifiers.
+traccc_add_flag( CMAKE_CUDA_FLAGS "--extended-lambda" )
+
 # Make CUDA generate debug symbols for the device code as well in a debug
 # build.
 traccc_add_flag( CMAKE_CUDA_FLAGS_DEBUG "-G --keep -src-in-ptx" )
diff --git a/device/cuda/src/sanity/predicate.cuh b/device/cuda/src/sanity/predicate.cuh
new file mode 100644
index 0000000000..de5a8c88f2
--- /dev/null
+++ b/device/cuda/src/sanity/predicate.cuh
@@ -0,0 +1,129 @@
+/**
+ * traccc library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+// Project include(s).
+#include "../utils/cuda_error_handling.hpp"
+#include "traccc/cuda/utils/stream.hpp"
+
+// VecMem include(s).
+#include <vecmem/containers/data/vector_view.hpp>
+#include <vecmem/containers/device_vector.hpp>
+#include <vecmem/memory/memory_resource.hpp>
+#include <vecmem/memory/unique_ptr.hpp>
+#include <vecmem/utils/copy.hpp>
+
+// CUDA include
+#include <cuda_runtime.h>
+
+// System include
+#include <concepts>
+
+namespace traccc::cuda {
+namespace kernels {
+template <typename P, typename T>
+requires std::predicate<P, T> __global__ void true_for_all_kernel(
+    P projection, vecmem::data::vector_view<T> _in, bool* out) {
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+
+    vecmem::device_vector<T> in(_in);
+
+    if (tid < in.size()) {
+        if (!projection(in.at(tid))) {
+            *out = false;
+        }
+    }
+}
+}  // namespace kernels
+
+/**
+ * @brief Sanity check that a predicate is true for all elements of a vector.
+ *
+ * @note This function runs in O(n) time.
+ *
+ * @tparam P The type of the predicate.
+ * @tparam T The type of the vector.
+ * @param predicate A projection object of type `P`.
+ * @param mr A memory resource used for allocating intermediate memory.
+ * @param copy A copy object.
+ * @param stream A wrapped CUDA stream.
+ * @param vector The vector which to check for contiguity.
+ * @return true If `predicate` is true for all elements of `vector`.
+ * @return false Otherwise.
+ */
+template <typename P, typename T>
+requires std::predicate<P, T> bool true_for_all(
+    P&& predicate, vecmem::memory_resource& mr, vecmem::copy& copy,
+    stream& stream, vecmem::data::vector_view<T> vector) {
+    // This should never be a performance-critical step, so we can keep the
+    // block size fixed.
+    constexpr int block_size = 512;
+
+    cudaStream_t cuda_stream =
+        reinterpret_cast<cudaStream_t>(stream.cudaStream());
+
+    // Grab the number of elements in our vector.
+    const std::uint32_t n = copy.get_size(vector);
+
+    // Allocate memory for outputs, then set them up.
+    vecmem::unique_alloc_ptr<bool> device_out =
+        vecmem::make_unique_alloc<bool>(mr);
+
+    bool initial_out = true;
+
+    TRACCC_CUDA_ERROR_CHECK(
+        cudaMemcpyAsync(device_out.get(), &initial_out, sizeof(bool),
+                        cudaMemcpyHostToDevice, cuda_stream));
+
+    // Launch the main kernel.
+    kernels::true_for_all_kernel<P, T>
+        <<<(n + block_size - 1) / block_size, block_size, 0, cuda_stream>>>(
+            predicate, vector, device_out.get());
+
+    TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
+
+    // Copy the total number of squashed elements, e.g. the size of the
+    // resulting vector.
+    bool host_out;
+
+    TRACCC_CUDA_ERROR_CHECK(
+        cudaMemcpyAsync(&host_out, device_out.get(), sizeof(bool),
+                        cudaMemcpyDeviceToHost, cuda_stream));
+
+    stream.synchronize();
+
+    return host_out;
+}
+
+template <typename P, typename T>
+requires std::predicate<P, T> bool false_for_all(
+    P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy,
+    stream& stream, vecmem::data::vector_view<T> vector) {
+    return true_for_all(
+        [projection] __device__<typename... Args>(Args && ... args) {
+            return !projection(std::forward<Args>(args)...);
+        },
+        mr, copy, stream, vector);
+}
+
+template <typename P, typename T>
+requires std::predicate<P, T> bool true_for_any(
+    P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy,
+    stream& stream, vecmem::data::vector_view<T> vector) {
+    return !false_for_all(std::forward<P>(projection), mr, copy, stream,
+                          vector);
+}
+
+template <typename P, typename T>
+requires std::predicate<P, T> bool false_for_any(
+    P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy,
+    stream& stream, vecmem::data::vector_view<T> vector) {
+    return !true_for_all(std::forward<P>(projection), mr, copy, stream, vector);
+}
+}  // namespace traccc::cuda
diff --git a/device/sycl/src/sanity/predicate.hpp b/device/sycl/src/sanity/predicate.hpp
new file mode 100644
index 0000000000..40b1dfe4a9
--- /dev/null
+++ b/device/sycl/src/sanity/predicate.hpp
@@ -0,0 +1,130 @@
+/**
+ * traccc library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+// Project include(s).
+#include <traccc/sycl/utils/queue_wrapper.hpp>
+
+#include "../utils/get_queue.hpp"
+
+// VecMem include(s).
+#include <vecmem/containers/data/vector_view.hpp>
+#include <vecmem/containers/device_vector.hpp>
+#include <vecmem/memory/memory_resource.hpp>
+#include <vecmem/memory/unique_ptr.hpp>
+#include <vecmem/utils/copy.hpp>
+
+// SYCL include
+#include <CL/sycl.hpp>
+
+// System include
+#include <concepts>
+
+namespace traccc::sycl {
+namespace kernels {
+template <typename P, typename T>
+class TrueForAllPredicate {};
+}  // namespace kernels
+
+/**
+ * @brief Sanity check that a predicate is true for all elements of a vector.
+ *
+ * @note This function runs in O(n) time.
+ *
+ * @tparam P The type of the predicate.
+ * @tparam T The type of the vector.
+ * @param predicate A projection object of type `P`.
+ * @param mr A memory resource used for allocating intermediate memory.
+ * @param copy A copy object.
+ * @param stream A wrapped CUDA stream.
+ * @param vector The vector which to check for contiguity.
+ * @return true If `predicate` is true for all elements of `vector`.
+ * @return false Otherwise.
+ */
+template <typename P, typename T>
+requires std::predicate<P, T> bool true_for_all(
+    P&& predicate, vecmem::memory_resource& mr, vecmem::copy& copy,
+    queue_wrapper& queue_wrapper, vecmem::data::vector_view<T> vector) {
+    // This should never be a performance-critical step, so we can keep the
+    // block size fixed.
+    constexpr int block_size = 512;
+
+    cl::sycl::queue& queue = details::get_queue(queue_wrapper);
+
+    // Grab the number of elements in our vector.
+    const std::uint32_t n = copy.get_size(vector);
+
+    // Allocate memory for outputs, then set them up.
+    vecmem::unique_alloc_ptr<bool> device_out =
+        vecmem::make_unique_alloc<bool>(mr);
+
+    bool initial_out = true;
+
+    cl::sycl::event kernel1_memcpy1 =
+        queue.memcpy(device_out.get(), &initial_out, sizeof(bool));
+
+    // Launch the main kernel.
+    cl::sycl::nd_range<1> kernel_range{
+        cl::sycl::range<1>(((n + block_size - 1) / block_size) * block_size),
+        cl::sycl::range<1>(block_size)};
+
+    cl::sycl::event kernel1 = queue.submit([&](cl::sycl::handler& h) {
+        h.depends_on(kernel1_memcpy1);
+        h.parallel_for<kernels::TrueForAllPredicate<P, T>>(
+            kernel_range, [projection, vector,
+                           out = device_out.get()](cl::sycl::nd_item<1> item) {
+                std::size_t tid = item.get_global_linear_id();
+
+                vecmem::device_vector<T> in(vector);
+
+                if (tid < in.size()) {
+                    if (!projection(in.at(tid))) {
+                        *out = false;
+                    }
+                }
+            });
+    });
+
+    // Copy the total number of squashed elements, e.g. the size of the
+    // resulting vector.
+    bool host_out;
+
+    queue.memcpy(&host_out, out.get(), sizeof(bool), {kernel1})
+        .wait_and_throw();
+
+    return host_out;
+}
+
+template <typename P, typename T>
+requires std::predicate<P, T> bool false_for_all(
+    P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy,
+    queue_wrapper& queue_wrapper, vecmem::data::vector_view<T> vector) {
+    return true_for_all(
+        [projection]<typename... Args>(Args && ... args) {
+            return !projection(std::forward<Args>(args)...);
+        },
+        mr, copy, queue_wrapper, vector);
+}
+
+template <typename P, typename T>
+requires std::predicate<P, T> bool true_for_any(
+    P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy,
+    queue_wrapper& queue_wrapper, vecmem::data::vector_view<T> vector) {
+    return !false_for_all(std::forward<P>(projection), mr, copy, queue_wrapper,
+                          vector);
+}
+
+template <typename P, typename T>
+requires std::predicate<P, T> bool false_for_any(
+    P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy,
+    queue_wrapper& queue_wrapper, vecmem::data::vector_view<T> vector) {
+    return !true_for_all(std::forward<P>(projection), mr, copy, queue_wrapper,
+                         vector);
+}
+}  // namespace traccc::sycl
diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt
index 458592d5a5..9fe921f00b 100644
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -44,6 +44,7 @@ traccc_add_test(
     test_unique_lock.cu
     test_sanity_contiguous_on.cu
     test_sanity_ordered_on.cu
+    test_sanity_predicate.cu
     test_sort.cu
 
     LINK_LIBRARIES
diff --git a/tests/cuda/test_sanity_predicate.cu b/tests/cuda/test_sanity_predicate.cu
new file mode 100644
index 0000000000..95d229288a
--- /dev/null
+++ b/tests/cuda/test_sanity_predicate.cu
@@ -0,0 +1,146 @@
+/*
+ * traccc library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+// vecmem includes
+#include <vecmem/memory/cuda/device_memory_resource.hpp>
+#include <vecmem/utils/cuda/async_copy.hpp>
+
+// traccc includes
+#include <traccc/definitions/qualifiers.hpp>
+
+#include "../../device/cuda/src/sanity/predicate.cuh"
+
+// GTest include(s).
+#include <gtest/gtest.h>
+
+struct IsLessThan {
+    __device__ bool operator()(int i) const { return i < m_val; }
+
+    int m_val;
+};
+
+class CUDASanityPredicate : public testing::Test {
+    protected:
+    CUDASanityPredicate() : copy(stream.cudaStream()) {}
+
+    vecmem::cuda::device_memory_resource mr;
+    traccc::cuda::stream stream;
+    vecmem::cuda::async_copy copy;
+};
+
+TEST_F(CUDASanityPredicate, TrueForAllTrue) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_TRUE(traccc::cuda::true_for_all(IsLessThan{5001}, mr, copy, stream,
+                                           device_data));
+}
+
+TEST_F(CUDASanityPredicate, TrueForAllFalse) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_FALSE(traccc::cuda::true_for_all(IsLessThan{4500}, mr, copy, stream,
+                                            device_data));
+}
+
+TEST_F(CUDASanityPredicate, TrueForAnyTrue) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_TRUE(traccc::cuda::true_for_any(IsLessThan{1}, mr, copy, stream,
+                                           device_data));
+}
+
+TEST_F(CUDASanityPredicate, TrueForAnyFalse) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_FALSE(traccc::cuda::true_for_any(IsLessThan{0}, mr, copy, stream,
+                                            device_data));
+}
+
+TEST_F(CUDASanityPredicate, FalseForAllTrue) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_TRUE(traccc::cuda::false_for_all(IsLessThan{0}, mr, copy, stream,
+                                            device_data));
+}
+
+TEST_F(CUDASanityPredicate, FalseForAllFalse) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_FALSE(traccc::cuda::false_for_all(IsLessThan{1}, mr, copy, stream,
+                                             device_data));
+}
+
+TEST_F(CUDASanityPredicate, FalseForAnyTrue) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_TRUE(traccc::cuda::false_for_any(IsLessThan{1}, mr, copy, stream,
+                                            device_data));
+}
+
+TEST_F(CUDASanityPredicate, FalseForAnyFalse) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_FALSE(traccc::cuda::false_for_any(IsLessThan{6000}, mr, copy, stream,
+                                             device_data));
+}
diff --git a/tests/sycl/CMakeLists.txt b/tests/sycl/CMakeLists.txt
index 63dbf9ff43..235d3e34db 100644
--- a/tests/sycl/CMakeLists.txt
+++ b/tests/sycl/CMakeLists.txt
@@ -22,6 +22,7 @@ traccc_add_test(
     test_cca.sycl
     test_sanity_contiguous_on.sycl
     test_sanity_ordered_on.sycl
+    test_sanity_predicate.sycl
     test_sort.sycl
 
     LINK_LIBRARIES
diff --git a/tests/sycl/test_sanity_predicate.sycl b/tests/sycl/test_sanity_predicate.sycl
new file mode 100644
index 0000000000..ad24301462
--- /dev/null
+++ b/tests/sycl/test_sanity_predicate.sycl
@@ -0,0 +1,147 @@
+/*
+ * traccc library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+// vecmem includes
+#include <vecmem/memory/sycl/device_memory_resource.hpp>
+#include <vecmem/utils/sycl/async_copy.hpp>
+
+// traccc includes
+#include <traccc/definitions/qualifiers.hpp>
+
+#include "../../device/sycl/src/sanity/predicate.hpp"
+
+// GTest include(s).
+#include <gtest/gtest.h>
+
+struct IsLessThan {
+    bool operator()(int i) const { return i < m_val; }
+
+    int m_val;
+};
+
+class SYCLSanityPredicate : public testing::Test {
+    protected:
+    SYCLSanityPredicate() : queue_wrapper(&queue), copy(&queue) {}
+
+    vecmem::sycl::device_memory_resource mr;
+    cl::sycl::queue queue;
+    traccc::sycl::queue_wrapper queue_wrapper;
+    vecmem::sycl::async_copy copy;
+};
+
+TEST_F(SYCLSanityPredicate, TrueForAllTrue) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_TRUE(traccc::sycl::true_for_all(IsLessThan{5001}, mr, copy,
+                                           queue_wrapper, device_data));
+}
+
+TEST_F(SYCLSanityPredicate, TrueForAllFalse) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_FALSE(traccc::sycl::true_for_all(IsLessThan{4500}, mr, copy,
+                                            queue_wrapper, device_data));
+}
+
+TEST_F(SYCLSanityPredicate, TrueForAnyTrue) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_TRUE(traccc::sycl::true_for_any(IsLessThan{1}, mr, copy,
+                                           queue_wrapper, device_data));
+}
+
+TEST_F(SYCLSanityPredicate, TrueForAnyFalse) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_FALSE(traccc::sycl::true_for_any(IsLessThan{0}, mr, copy,
+                                            queue_wrapper, device_data));
+}
+
+TEST_F(SYCLSanityPredicate, FalseForAllTrue) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_TRUE(traccc::sycl::false_for_all(IsLessThan{0}, mr, copy,
+                                            queue_wrapper, device_data));
+}
+
+TEST_F(SYCLSanityPredicate, FalseForAllFalse) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_FALSE(traccc::sycl::false_for_all(IsLessThan{1}, mr, copy,
+                                             queue_wrapper, device_data));
+}
+
+TEST_F(SYCLSanityPredicate, FalseForAnyTrue) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_TRUE(traccc::sycl::false_for_any(IsLessThan{1}, mr, copy,
+                                            queue_wrapper, device_data));
+}
+
+TEST_F(SYCLSanityPredicate, FalseForAnyFalse) {
+    std::vector<int> host_vector;
+
+    for (int i = 0; i < 5000; ++i) {
+        host_vector.push_back(i);
+    }
+
+    auto device_data = copy.to(vecmem::get_data(host_vector), mr,
+                               vecmem::copy::type::host_to_device);
+
+    ASSERT_FALSE(traccc::sycl::false_for_any(IsLessThan{6000}, mr, copy,
+                                             queue_wrapper, device_data));
+}