diff --git a/cmake/traccc-compiler-options-cuda.cmake b/cmake/traccc-compiler-options-cuda.cmake index 42cf2ade48..49166cec64 100644 --- a/cmake/traccc-compiler-options-cuda.cmake +++ b/cmake/traccc-compiler-options-cuda.cmake @@ -27,6 +27,9 @@ set( CMAKE_CUDA_ARCHITECTURES "52" CACHE STRING # not marked with __device__. traccc_add_flag( CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr" ) +# Allow the use of lambdas with __device__ specifiers. +traccc_add_flag( CMAKE_CUDA_FLAGS "--extended-lambda" ) + # Make CUDA generate debug symbols for the device code as well in a debug # build. traccc_add_flag( CMAKE_CUDA_FLAGS_DEBUG "-G --keep -src-in-ptx" ) diff --git a/device/cuda/src/sanity/predicate.cuh b/device/cuda/src/sanity/predicate.cuh new file mode 100644 index 0000000000..de5a8c88f2 --- /dev/null +++ b/device/cuda/src/sanity/predicate.cuh @@ -0,0 +1,129 @@ +/** + * traccc library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +// Project include(s). +#include "../utils/cuda_error_handling.hpp" +#include "traccc/cuda/utils/stream.hpp" + +// VecMem include(s). +#include +#include +#include +#include +#include + +// CUDA include +#include + +// System include +#include + +namespace traccc::cuda { +namespace kernels { +template +requires std::predicate __global__ void true_for_all_kernel( + P projection, vecmem::data::vector_view _in, bool* out) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + + vecmem::device_vector in(_in); + + if (tid < in.size()) { + if (!projection(in.at(tid))) { + *out = false; + } + } +} +} // namespace kernels + +/** + * @brief Sanity check that a predicate is true for all elements of a vector. + * + * @note This function runs in O(n) time. + * + * @tparam P The type of the predicate. + * @tparam T The type of the vector. + * @param predicate A projection object of type `P`. + * @param mr A memory resource used for allocating intermediate memory. + * @param copy A copy object. + * @param stream A wrapped CUDA stream. + * @param vector The vector which to check for contiguity. + * @return true If `predicate` is true for all elements of `vector`. + * @return false Otherwise. + */ +template +requires std::predicate bool true_for_all( + P&& predicate, vecmem::memory_resource& mr, vecmem::copy& copy, + stream& stream, vecmem::data::vector_view vector) { + // This should never be a performance-critical step, so we can keep the + // block size fixed. + constexpr int block_size = 512; + + cudaStream_t cuda_stream = + reinterpret_cast(stream.cudaStream()); + + // Grab the number of elements in our vector. + const std::uint32_t n = copy.get_size(vector); + + // Allocate memory for outputs, then set them up. + vecmem::unique_alloc_ptr device_out = + vecmem::make_unique_alloc(mr); + + bool initial_out = true; + + TRACCC_CUDA_ERROR_CHECK( + cudaMemcpyAsync(device_out.get(), &initial_out, sizeof(bool), + cudaMemcpyHostToDevice, cuda_stream)); + + // Launch the main kernel. + kernels::true_for_all_kernel + <<<(n + block_size - 1) / block_size, block_size, 0, cuda_stream>>>( + predicate, vector, device_out.get()); + + TRACCC_CUDA_ERROR_CHECK(cudaGetLastError()); + + // Copy the total number of squashed elements, e.g. the size of the + // resulting vector. + bool host_out; + + TRACCC_CUDA_ERROR_CHECK( + cudaMemcpyAsync(&host_out, device_out.get(), sizeof(bool), + cudaMemcpyDeviceToHost, cuda_stream)); + + stream.synchronize(); + + return host_out; +} + +template +requires std::predicate bool false_for_all( + P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy, + stream& stream, vecmem::data::vector_view vector) { + return true_for_all( + [projection] __device__(Args && ... args) { + return !projection(std::forward(args)...); + }, + mr, copy, stream, vector); +} + +template +requires std::predicate bool true_for_any( + P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy, + stream& stream, vecmem::data::vector_view vector) { + return !false_for_all(std::forward

(projection), mr, copy, stream, + vector); +} + +template +requires std::predicate bool false_for_any( + P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy, + stream& stream, vecmem::data::vector_view vector) { + return !true_for_all(std::forward

(projection), mr, copy, stream, vector); +} +} // namespace traccc::cuda diff --git a/device/sycl/src/sanity/predicate.hpp b/device/sycl/src/sanity/predicate.hpp new file mode 100644 index 0000000000..40b1dfe4a9 --- /dev/null +++ b/device/sycl/src/sanity/predicate.hpp @@ -0,0 +1,130 @@ +/** + * traccc library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +// Project include(s). +#include + +#include "../utils/get_queue.hpp" + +// VecMem include(s). +#include +#include +#include +#include +#include + +// SYCL include +#include + +// System include +#include + +namespace traccc::sycl { +namespace kernels { +template +class TrueForAllPredicate {}; +} // namespace kernels + +/** + * @brief Sanity check that a predicate is true for all elements of a vector. + * + * @note This function runs in O(n) time. + * + * @tparam P The type of the predicate. + * @tparam T The type of the vector. + * @param predicate A projection object of type `P`. + * @param mr A memory resource used for allocating intermediate memory. + * @param copy A copy object. + * @param stream A wrapped CUDA stream. + * @param vector The vector which to check for contiguity. + * @return true If `predicate` is true for all elements of `vector`. + * @return false Otherwise. + */ +template +requires std::predicate bool true_for_all( + P&& predicate, vecmem::memory_resource& mr, vecmem::copy& copy, + queue_wrapper& queue_wrapper, vecmem::data::vector_view vector) { + // This should never be a performance-critical step, so we can keep the + // block size fixed. + constexpr int block_size = 512; + + cl::sycl::queue& queue = details::get_queue(queue_wrapper); + + // Grab the number of elements in our vector. + const std::uint32_t n = copy.get_size(vector); + + // Allocate memory for outputs, then set them up. + vecmem::unique_alloc_ptr device_out = + vecmem::make_unique_alloc(mr); + + bool initial_out = true; + + cl::sycl::event kernel1_memcpy1 = + queue.memcpy(device_out.get(), &initial_out, sizeof(bool)); + + // Launch the main kernel. + cl::sycl::nd_range<1> kernel_range{ + cl::sycl::range<1>(((n + block_size - 1) / block_size) * block_size), + cl::sycl::range<1>(block_size)}; + + cl::sycl::event kernel1 = queue.submit([&](cl::sycl::handler& h) { + h.depends_on(kernel1_memcpy1); + h.parallel_for>( + kernel_range, [projection, vector, + out = device_out.get()](cl::sycl::nd_item<1> item) { + std::size_t tid = item.get_global_linear_id(); + + vecmem::device_vector in(vector); + + if (tid < in.size()) { + if (!projection(in.at(tid))) { + *out = false; + } + } + }); + }); + + // Copy the total number of squashed elements, e.g. the size of the + // resulting vector. + bool host_out; + + queue.memcpy(&host_out, out.get(), sizeof(bool), {kernel1}) + .wait_and_throw(); + + return host_out; +} + +template +requires std::predicate bool false_for_all( + P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy, + queue_wrapper& queue_wrapper, vecmem::data::vector_view vector) { + return true_for_all( + [projection](Args && ... args) { + return !projection(std::forward(args)...); + }, + mr, copy, queue_wrapper, vector); +} + +template +requires std::predicate bool true_for_any( + P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy, + queue_wrapper& queue_wrapper, vecmem::data::vector_view vector) { + return !false_for_all(std::forward

(projection), mr, copy, queue_wrapper, + vector); +} + +template +requires std::predicate bool false_for_any( + P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy, + queue_wrapper& queue_wrapper, vecmem::data::vector_view vector) { + return !true_for_all(std::forward

(projection), mr, copy, queue_wrapper, + vector); +} +} // namespace traccc::sycl diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt index 458592d5a5..9fe921f00b 100644 --- a/tests/cuda/CMakeLists.txt +++ b/tests/cuda/CMakeLists.txt @@ -44,6 +44,7 @@ traccc_add_test( test_unique_lock.cu test_sanity_contiguous_on.cu test_sanity_ordered_on.cu + test_sanity_predicate.cu test_sort.cu LINK_LIBRARIES diff --git a/tests/cuda/test_sanity_predicate.cu b/tests/cuda/test_sanity_predicate.cu new file mode 100644 index 0000000000..95d229288a --- /dev/null +++ b/tests/cuda/test_sanity_predicate.cu @@ -0,0 +1,146 @@ +/* + * traccc library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// vecmem includes +#include +#include + +// traccc includes +#include + +#include "../../device/cuda/src/sanity/predicate.cuh" + +// GTest include(s). +#include + +struct IsLessThan { + __device__ bool operator()(int i) const { return i < m_val; } + + int m_val; +}; + +class CUDASanityPredicate : public testing::Test { + protected: + CUDASanityPredicate() : copy(stream.cudaStream()) {} + + vecmem::cuda::device_memory_resource mr; + traccc::cuda::stream stream; + vecmem::cuda::async_copy copy; +}; + +TEST_F(CUDASanityPredicate, TrueForAllTrue) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_TRUE(traccc::cuda::true_for_all(IsLessThan{5001}, mr, copy, stream, + device_data)); +} + +TEST_F(CUDASanityPredicate, TrueForAllFalse) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_FALSE(traccc::cuda::true_for_all(IsLessThan{4500}, mr, copy, stream, + device_data)); +} + +TEST_F(CUDASanityPredicate, TrueForAnyTrue) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_TRUE(traccc::cuda::true_for_any(IsLessThan{1}, mr, copy, stream, + device_data)); +} + +TEST_F(CUDASanityPredicate, TrueForAnyFalse) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_FALSE(traccc::cuda::true_for_any(IsLessThan{0}, mr, copy, stream, + device_data)); +} + +TEST_F(CUDASanityPredicate, FalseForAllTrue) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_TRUE(traccc::cuda::false_for_all(IsLessThan{0}, mr, copy, stream, + device_data)); +} + +TEST_F(CUDASanityPredicate, FalseForAllFalse) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_FALSE(traccc::cuda::false_for_all(IsLessThan{1}, mr, copy, stream, + device_data)); +} + +TEST_F(CUDASanityPredicate, FalseForAnyTrue) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_TRUE(traccc::cuda::false_for_any(IsLessThan{1}, mr, copy, stream, + device_data)); +} + +TEST_F(CUDASanityPredicate, FalseForAnyFalse) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_FALSE(traccc::cuda::false_for_any(IsLessThan{6000}, mr, copy, stream, + device_data)); +} diff --git a/tests/sycl/CMakeLists.txt b/tests/sycl/CMakeLists.txt index 63dbf9ff43..235d3e34db 100644 --- a/tests/sycl/CMakeLists.txt +++ b/tests/sycl/CMakeLists.txt @@ -22,6 +22,7 @@ traccc_add_test( test_cca.sycl test_sanity_contiguous_on.sycl test_sanity_ordered_on.sycl + test_sanity_predicate.sycl test_sort.sycl LINK_LIBRARIES diff --git a/tests/sycl/test_sanity_predicate.sycl b/tests/sycl/test_sanity_predicate.sycl new file mode 100644 index 0000000000..ad24301462 --- /dev/null +++ b/tests/sycl/test_sanity_predicate.sycl @@ -0,0 +1,147 @@ +/* + * traccc library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// vecmem includes +#include +#include + +// traccc includes +#include + +#include "../../device/sycl/src/sanity/predicate.hpp" + +// GTest include(s). +#include + +struct IsLessThan { + bool operator()(int i) const { return i < m_val; } + + int m_val; +}; + +class SYCLSanityPredicate : public testing::Test { + protected: + SYCLSanityPredicate() : queue_wrapper(&queue), copy(&queue) {} + + vecmem::sycl::device_memory_resource mr; + cl::sycl::queue queue; + traccc::sycl::queue_wrapper queue_wrapper; + vecmem::sycl::async_copy copy; +}; + +TEST_F(SYCLSanityPredicate, TrueForAllTrue) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_TRUE(traccc::sycl::true_for_all(IsLessThan{5001}, mr, copy, + queue_wrapper, device_data)); +} + +TEST_F(SYCLSanityPredicate, TrueForAllFalse) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_FALSE(traccc::sycl::true_for_all(IsLessThan{4500}, mr, copy, + queue_wrapper, device_data)); +} + +TEST_F(SYCLSanityPredicate, TrueForAnyTrue) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_TRUE(traccc::sycl::true_for_any(IsLessThan{1}, mr, copy, + queue_wrapper, device_data)); +} + +TEST_F(SYCLSanityPredicate, TrueForAnyFalse) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_FALSE(traccc::sycl::true_for_any(IsLessThan{0}, mr, copy, + queue_wrapper, device_data)); +} + +TEST_F(SYCLSanityPredicate, FalseForAllTrue) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_TRUE(traccc::sycl::false_for_all(IsLessThan{0}, mr, copy, + queue_wrapper, device_data)); +} + +TEST_F(SYCLSanityPredicate, FalseForAllFalse) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_FALSE(traccc::sycl::false_for_all(IsLessThan{1}, mr, copy, + queue_wrapper, device_data)); +} + +TEST_F(SYCLSanityPredicate, FalseForAnyTrue) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_TRUE(traccc::sycl::false_for_any(IsLessThan{1}, mr, copy, + queue_wrapper, device_data)); +} + +TEST_F(SYCLSanityPredicate, FalseForAnyFalse) { + std::vector host_vector; + + for (int i = 0; i < 5000; ++i) { + host_vector.push_back(i); + } + + auto device_data = copy.to(vecmem::get_data(host_vector), mr, + vecmem::copy::type::host_to_device); + + ASSERT_FALSE(traccc::sycl::false_for_any(IsLessThan{6000}, mr, copy, + queue_wrapper, device_data)); +}