diff --git a/CMakeLists.txt b/CMakeLists.txt index 00fe0d134a2f..fcaae9721f26 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,8 +47,9 @@ endif() option(alpaka_INSTALL_TEST_HEADER "Install headers of the namespace alpaka::test. Attention, headers are not designed for production code, see documentation." OFF) include(CMakeDependentOption) + cmake_dependent_option(alpaka_CHECK_HEADERS "Check all alpaka headers as part of the tests whether they can be compiled standalone." OFF BUILD_TESTING OFF) -cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON BUILD_TESTING OFF) +cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON "BUILD_TESTING OR alpaka_BUILD_BENCHMARKS" OFF) ################################################################################ # Internal variables. diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 3ae15f537d4f..6a8da0ef7a89 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -15,4 +15,9 @@ project("alpakaBenchmarks" LANGUAGES CXX) # Add subdirectories. ################################################################################ +if(NOT BUILD_TESTING) + # Testing is not enabled therefore CATCH2 which is part of common must be pulled. + add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../test/common "${CMAKE_BINARY_DIR}/test/common") +endif() + add_subdirectory("babelstream/") diff --git a/benchmarks/babelstream/CMakeLists.txt b/benchmarks/babelstream/CMakeLists.txt index e2382a1139d8..5deb1a3e2a88 100644 --- a/benchmarks/babelstream/CMakeLists.txt +++ b/benchmarks/babelstream/CMakeLists.txt @@ -19,9 +19,31 @@ if(NOT TARGET alpaka::alpaka) endif() endif() -alpaka_add_executable(${PROJECT_NAME} src/main.cpp src/Stream.h src/AlpakaStream.cpp src/AlpakaStream.h) -target_compile_definitions(${PROJECT_NAME} PUBLIC ALPAKA) -target_link_libraries(${PROJECT_NAME} PUBLIC alpaka::alpaka) -set_target_properties(${PROJECT_NAME} PROPERTIES FOLDER benchmarks) -# add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME}) +set(_TARGET_NAME "babelstream") +append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE) + +alpaka_add_executable( + ${_TARGET_NAME} + ${_FILES_SOURCE}) + +target_include_directories( + ${_TARGET_NAME} + PRIVATE "src") + +target_link_libraries( + ${_TARGET_NAME} + PRIVATE common) + +set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER benchmarks/babelstream) + +#Run as a ctest +if(alpaka_CI) + # Only run for release builds since this is a benchmark + if(CMAKE_BUILD_TYPE STREQUAL "Release") + add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME}) + endif() +else() + # For a normal benchmark test, number of samples should be equal to the default value. 
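+    # Hypothetical example (not part of this change set): a longer benchmarking run could be
+    # registered as an additional test by forwarding the custom flags documented in src/README.md, e.g.
+    #   add_test(NAME ${_TARGET_NAME}_full COMMAND ${_TARGET_NAME} --array-size=33554432 --number-runs=100)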
+ add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME}) +endif() diff --git a/benchmarks/babelstream/src/AlpakaStream.cpp b/benchmarks/babelstream/src/AlpakaStream.cpp deleted file mode 100644 index 9c8f3043ea68..000000000000 --- a/benchmarks/babelstream/src/AlpakaStream.cpp +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code -// -// Cupla version created by Jeff Young in 2021 -// Ported from cupla to alpaka by Bernhard Manfred Gruber in 2022 - -#include "AlpakaStream.h" - -#include - -namespace -{ - constexpr auto blockSize = 1024; - constexpr auto dotBlockSize = 256; -} // namespace - -template -AlpakaStream::AlpakaStream(Idx arraySize, Idx deviceIndex) - : arraySize(arraySize) - , devHost(alpaka::getDevByIdx(platformHost, 0)) - , devAcc(alpaka::getDevByIdx(platformAcc, deviceIndex)) - , sums(alpaka::allocBuf(devHost, dotBlockSize)) - , d_a(alpaka::allocBuf(devAcc, arraySize)) - , d_b(alpaka::allocBuf(devAcc, arraySize)) - , d_c(alpaka::allocBuf(devAcc, arraySize)) - , d_sum(alpaka::allocBuf(devAcc, dotBlockSize)) - , queue(devAcc) -{ - if(arraySize % blockSize != 0) - throw std::runtime_error("Array size must be a multiple of " + std::to_string(blockSize)); - std::cout << "Using alpaka device " << alpaka::getName(devAcc) << std::endl; -} - -struct InitKernel -{ - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA, T initB, T initC) const - { - auto const [i] = alpaka::getIdx(acc); - a[i] = initA; - b[i] = initB; - c[i] = initC; - } -}; - -template -void AlpakaStream::init_arrays(T initA, T initB, T initC) -{ - auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; - - alpaka::exec< - Acc>(queue, workdiv, InitKernel{}, std::data(d_a), std::data(d_b), std::data(d_c), initA, initB, initC); - alpaka::wait(queue); -} - -template -void AlpakaStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) -{ - alpaka::memcpy(queue, alpaka::createView(devHost, a), d_a); - alpaka::memcpy(queue, alpaka::createView(devHost, b), d_b); - alpaka::memcpy(queue, alpaka::createView(devHost, c), d_c); -} - -struct CopyKernel -{ - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* c) const - { - auto const [i] = alpaka::getIdx(acc); - c[i] = a[i]; - } -}; - -template -void AlpakaStream::copy() -{ - auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; - - alpaka::exec(queue, workdiv, CopyKernel{}, std::data(d_a), std::data(d_c)); - alpaka::wait(queue); -} - -struct MulKernel -{ - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T* b, T const* c) const - { - const T scalar = startScalar; - auto const [i] = alpaka::getIdx(acc); - b[i] = scalar * c[i]; - } -}; - -template -void AlpakaStream::mul() -{ - auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; - - alpaka::exec(queue, workdiv, MulKernel{}, std::data(d_b), std::data(d_c)); - alpaka::wait(queue); -} - -struct AddKernel -{ - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const - { - auto const [i] = alpaka::getIdx(acc); - c[i] = a[i] + b[i]; - } -}; - -template -void AlpakaStream::add() -{ - auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; - - alpaka::exec(queue, workdiv, AddKernel{}, std::data(d_a), std::data(d_b), std::data(d_c)); - alpaka::wait(queue); -} - -struct TriadKernel -{ - template - ALPAKA_FN_ACC void 
operator()(TAcc const& acc, T* a, T const* b, T const* c) const - { - const T scalar = startScalar; - auto const [i] = alpaka::getIdx(acc); - a[i] = b[i] + scalar * c[i]; - } -}; - -template -void AlpakaStream::triad() -{ - auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; - - alpaka::exec(queue, workdiv, TriadKernel{}, std::data(d_a), std::data(d_b), std::data(d_c)); - alpaka::wait(queue); -} - -struct NstreamKernel -{ - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const - { - const T scalar = startScalar; - auto const [i] = alpaka::getIdx(acc); - a[i] += b[i] + scalar * c[i]; - } -}; - -template -void AlpakaStream::nstream() -{ - auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; - - alpaka::exec(queue, workdiv, NstreamKernel{}, std::data(d_a), std::data(d_b), std::data(d_c)); - alpaka::wait(queue); -} - -struct DotKernel -{ - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, int arraySize) const - { - // TODO(Jeff Young) - test if sharedMem bug is affecting performance here - auto& tbSum = alpaka::declareSharedVar(acc); - - auto [i] = alpaka::getIdx(acc); - auto const [local_i] = alpaka::getIdx(acc); - auto const [totalThreads] = alpaka::getWorkDiv(acc); - - T threadSum = 0; - for(; i < arraySize; i += totalThreads) // NOLINT(bugprone-infinite-loop) - threadSum += a[i] * b[i]; - tbSum[local_i] = threadSum; - - auto const [blockDim] = alpaka::getWorkDiv(acc); - for(int offset = blockDim / 2; offset > 0; offset /= 2) - { - alpaka::syncBlockThreads(acc); - if(local_i < offset) - tbSum[local_i] += tbSum[local_i + offset]; - } - - auto const [blockIdx] = alpaka::getIdx(acc); - if(local_i == 0) - sum[blockIdx] = tbSum[local_i]; - } -}; - -template -auto AlpakaStream::dot() -> T -{ - auto const workdiv = WorkDiv{dotBlockSize, blockSize, 1}; - alpaka::exec(queue, workdiv, DotKernel{}, std::data(d_a), std::data(d_b), std::data(d_sum), arraySize); - alpaka::wait(queue); - - alpaka::memcpy(queue, sums, d_sum); - T const* sumPtr = std::data(sums); - // TODO(bgruber): replace by std::reduce, when gcc 9.3 is the baseline - return std::accumulate(sumPtr, sumPtr + dotBlockSize, T{0}); -} - -void listDevices() -{ - auto const platform = alpaka::Platform{}; - auto const count = alpaka::getDevCount(platform); - std::cout << "Devices:" << std::endl; - for(int i = 0; i < count; i++) - std::cout << i << ": " << getDeviceName(i) << std::endl; -} - -auto getDeviceName(int deviceIndex) -> std::string -{ - auto const platform = alpaka::Platform{}; - return alpaka::getName(alpaka::getDevByIdx(platform, deviceIndex)); -} - -auto getDeviceDriver([[maybe_unused]] int device) -> std::string -{ - return "Not supported"; -} - -template class AlpakaStream; -template class AlpakaStream; diff --git a/benchmarks/babelstream/src/AlpakaStream.h b/benchmarks/babelstream/src/AlpakaStream.h deleted file mode 100644 index ba556b028dba..000000000000 --- a/benchmarks/babelstream/src/AlpakaStream.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code -// -// Cupla version created by Jeff Young in 2021 -// Ported from cupla to alpaka by Bernhard Manfred Gruber in 2022 - -#pragma once - -#include "Stream.h" - -#include -#include - -#include - -inline constexpr auto IMPLEMENTATION_STRING = "alpaka"; - -using Dim = alpaka::DimInt<1>; -using Idx = int; -using 
Vec = alpaka::Vec; -using Acc = alpaka::ExampleDefaultAcc; - -template -struct AlpakaStream : Stream -{ - AlpakaStream(Idx arraySize, Idx deviceIndex); - - void copy() override; - void add() override; - void mul() override; - void triad() override; - void nstream() override; - auto dot() -> T override; - - void init_arrays(T initA, T initB, T initC) override; - void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - - using PlatformHost = alpaka::PlatformCpu; - using DevHost = alpaka::Dev; - using PlatformAcc = alpaka::Platform; - using DevAcc = alpaka::Dev; - using BufHost = alpaka::Buf; - using BufAcc = alpaka::Buf; - using Queue = alpaka::Queue; - - using WorkDiv = alpaka::WorkDivMembers; - -private: - Idx arraySize; - PlatformHost platformHost; - DevHost devHost; - PlatformAcc platformAcc; - DevAcc devAcc; - BufHost sums; - BufAcc d_a; - BufAcc d_b; - BufAcc d_c; - BufAcc d_sum; - Queue queue; -}; diff --git a/benchmarks/babelstream/src/README.md b/benchmarks/babelstream/src/README.md index 781cdf31039f..cd3eee701166 100644 --- a/benchmarks/babelstream/src/README.md +++ b/benchmarks/babelstream/src/README.md @@ -1,6 +1,101 @@ -This is a port of [BabelStream](https://github.com/UoB-HPC/BabelStream) to alpaka. -This work is based on the [cupla port of BabelStream](https://github.com/jyoung3131/BabelStream) from Jeff Young. -The benchmark driver (`main.cpp` and `Stream.h`) is taken from BabelStream. -No other backends are available, only alpaka. -Thus, there is no need to select a backend, just run the executable. -Please refer to the BabelStream documentation of more information on how to run the benchmark. +This work was initially based on the [cupla port of BabelStream](https://github.com/jyoung3131/BabelStream) from Jeff Young. Then refactored. +The benchmark BabelStream is developed by Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC; based on John D. 
McCalpin's original STREAM benchmark for CPUs.
+Other BabelStream implementations and their documentation are available at https://github.com/UoB-HPC
+
+# Example Run
+The benchmark can be run with custom arguments as well as Catch2 arguments.
+
+With custom arguments:
+    ./babelstream --array-size=1280000 --number-runs=10
+With Catch2 arguments:
+    ./babelstream --success
+With custom and Catch2 arguments together:
+    ./babelstream --success --array-size=1280000 --number-runs=10
+
+# Command for a benchmarking run
+    ./babelstream --array-size=33554432 --number-runs=100
+Output is below:
+
+'''Array size provided: 33554432
+Number of runs provided: 100
+Randomness seeded to: 2775986196
+
+
+AcceleratorType:AccCpuSerial<1,unsigned int>
+NumberOfRuns:100
+Precision:single
+DataSize(items):33554432
+DeviceName:13th Gen Intel(R) Core(TM) i7-1360P
+WorkDivInit :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivAdd :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+Kernels Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB)
+ InitKernel 12.2133 0.0219789 0.0244341 0.0234795 268.435
+ CopyKernel 20.8898 0.01285 0.0141298 0.0130288 268.435
+ MultKernel 20.9943 0.0127861 0.0161767 0.0129707 268.435
+ AddKernel 24.4181 0.01649 0.0178725 0.0166714 402.653
+ TriadKernel 24.44 0.0164751 0.0182611 0.0166579 402.653
+
+
+
+AcceleratorType:AccGpuCudaRt<1,unsigned int>
+NumberOfRuns:100
+Precision:single
+DataSize(items):33554432
+DeviceName:NVIDIA RTX A500 Laptop GPU
+WorkDivInit :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivAdd :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivDot :{gridBlockExtent: (256), blockThreadExtent: (1024), threadElemExtent: (1)}
+Kernels Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB)
+ InitKernel 62.3725 0.00430374 0.00434411 0.00433501 268.435
+ CopyKernel 90.2948 0.00297288 0.00302862 0.00300712 268.435
+ MultKernel 90.3858 0.00296988 0.00302989 0.00300866 268.435
+ AddKernel 90.947 0.00442734 0.00448436 0.00446751 402.653
+ TriadKernel 90.88 0.0044306 0.00447952 0.00446739 402.653
+ DotKernel 93.369 0.002875 0.00291691 0.0029106 268.435
+
+
+
+AcceleratorType:AccCpuSerial<1,unsigned int>
+NumberOfRuns:100
+Precision:double
+DataSize(items):33554432
+DeviceName:13th Gen Intel(R) Core(TM) i7-1360P
+WorkDivInit :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivAdd :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivDot :{gridBlockExtent: (256), blockThreadExtent: (1024), threadElemExtent: (1)}
+Kernels Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB)
+
InitKernel 12.2326 0.0438886 0.0543366 0.0463925 536.871 + CopyKernel 20.8888 0.0257014 0.0272265 0.0260267 536.871 + MultKernel 21.0395 0.0255173 0.0292734 0.0262349 536.871 + AddKernel 24.6628 0.0326527 0.0383083 0.0334047 805.306 + TriadKernel 24.5604 0.0327888 0.0494151 0.0335766 805.306 + + + +AcceleratorType:AccGpuCudaRt<1,unsigned int> +NumberOfRuns:100 +Precision:double +DataSize(items):33554432 +DeviceName:NVIDIA RTX A500 Laptop GPU +WorkDivInit :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)} +WorkDivCopy :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)} +WorkDivMult :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)} +WorkDivAdd :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)} +WorkDivTriad:{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)} +WorkDivDot :{gridBlockExtent: (256), blockThreadExtent: (1024), threadElemExtent: (1)} +Kernels Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB) + InitKernel 62.4307 0.00859947 0.00864104 0.00862767 536.871 + CopyKernel 89.4157 0.00600421 0.00607738 0.00604754 536.871 + MultKernel 89.2831 0.00601313 0.00606791 0.0060488 536.871 + AddKernel 90.5499 0.00889351 0.00895834 0.00893668 805.306 + TriadKernel 90.5685 0.00889168 0.00897055 0.00893744 805.306 + DotKernel 93.2451 0.00575763 0.00581312 0.00579143 536.871 +''' diff --git a/benchmarks/babelstream/src/Stream.h b/benchmarks/babelstream/src/Stream.h deleted file mode 100644 index d4548428f0bb..000000000000 --- a/benchmarks/babelstream/src/Stream.h +++ /dev/null @@ -1,48 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -// NOLINTBEGIN - -#pragma once - -#include -#include - -// Array values -#define startA (0.1) -#define startB (0.2) -#define startC (0.0) -#define startScalar (0.4) - -template -class Stream -{ -public: - virtual ~Stream() - { - } - - // Kernels - // These must be blocking calls - virtual void copy() = 0; - virtual void mul() = 0; - virtual void add() = 0; - virtual void triad() = 0; - virtual void nstream() = 0; - virtual T dot() = 0; - - // Copy memory between host and device - virtual void init_arrays(T initA, T initB, T initC) = 0; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; -}; - -// Implementation specific device functions -void listDevices(void); -std::string getDeviceName(int const); -std::string getDeviceDriver(int const); - -// NOLINTEND diff --git a/benchmarks/babelstream/src/babelStreamCommon.hpp b/benchmarks/babelstream/src/babelStreamCommon.hpp new file mode 100644 index 000000000000..a22f7d032d31 --- /dev/null +++ b/benchmarks/babelstream/src/babelStreamCommon.hpp @@ -0,0 +1,440 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace +{ + // Default array size, can be changed from command line arguments. + // To display cmd line args use ./babelstream --help or -? + // According to tests, 2^25 or larger values are needed for proper benchmarking: + // ./babelstream --array-size=33554432 --number-runs=100 + // To prevent timeouts in CI, a smaller default value is used. + [[maybe_unused]] auto arraySizeMain = 1024 * 1024; + + // Minimum array size to be used. 
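+    // Values passed via --array-size that are not larger than this bound are rejected by
+    // handleCustomArguments() below and the default array size is kept instead.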
+ [[maybe_unused]] constexpr auto minArrSize = 1024 * 128; + + // Scalar value for Mul and Triad kernel parameters. + [[maybe_unused]] constexpr auto scalarVal = 2.0f; + + // Block thread extent for DotKernel test work division parameters. + [[maybe_unused]] constexpr auto blockThreadExtentMain = 1024; + + // Number of runs for each kernel, can be changed by command line arguments. + // At least 100 runs are recommended for good benchmarking. + // To prevent timeouts in CI, a small value is used. + [[maybe_unused]] auto numberOfRuns = 2; + + // Data input value for babelstream. + [[maybe_unused]] constexpr auto valA = 1.0f; + + //! handleCustomArguments Gets custom cmd line arguments from the all arguments. + //! Namely gets --array-size=1234 and --number-runs=1234 and keeps the others which are + //! command line args for Catch2 session. + [[maybe_unused]] static void handleCustomArguments(int& argc, char* argv[]) + { + std::vector newArgv; + newArgv.push_back(argv[0]); // Keep the program name + + for(int i = 1; i < argc; ++i) + { + std::string arg = argv[i]; + if(arg.rfind("--array-size=", 0) == 0) + { + auto const arrSize = std::stoi(arg.substr(13)); // Convert to integer + if(arrSize > minArrSize) + { + arraySizeMain = arrSize; + std::cout << "Array size provided(items): " << arraySizeMain << std::endl; + } + else + { + std::cout << "Too small array size given. Must be at least " << minArrSize << std::endl; + std::cout << "Using default array size(number of items): " << arraySizeMain << std::endl; + } + } + else if(arg.rfind("--number-runs=", 0) == 0) + { + auto const numRuns = std::stoi(arg.substr(14)); // Convert to integer + if(numRuns > 0) + { + numberOfRuns = numRuns; + std::cout << "Number of runs provided: " << numberOfRuns << std::endl; + } + else + { + std::cout << "Using default number of runs: " << numberOfRuns << std::endl; + } + } + else + { + // If it's not a custom argument, keep it for Catch2 + newArgv.push_back(argv[i]); + } + if(arg.rfind("-?", 0) == 0 || arg.rfind("--help", 0) == 0 || arg.rfind("-h", 0) == 0) + { + std::cout << "Usage of custom arguments (arguments which are not Catch2): --array-size=33554432 and " + "--number-runs=100" + << std::endl; + } + } + + // Update argc and argv to exclude custom arguments + argc = static_cast(newArgv.size()); + for(int i = 0; i < argc; ++i) + { + argv[i] = newArgv[static_cast(i)]; + } + } + + //! FuzzyEqual compares two floating-point or integral type values. + //! \tparam T Type of the values to compare. + //! \param a First value to compare. + //! \param b Second value to compare. + //! \return Returns true if the values are approximately equal (for floating-point types) or exactly equal (for + //! integral types). + template + [[maybe_unused]] bool FuzzyEqual(T a, T b) + { + if constexpr(std::is_floating_point_v) + { + return std::fabs(a - b) < std::numeric_limits::epsilon() * static_cast(100.0); + } + else if constexpr(std::is_integral_v) + { + return a == b; + } + else + { + static_assert( + std::is_floating_point_v || std::is_integral_v, + "FuzzyEqual is only supported for integral or floating-point types."); + } + } + + //! Gets the current timestamp and returns it as a string. + //! \return A string representation of the current timestamp in the format "YYYY-MM-DD HH:MM:SS". 
+ [[maybe_unused]] static std::string getCurrentTimestamp() + { + auto now = std::chrono::system_clock::now(); + auto now_c = std::chrono::system_clock::to_time_t(now); + std::stringstream ss; + ss << std::put_time(std::localtime(&now_c), "%Y-%m-%d %X"); + return ss.str(); + } + + //! joinElements Joins the elements of a vector into a string, separated by a specified delimiter. + //! \tparam T Type of the elements in the vector. + //! \param vec The vector of elements to join. + //! \param delim The delimiter to separate the elements in the resulting string. + //! \return A string with the vector elements separated by the specified delimiter. + template + [[maybe_unused]] static std::string joinElements(std::vector const& vec, std::string const& delim) + { + return std::accumulate( + vec.begin(), + vec.end(), + std::string(), + [&delim](std::string const& a, T const& b) + { + std::ostringstream oss; + if(!a.empty()) + oss << a << delim; + oss << std::setprecision(5) << b; + return oss.str(); + }); + } + + //! findMinMax Finds the minimum and maximum elements in a container. + //! \tparam Container The type of the container. + //! \param times The container from which to find the minimum and maximum elements. + //! \return A pair containing the minimum and maximum values in the container. + //! \note The first element is omitted if the container size is larger than 1, as the result is used in time + //! measurement for benchmarking. + template + [[maybe_unused]] static auto findMinMax(Container const& times) + -> std::pair + { + if(times.empty()) + return std::make_pair(typename Container::value_type{}, typename Container::value_type{}); + + // Default to min and max being the same element for single element containers + auto minValue = *std::min_element(times.begin(), times.end()); + auto maxValue = minValue; + + if(times.size() > 1) + { + // Calculate min and max ignoring the first element + minValue = *std::min_element(times.begin() + 1, times.end()); + maxValue = *std::max_element(times.begin() + 1, times.end()); + } + + return std::make_pair(minValue, maxValue); + } + + //! findAverage Calculates the average value of elements in a container, does not take into account the first one. + //! \tparam Container The type of the container. + //! \param elements The container from which to calculate the average. + //! \return The average value of the elements in the container without considering the first element. + template + [[maybe_unused]] static auto findAverage(Container const& elements) -> typename Container::value_type + { + if(elements.empty()) + return typename Container::value_type{}; + + if(elements.size() == 1) + return elements.front(); // Only one element, return it as the average + + // Calculate the sum of the elements, start from the second one + auto sum = std::accumulate(elements.begin() + 1, elements.end(), typename Container::value_type{}); + + // Calculate and return the average, take into account that one element is not used + return sum / static_cast(elements.size() - 1); + } + + //! Enum class representing benchmark information data types. + enum class BMInfoDataType + { + AcceleratorType, + TimeStamp, + NumRuns, + DataSize, + DataType, + WorkDivInit, + WorkDivCopy, + WorkDivAdd, + WorkDivTriad, + WorkDivMult, + WorkDivDot, + DeviceName, + TimeUnit, + KernelNames, + KernelBandwidths, + KernelDataUsageValues, + KernelMinTimes, + KernelMaxTimes, + KernelAvgTimes + }; + + //! typeToTypeStr Converts BMInfoDataType enum values to their corresponding string representations. 
+ //! \param item The BMInfoDataType enum type value to convert to a more explicit string with units. + //! \return A string representation of the given BMInfoDataType enum value. +#if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wswitch-default" +# pragma clang diagnostic ignored "-Wcovered-switch-default" +#endif + static std::string typeToTypeStr(BMInfoDataType item) + { + switch(item) + { + case BMInfoDataType::AcceleratorType: + return "AcceleratorType"; + case BMInfoDataType::TimeStamp: + return "TimeStamp"; + case BMInfoDataType::NumRuns: + return "NumberOfRuns"; + case BMInfoDataType::DataSize: + return "DataSize(items)"; + case BMInfoDataType::DataType: + return "Precision"; + case BMInfoDataType::DeviceName: + return "DeviceName"; + case BMInfoDataType::TimeUnit: + return "TimeUnitForXMLReport"; + case BMInfoDataType::KernelNames: + return "Kernels"; + case BMInfoDataType::KernelDataUsageValues: + return "DataUsage(MB)"; + case BMInfoDataType::KernelBandwidths: + return "Bandwidths(GB/s)"; + case BMInfoDataType::KernelMinTimes: + return "MinTime(s)"; + case BMInfoDataType::KernelMaxTimes: + return "MaxTime(s)"; + case BMInfoDataType::KernelAvgTimes: + return "AvgTime(s)"; + case BMInfoDataType::WorkDivInit: + return "WorkDivInit "; + case BMInfoDataType::WorkDivCopy: + return "WorkDivCopy "; + case BMInfoDataType::WorkDivAdd: + return "WorkDivAdd "; + case BMInfoDataType::WorkDivTriad: + return "WorkDivTriad"; + case BMInfoDataType::WorkDivMult: + return "WorkDivMult "; + case BMInfoDataType::WorkDivDot: + return "WorkDivDot "; + default: + return ""; + } + } +#if defined(__clang__) +# pragma clang diagnostic pop +#endif + //! getDataThroughput Calculates the data throughput for processing the entire array. + //! \tparam DataType The type of the data. + //! \tparam T The type of the parameters. + //! \param readsWrites The number of read/write operations. + //! \param arraySize The size of the array. + //! \return The calculated data throughput in MB. + template + [[maybe_unused]] static double getDataThroughput(T readsWrites, T arraySize) + { + auto throughput = readsWrites * sizeof(DataType) * arraySize; + // convert to MB (not MiB) + return static_cast(throughput) * 1.0E-6; + } + + //! calculateBandwidth Calculates the bandwidth in GB/sec. + //! \tparam T The type of bytesReadWriteMB. + //! \tparam U The type of runTimeSeconds (e.g., double). + //! \param bytesReadWriteMB The amount of data read/write in MB. + //! \param runTimeSeconds The runtime in seconds. + //! \return The calculated bandwidth in GB/sec. + template + [[maybe_unused]] static double calculateBandwidth(T bytesReadWriteMB, U runTimeSeconds) + { + // Divide by 1.0E+3 to convert from MB to GB (not GiB) + auto bytesReadWriteGB = static_cast(bytesReadWriteMB) * (1.0E-3); + return bytesReadWriteGB / static_cast(runTimeSeconds); + } + + //! MetaData class to store and serialize benchmark information. + //! \details The MetaData class includes a single map to keep all benchmark information and provides serialization + //! methods for generating output. + class MetaData + { + public: + //! setItem Sets an item in the metadata map. + //! \tparam T The type of the value to store. + //! \param key The BMInfoDataType key. + //! \param value The value to store associated with the key. + template + [[maybe_unused]] void setItem(BMInfoDataType key, T const& value) + { + std::ostringstream oss; + oss << value; + metaDataMap[key] = oss.str(); + } + + //! 
serialize Serializes the entire metadata to a string. + //! \return A string containing the serialized metadata. + //! \details This is standard serialization and produces output that can be post-processed easily. + [[maybe_unused]] std::string serialize() const + { + std::stringstream ss; + for(auto const& pair : metaDataMap) + { + ss << "\n" << typeToTypeStr(pair.first) << ":" << pair.second; + } + return ss.str(); + } + + //! serializeAsTable Serializes the metadata into a more structured format for easy visual inspection. + //! \return A string containing the serialized metadata as a table. + //! \details The method first serializes general information, then creates a summary as a table where each row + //! represents a kernel. + [[maybe_unused]] std::string serializeAsTable() const + { + std::stringstream ss; + // define lambda to add values to a string stream created already + auto addItemValue = [&, this](BMInfoDataType item) { + ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item); + }; + + // Initially chose some data to serialize + ss << "\n"; + addItemValue(BMInfoDataType::AcceleratorType); + addItemValue(BMInfoDataType::NumRuns); + addItemValue(BMInfoDataType::DataType); + addItemValue(BMInfoDataType::DataSize); + addItemValue(BMInfoDataType::DeviceName); + addItemValue(BMInfoDataType::WorkDivInit); + addItemValue(BMInfoDataType::WorkDivCopy); + addItemValue(BMInfoDataType::WorkDivMult); + addItemValue(BMInfoDataType::WorkDivAdd); + addItemValue(BMInfoDataType::WorkDivTriad); + if(metaDataMap.count(BMInfoDataType::WorkDivDot) != 0) + addItemValue(BMInfoDataType::WorkDivDot); + + auto getItemFromStrList = [this](BMInfoDataType item, int index) -> std::string + { + std::string const str = metaDataMap.at(item); + + if(index < 1) + { + throw std::invalid_argument("Index must be 1 or greater."); + } + + std::istringstream iss(str); + std::string token; + int current_index = 1; // Start at 1 for 1-based indexing + + // Using ", " as the delimiter, we handle the token extraction manually + while(std::getline(iss, token, ',')) + { + // Remove any leading spaces that may be left by `getline` + size_t start = token.find_first_not_of(' '); + if(start != std::string::npos) + { + token = token.substr(start); + } + + if(current_index == index) + { + return token; + } + ++current_index; + } + + throw std::out_of_range("Index out of range"); + }; + + // Prepare Table + // Table column names + ss << std::endl; + ss << std::left << std::setw(15) << typeToTypeStr(BMInfoDataType::KernelNames) << " " << std::left + << std::setw(15) << typeToTypeStr(BMInfoDataType::KernelBandwidths) << " " << std::left << std::setw(10) + << typeToTypeStr(BMInfoDataType::KernelMinTimes) << " " << std::left << std::setw(10) + << typeToTypeStr(BMInfoDataType::KernelMaxTimes) << " " << std::left << std::setw(10) + << typeToTypeStr(BMInfoDataType::KernelAvgTimes) << " " << std::left << std::setw(6) + << typeToTypeStr(BMInfoDataType::KernelDataUsageValues) << " "; + ss << std::endl; + auto const kernelNamesStr = metaDataMap.at(BMInfoDataType::KernelNames); + auto numberOfKernels = std::count(kernelNamesStr.begin(), kernelNamesStr.end(), ',') + 1; + + // Table rows. 
Print test results for each kernel line by line + for(auto i = 1; i <= numberOfKernels; i++) + { + // Print the row for the kernel i + ss << " " << std::left << std::setw(15) << getItemFromStrList(BMInfoDataType::KernelNames, i) << " "; + ss << std::left << std::setw(15) << getItemFromStrList(BMInfoDataType::KernelBandwidths, i) << " "; + ss << std::left << std::setw(8) << getItemFromStrList(BMInfoDataType::KernelMinTimes, i) << " "; + ss << std::left << std::setw(8) << getItemFromStrList(BMInfoDataType::KernelMaxTimes, i) << " "; + ss << std::left << std::setw(8) << getItemFromStrList(BMInfoDataType::KernelAvgTimes, i) << " "; + ss << std::left << std::setw(6) << getItemFromStrList(BMInfoDataType::KernelDataUsageValues, i) << " " + << std::endl; + } + + return ss.str(); + } + + private: + std::map metaDataMap; + }; +} // namespace diff --git a/benchmarks/babelstream/src/babelStreamMainTest.cpp b/benchmarks/babelstream/src/babelStreamMainTest.cpp new file mode 100644 index 000000000000..79ec62165084 --- /dev/null +++ b/benchmarks/babelstream/src/babelStreamMainTest.cpp @@ -0,0 +1,478 @@ + +#include "babelStreamCommon.hpp" +#include "catch2/catch_session.hpp" + +#include +#include + +#include +#include +#include + +#include + +/** + * Babelstream benchmarking example. Babelstream has 5 kernels. Add, Multiply, Copy, Triad and Dot. + * Babelstream is a memory-bound benchmark since the main operation in the kernels has high Code Balance (bytes/FLOP) + * value. For example c[i] = a[i] + b[i]; has 2 reads 1 writes and has one FLOP operation. For double precision each + * read-write is 8 bytes. Hence Code Balance (3*8 / 1) = 24 bytes/FLOP. + * + * Some implementations and the documents are accessible through https://github.com/UoB-HPC + * + * Can be run with custom arguments as well as catch2 arguments + * Run with Custom arguments: + * ./babelstream --array-size=33554432 --number-runs=100 + * Runt with default array size and num runs: + * ./babelstream + * Run with Catch2 arguments and defaul arrary size and num runs: + * ./babelstream --success + * ./babelstream -r a.xml + * Run with Custom and catch2 arguments together: + * ./babelstream --success --array-size=1280000 --number-runs=10 + * Help to list custom and catch2 arguments + * ./babelstream -? + * ./babelstream --help + * According to tests, 2^25 or larger data size values are needed for proper benchmarking: + * ./babelstream --array-size=33554432 --number-runs=100 + */ + +// Main function that integrates Catch2 and custom argument handling +int main(int argc, char* argv[]) +{ + // Handle custom arguments + handleCustomArguments(argc, argv); + + // Initialize Catch2 and pass the command-line arguments to it + int result = Catch::Session().run(argc, argv); + + // Return the result of the tests + return result; +} + +//! Initialization kernel +struct InitKernel +{ + //! The kernel entry point + //! \tparam TAcc The accelerator environment to be executed on. + //! \tparam T The data type + //! \param acc The accelerator to be executed on. + //! \param a Pointer for vector a + //! \param initA the value to set all items in the vector + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA) const + { + auto const [i] = alpaka::getIdx(acc); + a[i] = initA; + b[i] = static_cast(0.0); + c[i] = static_cast(0.0); + } +}; + +//! Vector copying kernel +struct CopyKernel +{ + //! The kernel entry point + //! \tparam TAcc The accelerator environment to be executed on. + //! \tparam T The data type + //! 
\param acc The accelerator to be executed on. + //! \param a Pointer for vector a + //! \param b Pointer for vector b + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* b) const + { + auto const [index] = alpaka::getIdx(acc); + b[index] = a[index]; + } +}; + +//! Kernel multiplies the vector with a scalar, scaling or multiplication kernel +struct MultKernel +{ + //! The kernel entry point + //! \tparam TAcc The accelerator environment to be executed on. + //! \tparam T The data type + //! \param acc The accelerator to be executed on. + //! \param a Pointer for vector a + //! \param b Pointer for result vector b + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const a, T* b) const + { + const T scalar = static_cast(scalarVal); + auto const [i] = alpaka::getIdx(acc); + b[i] = scalar * a[i]; + } +}; + +//! Vector summation kernel +struct AddKernel +{ + //! The kernel entry point + //! \tparam TAcc The accelerator environment to be executed on. + //! \tparam T The data type + //! \param acc The accelerator to be executed on. + //! \param a Pointer for vector a + //! \param b Pointer for vector b + //! \param c Pointer for result vector c + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const + { + auto const [i] = alpaka::getIdx(acc); + c[i] = a[i] + b[i]; + } +}; + +//! Kernel to find the linear combination of 2 vectors by initially scaling one of them +struct TriadKernel +{ + //! The kernel entry point + //! \tparam TAcc The accelerator environment to be executed on. + //! \tparam T The data type + //! \param acc The accelerator to be executed on. + //! \param a Pointer for vector a + //! \param b Pointer for vector b + //! \param c Pointer for result vector c + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const + { + const T scalar = static_cast(scalarVal); + auto const [i] = alpaka::getIdx(acc); + c[i] = a[i] + scalar * b[i]; + } +}; + +//! Dot product of two vectors. The result is not a scalar but a vector of block-level dot products. For the +//! BabelStream implementation and documentation: https://github.com/UoB-HPC +struct DotKernel +{ + //! The kernel entry point + //! \tparam TAcc The accelerator environment to be executed on. + //! \tparam T The data type + //! \param acc The accelerator to be executed on. + //! \param a Pointer for vector a + //! \param b Pointer for vector b + //! \param sum Pointer for result vector consisting sums for each block + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, alpaka::Idx arraySize) const + { + using Idx = alpaka::Idx; + auto& tbSum = alpaka::declareSharedVar(acc); + + auto i = alpaka::getIdx(acc)[0]; + auto const local_i = alpaka::getIdx(acc)[0]; + auto const totalThreads = alpaka::getWorkDiv(acc)[0]; + + T threadSum = 0; + for(; i < arraySize; i += totalThreads) + threadSum += a[i] * b[i]; + tbSum[local_i] = threadSum; + + auto const blockSize = alpaka::getWorkDiv(acc)[0]; + for(Idx offset = blockSize / 2; offset > 0; offset /= 2) + { + alpaka::syncBlockThreads(acc); + if(local_i < offset) + tbSum[local_i] += tbSum[local_i + offset]; + } + + auto const gridBlockIndex = alpaka::getIdx(acc)[0]; + if(local_i == 0) + sum[gridBlockIndex] = tbSum[local_i]; + } +}; + +//! \brief The Function for testing babelstream kernels for given Acc type and data type. +//! \tparam TAcc the accelerator type +//! \tparam DataType The data type to differentiate single or double data type based tests. 
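+//! \note Every kernel is launched numberOfRuns times and each launch is timed individually; the first
+//! measurement is treated as a warm-up and excluded from the min/max/average statistics
+//! (see findMinMax and findAverage in babelStreamCommon.hpp).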
+template +void testKernels() +{ + using Acc = TAcc; + // Define the index domain + // Set the number of dimensions as an integral constant. Set to 1 for 1D. + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + + // Meta data + // A MetaData class instance to keep the problem and results to print later + MetaData metaData; + std::string dataTypeStr; + if(std::is_same::value) + { + dataTypeStr = "single"; + } + else if(std::is_same::value) + { + dataTypeStr = "double"; + } + + using QueueAcc = alpaka::Queue; + + // Select a device + auto const platform = alpaka::Platform{}; + auto const devAcc = alpaka::getDevByIdx(platform, 0); + + // Create a queue on the device + QueueAcc queue(devAcc); + + // Get the host device for allocating memory on the host. + auto const platformHost = alpaka::PlatformCpu{}; + auto const devHost = alpaka::getDevByIdx(platformHost, 0); + + // Create vectors + Idx arraySize = static_cast(arraySizeMain); + + // Acc buffers + auto bufAccInputA = alpaka::allocBuf(devAcc, arraySize); + auto bufAccInputB = alpaka::allocBuf(devAcc, arraySize); + auto bufAccOutputC = alpaka::allocBuf(devAcc, arraySize); + + // Host buffer as the result + auto bufHostOutputA = alpaka::allocBuf(devHost, arraySize); + auto bufHostOutputB = alpaka::allocBuf(devHost, arraySize); + auto bufHostOutputC = alpaka::allocBuf(devHost, arraySize); + + // Grid size and elems per thread will be used to get the work division + using Vec = alpaka::Vec; + auto const elementsPerThread = Vec::all(static_cast(1)); + auto const elementsPerGrid = Vec::all(arraySize); + + // Create pointer variables for buffer access + auto bufAccInputAPtr = std::data(bufAccInputA); + auto bufAccInputBPtr = std::data(bufAccInputB); + auto bufAccOutputCPtr = std::data(bufAccOutputC); + + // Bind gridsize and elements per thread together + alpaka::KernelCfg const kernelCfg = {elementsPerGrid, elementsPerThread}; + // Let alpaka calculate good work division (namely the block and grid sizes) given our full problem extent + auto const workDivInit = alpaka::getValidWorkDiv( + kernelCfg, + devAcc, + InitKernel(), + bufAccInputAPtr, + bufAccInputBPtr, + bufAccOutputCPtr, + static_cast(valA)); + auto const workDivCopy + = alpaka::getValidWorkDiv(kernelCfg, devAcc, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); + auto const workDivMult + = alpaka::getValidWorkDiv(kernelCfg, devAcc, MultKernel(), bufAccInputAPtr, bufAccInputBPtr); + auto const workDivAdd + = alpaka::getValidWorkDiv(kernelCfg, devAcc, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); + + auto const workDivTriad = alpaka::getValidWorkDiv( + kernelCfg, + devAcc, + TriadKernel(), + bufAccInputAPtr, + bufAccInputBPtr, + bufAccOutputCPtr); + + // Vector of average run-times of babelstream kernels + std::vector avgExecTimesOfKernels; + std::vector minExecTimesOfKernels; + std::vector maxExecTimesOfKernels; + std::vector kernelLabels; + // Vector for collecting successive run-times of a single kernel in benchmark macro + std::vector times; + + // Lambda for measuring run-time + auto measureKernelExec = [&](auto&& kernelFunc, [[maybe_unused]] auto&& kernelLabel) + { + for(auto i = 0; i < numberOfRuns; i++) + { + double runtime = 0.0; + auto start = std::chrono::high_resolution_clock::now(); + kernelFunc(); + alpaka::wait(queue); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = end - start; + runtime = duration.count(); + times.push_back(runtime); + } + + // find the minimum of the durations array. 
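+        // Note: the first measurement typically includes one-time setup costs (e.g. lazy device or
+        // runtime initialization), which is why it is treated as a warm-up run.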
+ // In benchmarking the first item of the runtimes array is not included in calculations. + const auto minmaxPair = findMinMax(times); + minExecTimesOfKernels.push_back(minmaxPair.first); + maxExecTimesOfKernels.push_back(minmaxPair.second); + avgExecTimesOfKernels.push_back(findAverage(times)); + kernelLabels.push_back(kernelLabel); + times.clear(); + }; + + // Run kernels one by one + // Test the init-kernel. + measureKernelExec( + [&]() + { + alpaka::exec( + queue, + workDivInit, + InitKernel(), + bufAccInputAPtr, + bufAccInputBPtr, + bufAccOutputCPtr, + static_cast(valA)); + }, + "InitKernel"); + + // Test the copy-kernel. Copy A one by one to B. + measureKernelExec( + [&]() { alpaka::exec(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); }, + "CopyKernel"); + + // Test the scaling-kernel. Calculate B=scalar*A. + measureKernelExec( + [&]() { alpaka::exec(queue, workDivMult, MultKernel(), bufAccInputAPtr, bufAccInputBPtr); }, + "MultKernel"); + + // Test the addition-kernel. Calculate C=A+B. Where B=scalar*A. + measureKernelExec( + [&]() + { alpaka::exec(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); }, + "AddKernel"); + + // Test the Triad-kernel. Calculate C=A+scalar*B where B=scalar*A. + measureKernelExec( + [&]() + { alpaka::exec(queue, workDivTriad, TriadKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); }, + "TriadKernel"); + + + // Copy arrays back to host + alpaka::memcpy(queue, bufHostOutputC, bufAccOutputC, arraySize); + alpaka::memcpy(queue, bufHostOutputB, bufAccInputB, arraySize); + alpaka::memcpy(queue, bufHostOutputA, bufAccInputA, arraySize); + + // Verify the results + // + // Find sum of the errors as sum of the differences from expected values + DataType initVal{static_cast(0.0)}; + DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal}; + + auto const expectedC = static_cast(valA + scalarVal * scalarVal * valA); + auto const expectedB = static_cast(scalarVal * valA); + auto const expectedA = static_cast(valA); + + // sum of the errors for each array + for(Idx i = 0; i < arraySize; ++i) + { + sumErrC += bufHostOutputC[static_cast(i)] - expectedC; + sumErrB += bufHostOutputB[static_cast(i)] - expectedB; + sumErrA += bufHostOutputA[static_cast(i)] - expectedA; + } + + // Normalize and compare sum of the errors + REQUIRE(FuzzyEqual(sumErrC / static_cast(arraySize) / expectedC, static_cast(0.0))); + REQUIRE(FuzzyEqual(sumErrB / static_cast(arraySize) / expectedB, static_cast(0.0))); + REQUIRE(FuzzyEqual(sumErrA / static_cast(arraySize) / expectedA, static_cast(0.0))); + alpaka::wait(queue); + + // Test Dot kernel with specific blocksize which is larger than 1 + if constexpr(alpaka::accMatchesTags) + { + using WorkDiv = alpaka::WorkDivMembers; + // Threads per block for Dot kernel + constexpr Idx blockThreadExtent = blockThreadExtentMain; + // Blocks per grid for Dot kernel + constexpr Idx gridBlockExtent = static_cast(256); + // Vector of sums of each block + auto bufAccSumPerBlock = alpaka::allocBuf(devAcc, gridBlockExtent); + auto bufHostSumPerBlock = alpaka::allocBuf(devHost, gridBlockExtent); + // A specific work-division is used for dotKernel + auto const workDivDot = WorkDiv{Vec{gridBlockExtent}, Vec{blockThreadExtent}, Vec::all(1)}; + + measureKernelExec( + [&]() + { + alpaka::exec( + queue, + workDivDot, + DotKernel(), // Dot kernel + alpaka::getPtrNative(bufAccInputA), + alpaka::getPtrNative(bufAccInputB), + alpaka::getPtrNative(bufAccSumPerBlock), + static_cast>(arraySize)); + }, + 
"DotKernel"); + + alpaka::memcpy(queue, bufHostSumPerBlock, bufAccSumPerBlock, gridBlockExtent); + alpaka::wait(queue); + + DataType const* sumPtr = std::data(bufHostSumPerBlock); + auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0}); + // Since vector values are 1, dot product should be identical to arraySize + REQUIRE(FuzzyEqual(static_cast(result), static_cast(arraySize * 2))); + // Add workdiv to the list of workdivs to print later + metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot); + } + + + // + // Calculate and Display Benchmark Results + // + std::vector bytesReadWriteMB = { + getDataThroughput(2u, static_cast(arraySize)), + getDataThroughput(2u, static_cast(arraySize)), + getDataThroughput(2u, static_cast(arraySize)), + getDataThroughput(3u, static_cast(arraySize)), + getDataThroughput(3u, static_cast(arraySize)), + getDataThroughput(2u, static_cast(arraySize)), + }; + + // calculate the bandwidth as throughput per seconds + std::vector bandwidthsPerKernel; + if(minExecTimesOfKernels.size() == kernelLabels.size()) + { + for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i) + { + bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i))); + } + } + + // Setting fields of Benchmark Info map. All information about benchmark and results are stored in a single map + metaData.setItem(BMInfoDataType::TimeStamp, getCurrentTimestamp()); + metaData.setItem(BMInfoDataType::NumRuns, std::to_string(numberOfRuns)); + metaData.setItem(BMInfoDataType::DataSize, std::to_string(arraySizeMain)); + metaData.setItem(BMInfoDataType::DataType, dataTypeStr); + + metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit); + metaData.setItem(BMInfoDataType::WorkDivCopy, workDivCopy); + metaData.setItem(BMInfoDataType::WorkDivAdd, workDivAdd); + metaData.setItem(BMInfoDataType::WorkDivMult, workDivMult); + metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad); + + // Device and accelerator + metaData.setItem(BMInfoDataType::DeviceName, alpaka::getName(devAcc)); + metaData.setItem(BMInfoDataType::AcceleratorType, alpaka::getAccName()); + // XML reporter of catch2 always converts to Nano Seconds + metaData.setItem(BMInfoDataType::TimeUnit, "Nano Seconds"); + // Join elements and create a comma separated string + metaData.setItem(BMInfoDataType::KernelNames, joinElements(kernelLabels, ", ")); + metaData.setItem(BMInfoDataType::KernelDataUsageValues, joinElements(bytesReadWriteMB, ", ")); + metaData.setItem(BMInfoDataType::KernelBandwidths, joinElements(bandwidthsPerKernel, ", ")); + metaData.setItem(BMInfoDataType::KernelMinTimes, joinElements(minExecTimesOfKernels, ", ")); + metaData.setItem(BMInfoDataType::KernelMaxTimes, joinElements(maxExecTimesOfKernels, ", ")); + metaData.setItem(BMInfoDataType::KernelAvgTimes, joinElements(avgExecTimesOfKernels, ", ")); + + // Print the summary as a table, if a standard serialization is needed other functions of the class can be used + std::cout << metaData.serializeAsTable() << std::endl; +} + +using TestAccs1D = alpaka::test::EnabledAccs, std::uint32_t>; + +// Run for all Accs given by the argument +TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels", "[benchmark-test]", TestAccs1D) +{ + using Acc = TestType; + // Run tests for the float data type + testKernels(); +} + +// Run for all Accs given by the argument +TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels", "[benchmark-test]", TestAccs1D) +{ + using Acc = TestType; + // Run tests for the double data type + 
testKernels(); +} diff --git a/benchmarks/babelstream/src/main.cpp b/benchmarks/babelstream/src/main.cpp deleted file mode 100644 index acef1c33a60c..000000000000 --- a/benchmarks/babelstream/src/main.cpp +++ /dev/null @@ -1,588 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -// NOLINTBEGIN - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define VERSION_STRING "4.0" - -#include "Stream.h" - -#if defined(CUDA) -# include "CUDAStream.h" -#elif defined(STD_DATA) -# include "STDDataStream.h" -#elif defined(STD_INDICES) -# include "STDIndicesStream.h" -#elif defined(STD_RANGES) -# include "STDRangesStream.hpp" -#elif defined(TBB) -# include "TBBStream.hpp" -#elif defined(THRUST) -# include "ThrustStream.h" -#elif defined(HIP) -# include "HIPStream.h" -#elif defined(HC) -# include "HCStream.h" -#elif defined(OCL) -# include "OCLStream.h" -#elif defined(USE_RAJA) -# include "RAJAStream.hpp" -#elif defined(KOKKOS) -# include "KokkosStream.hpp" -#elif defined(ACC) -# include "ACCStream.h" -#elif defined(SYCL) -# include "SYCLStream.h" -#elif defined(SYCL2020) -# include "SYCLStream2020.h" -#elif defined(OMP) -# include "OMPStream.h" -#elif defined(ALPAKA) -# include "AlpakaStream.h" -#endif - -// Default size of 2^25 -int ARRAY_SIZE = 33'554'432; -unsigned int num_times = 100; -unsigned int deviceIndex = 0; -bool use_float = false; -bool output_as_csv = false; -bool mibibytes = false; -std::string csv_separator = ","; - -template -void check_solution(unsigned int const ntimes, std::vector& a, std::vector& b, std::vector& c, T& sum); - -template -void run(); - -// Options for running the benchmark: -// - All 5 kernels (Copy, Add, Mul, Triad, Dot). -// - Triad only. -// - Nstream only. -enum class Benchmark -{ - All, - Triad, - Nstream -}; - -// Selected run options. 
-Benchmark selection = Benchmark::All; - -void parseArguments(int argc, char* argv[]); - -int main(int argc, char* argv[]) -{ - parseArguments(argc, argv); - - if(!output_as_csv) - { - std::cout << "BabelStream" << std::endl - << "Version: " << VERSION_STRING << std::endl - << "Implementation: " << IMPLEMENTATION_STRING << std::endl; - } - - if(use_float) - run(); - else - run(); -} - -// Run the 5 main kernels -template -std::vector> run_all(Stream* stream, T& sum) -{ - // List of times - std::vector> timings(5); - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Main loop - for(unsigned int k = 0; k < num_times; k++) - { - // Execute Copy - t1 = std::chrono::high_resolution_clock::now(); - stream->copy(); - t2 = std::chrono::high_resolution_clock::now(); - timings[0].push_back(std::chrono::duration_cast>(t2 - t1).count()); - - // Execute Mul - t1 = std::chrono::high_resolution_clock::now(); - stream->mul(); - t2 = std::chrono::high_resolution_clock::now(); - timings[1].push_back(std::chrono::duration_cast>(t2 - t1).count()); - - // Execute Add - t1 = std::chrono::high_resolution_clock::now(); - stream->add(); - t2 = std::chrono::high_resolution_clock::now(); - timings[2].push_back(std::chrono::duration_cast>(t2 - t1).count()); - - // Execute Triad - t1 = std::chrono::high_resolution_clock::now(); - stream->triad(); - t2 = std::chrono::high_resolution_clock::now(); - timings[3].push_back(std::chrono::duration_cast>(t2 - t1).count()); - - // Execute Dot - t1 = std::chrono::high_resolution_clock::now(); - sum = stream->dot(); - t2 = std::chrono::high_resolution_clock::now(); - timings[4].push_back(std::chrono::duration_cast>(t2 - t1).count()); - } - - // Compiler should use a move - return timings; -} - -// Run the Triad kernel -template -std::vector> run_triad(Stream* stream) -{ - std::vector> timings(1); - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Run triad in loop - t1 = std::chrono::high_resolution_clock::now(); - for(unsigned int k = 0; k < num_times; k++) - { - stream->triad(); - } - t2 = std::chrono::high_resolution_clock::now(); - - double runtime = std::chrono::duration_cast>(t2 - t1).count(); - timings[0].push_back(runtime); - - return timings; -} - -// Run the Nstream kernel -template -std::vector> run_nstream(Stream* stream) -{ - std::vector> timings(1); - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Run nstream in loop - for(int k = 0; k < num_times; k++) - { - t1 = std::chrono::high_resolution_clock::now(); - stream->nstream(); - t2 = std::chrono::high_resolution_clock::now(); - timings[0].push_back(std::chrono::duration_cast>(t2 - t1).count()); - } - - return timings; -} - -// Generic run routine -// Runs the kernel(s) and prints output. 
-template -void run() -{ - std::streamsize ss = std::cout.precision(); - - if(!output_as_csv) - { - if(selection == Benchmark::All) - std::cout << "Running kernels " << num_times << " times" << std::endl; - else if(selection == Benchmark::Triad) - { - std::cout << "Running triad " << num_times << " times" << std::endl; - std::cout << "Number of elements: " << ARRAY_SIZE << std::endl; - } - - - if(sizeof(T) == sizeof(float)) - std::cout << "Precision: float" << std::endl; - else - std::cout << "Precision: double" << std::endl; - - - if(mibibytes) - { - // MiB = 2^20 - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE * sizeof(T) * pow(2.0, -20.0) << " MiB" - << " (=" << ARRAY_SIZE * sizeof(T) * pow(2.0, -30.0) << " GiB)" << std::endl; - std::cout << "Total size: " << 3.0 * ARRAY_SIZE * sizeof(T) * pow(2.0, -20.0) << " MiB" - << " (=" << 3.0 * ARRAY_SIZE * sizeof(T) * pow(2.0, -30.0) << " GiB)" << std::endl; - } - else - { - // MB = 10^6 - std::cout << std::setprecision(1) << std::fixed << "Array size: " << ARRAY_SIZE * sizeof(T) * 1.0E-6 - << " MB" - << " (=" << ARRAY_SIZE * sizeof(T) * 1.0E-9 << " GB)" << std::endl; - std::cout << "Total size: " << 3.0 * ARRAY_SIZE * sizeof(T) * 1.0E-6 << " MB" - << " (=" << 3.0 * ARRAY_SIZE * sizeof(T) * 1.0E-9 << " GB)" << std::endl; - } - std::cout.precision(ss); - } - - Stream* stream; - -#if defined(CUDA) - // Use the CUDA implementation - stream = new CUDAStream(ARRAY_SIZE, deviceIndex); - -#elif defined(HIP) - // Use the HIP implementation - stream = new HIPStream(ARRAY_SIZE, deviceIndex); - -#elif defined(HC) - // Use the HC implementation - stream = new HCStream(ARRAY_SIZE, deviceIndex); - -#elif defined(OCL) - // Use the OpenCL implementation - stream = new OCLStream(ARRAY_SIZE, deviceIndex); - -#elif defined(USE_RAJA) - // Use the RAJA implementation - stream = new RAJAStream(ARRAY_SIZE, deviceIndex); - -#elif defined(KOKKOS) - // Use the Kokkos implementation - stream = new KokkosStream(ARRAY_SIZE, deviceIndex); - -#elif defined(STD_DATA) - // Use the C++ STD data-oriented implementation - stream = new STDDataStream(ARRAY_SIZE, deviceIndex); - -#elif defined(STD_INDICES) - // Use the C++ STD index-oriented implementation - stream = new STDIndicesStream(ARRAY_SIZE, deviceIndex); - -#elif defined(STD_RANGES) - // Use the C++ STD ranges implementation - stream = new STDRangesStream(ARRAY_SIZE, deviceIndex); - -#elif defined(TBB) - // Use the C++20 implementation - stream = new TBBStream(ARRAY_SIZE, deviceIndex); - -#elif defined(THRUST) - // Use the Thrust implementation - stream = new ThrustStream(ARRAY_SIZE, deviceIndex); - -#elif defined(ACC) - // Use the OpenACC implementation - stream = new ACCStream(ARRAY_SIZE, deviceIndex); - -#elif defined(SYCL) || defined(SYCL2020) - // Use the SYCL implementation - stream = new SYCLStream(ARRAY_SIZE, deviceIndex); - -#elif defined(OMP) - // Use the OpenMP implementation - stream = new OMPStream(ARRAY_SIZE, deviceIndex); - -#elif defined(ALPAKA) - // Use the alpaka implementation - stream = new AlpakaStream(ARRAY_SIZE, deviceIndex); - -#endif - - stream->init_arrays(startA, startB, startC); - - // Result of the Dot kernel, if used. 
-    T sum = 0.0;
-
-    std::vector<std::vector<double>> timings;
-
-    switch(selection)
-    {
-    case Benchmark::All:
-        timings = run_all<T>(stream, sum);
-        break;
-    case Benchmark::Triad:
-        timings = run_triad<T>(stream);
-        break;
-    case Benchmark::Nstream:
-        timings = run_nstream<T>(stream);
-        break;
-    };
-
-    // Check solutions
-    // Create host vectors
-    std::vector<T> a(ARRAY_SIZE);
-    std::vector<T> b(ARRAY_SIZE);
-    std::vector<T> c(ARRAY_SIZE);
-
-
-    stream->read_arrays(a, b, c);
-    check_solution<T>(num_times, a, b, c, sum);
-
-    // Display timing results
-    if(output_as_csv)
-    {
-        std::cout << "function" << csv_separator << "num_times" << csv_separator << "n_elements" << csv_separator
-                  << "sizeof" << csv_separator << ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec")
-                  << csv_separator << "min_runtime" << csv_separator << "max_runtime" << csv_separator << "avg_runtime"
-                  << std::endl;
-    }
-    else
-    {
-        std::cout << std::left << std::setw(12) << "Function" << std::left << std::setw(12)
-                  << ((mibibytes) ? "MiBytes/sec" : "MBytes/sec") << std::left << std::setw(12) << "Min (sec)"
-                  << std::left << std::setw(12) << "Max" << std::left << std::setw(12) << "Average" << std::endl
-                  << std::fixed;
-    }
-
-
-    if(selection == Benchmark::All || selection == Benchmark::Nstream)
-    {
-        std::vector<std::string> labels;
-        std::vector<size_t> sizes;
-
-        if(selection == Benchmark::All)
-        {
-            labels = {"Copy", "Mul", "Add", "Triad", "Dot"};
-            sizes
-                = {2 * sizeof(T) * ARRAY_SIZE,
-                   2 * sizeof(T) * ARRAY_SIZE,
-                   3 * sizeof(T) * ARRAY_SIZE,
-                   3 * sizeof(T) * ARRAY_SIZE,
-                   2 * sizeof(T) * ARRAY_SIZE};
-        }
-        else if(selection == Benchmark::Nstream)
-        {
-            labels = {"Nstream"};
-            sizes = {4 * sizeof(T) * ARRAY_SIZE};
-        }
-
-        for(int i = 0; i < timings.size(); ++i)
-        {
-            // Get min/max; ignore the first result
-            auto minmax = std::minmax_element(timings[i].begin() + 1, timings[i].end());
-
-            // Calculate average; ignore the first result
-            double average = std::accumulate(timings[i].begin() + 1, timings[i].end(), 0.0) / (double) (num_times - 1);
-
-            // Display results
-            if(output_as_csv)
-            {
-                std::cout << labels[i] << csv_separator << num_times << csv_separator << ARRAY_SIZE << csv_separator
-                          << sizeof(T) << csv_separator
-                          << ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator
-                          << *minmax.first << csv_separator << *minmax.second << csv_separator << average << std::endl;
-            }
-            else
-            {
-                std::cout << std::left << std::setw(12) << labels[i] << std::left << std::setw(12)
-                          << std::setprecision(3)
-                          << ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << std::left
-                          << std::setw(12) << std::setprecision(5) << *minmax.first << std::left << std::setw(12)
-                          << std::setprecision(5) << *minmax.second << std::left << std::setw(12)
-                          << std::setprecision(5) << average << std::endl;
-            }
-        }
-    }
-    else if(selection == Benchmark::Triad)
-    {
-        // Display timing results
-        double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times;
-        double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]);
-
-        if(output_as_csv)
-        {
-            std::cout << "function" << csv_separator << "num_times" << csv_separator << "n_elements" << csv_separator
-                      << "sizeof" << csv_separator << ((mibibytes) ? "gibytes_per_sec" : "gbytes_per_sec")
-                      << csv_separator << "runtime" << std::endl;
-            std::cout << "Triad" << csv_separator << num_times << csv_separator << ARRAY_SIZE << csv_separator
-                      << sizeof(T) << csv_separator << bandwidth << csv_separator << timings[0][0] << std::endl;
-        }
-        else
-        {
-            std::cout << "--------------------------------" << std::endl
-                      << std::fixed << "Runtime (seconds): " << std::left << std::setprecision(5) << timings[0][0]
-                      << std::endl
-                      << "Bandwidth (" << ((mibibytes) ? "GiB/s" : "GB/s") << "): " << std::left
-                      << std::setprecision(3) << bandwidth << std::endl;
-        }
-    }
-
-    delete stream;
-}
-
-template<typename T>
-void check_solution(unsigned int const ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
-{
-    // Generate correct solution
-    T goldA = startA;
-    T goldB = startB;
-    T goldC = startC;
-    T goldSum = 0.0;
-
-    const T scalar = startScalar;
-
-    for(unsigned int i = 0; i < ntimes; i++)
-    {
-        // Do STREAM!
-        if(selection == Benchmark::All)
-        {
-            goldC = goldA;
-            goldB = scalar * goldC;
-            goldC = goldA + goldB;
-            goldA = goldB + scalar * goldC;
-        }
-        else if(selection == Benchmark::Triad)
-        {
-            goldA = goldB + scalar * goldC;
-        }
-        else if(selection == Benchmark::Nstream)
-        {
-            goldA += goldB + scalar * goldC;
-        }
-    }
-
-    // Do the reduction
-    goldSum = goldA * goldB * ARRAY_SIZE;
-
-    // Calculate the average error
-    double errA
-        = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val) { return sum + fabs(val - goldA); });
-    errA /= a.size();
-    double errB
-        = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val) { return sum + fabs(val - goldB); });
-    errB /= b.size();
-    double errC
-        = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val) { return sum + fabs(val - goldC); });
-    errC /= c.size();
-    double errSum = fabs((sum - goldSum) / goldSum);
-
-    double epsi = std::numeric_limits<T>::epsilon() * 100.0;
-
-    if(errA > epsi)
-        std::cerr << "Validation failed on a[]. Average error " << errA << std::endl;
-    if(errB > epsi)
-        std::cerr << "Validation failed on b[]. Average error " << errB << std::endl;
-    if(errC > epsi)
-        std::cerr << "Validation failed on c[]. Average error " << errC << std::endl;
-    // Check sum to 8 decimal places
-    if(selection == Benchmark::All && errSum > 1.0E-8)
-        std::cerr << "Validation failed on sum. Error " << errSum << std::endl
-                  << std::setprecision(15) << "Sum was " << sum << " but should be " << goldSum << std::endl;
-}
-
-int parseUInt(char const* str, unsigned int* output)
-{
-    char* next;
-    *output = strtoul(str, &next, 10);
-    return !strlen(next);
-}
-
-int parseInt(char const* str, int* output)
-{
-    char* next;
-    *output = strtol(str, &next, 10);
-    return !strlen(next);
-}
-
-void parseArguments(int argc, char* argv[])
-{
-    for(int i = 1; i < argc; i++)
-    {
-        if(!std::string("--list").compare(argv[i]))
-        {
-            listDevices();
-            exit(EXIT_SUCCESS);
-        }
-        else if(!std::string("--device").compare(argv[i]))
-        {
-            if(++i >= argc || !parseUInt(argv[i], &deviceIndex))
-            {
-                std::cerr << "Invalid device index." << std::endl;
-                exit(EXIT_FAILURE);
-            }
-        }
-        else if(!std::string("--arraysize").compare(argv[i]) || !std::string("-s").compare(argv[i]))
-        {
-            if(++i >= argc || !parseInt(argv[i], &ARRAY_SIZE) || ARRAY_SIZE <= 0)
-            {
-                std::cerr << "Invalid array size." << std::endl;
-                exit(EXIT_FAILURE);
-            }
-        }
-        else if(!std::string("--numtimes").compare(argv[i]) || !std::string("-n").compare(argv[i]))
-        {
-            if(++i >= argc || !parseUInt(argv[i], &num_times))
-            {
-                std::cerr << "Invalid number of times." << std::endl;
-                exit(EXIT_FAILURE);
-            }
-            if(num_times < 2)
-            {
-                std::cerr << "Number of times must be 2 or more" << std::endl;
-                exit(EXIT_FAILURE);
-            }
-        }
-        else if(!std::string("--float").compare(argv[i]))
-        {
-            use_float = true;
-        }
-        else if(!std::string("--triad-only").compare(argv[i]))
-        {
-            selection = Benchmark::Triad;
-        }
-        else if(!std::string("--nstream-only").compare(argv[i]))
-        {
-            selection = Benchmark::Nstream;
-        }
-        else if(!std::string("--csv").compare(argv[i]))
-        {
-            output_as_csv = true;
-        }
-        else if(!std::string("--mibibytes").compare(argv[i]))
-        {
-            mibibytes = true;
-        }
-        else if(!std::string("--help").compare(argv[i]) || !std::string("-h").compare(argv[i]))
-        {
-            std::cout << std::endl;
-            std::cout << "Usage: " << argv[0] << " [OPTIONS]" << std::endl << std::endl;
-            std::cout << "Options:" << std::endl;
-            std::cout << "  -h  --help               Print the message" << std::endl;
-            std::cout << "      --list               List available devices" << std::endl;
-            std::cout << "      --device     INDEX   Select device at INDEX" << std::endl;
-            std::cout << "  -s  --arraysize  SIZE    Use SIZE elements in the array" << std::endl;
-            std::cout << "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)" << std::endl;
-            std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
-            std::cout << "      --triad-only         Only run triad" << std::endl;
-            std::cout << "      --nstream-only       Only run nstream" << std::endl;
-            std::cout << "      --csv                Output as csv table" << std::endl;
-            std::cout << "      --mibibytes          Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
-                      << std::endl;
-            std::cout << std::endl;
-            exit(EXIT_SUCCESS);
-        }
-        else
-        {
-            std::cerr << "Unrecognized argument '" << argv[i] << "' (try '--help')" << std::endl;
-            exit(EXIT_FAILURE);
-        }
-    }
-}
-
-// NOLINTEND
diff --git a/thirdParty/CMakeLists.txt b/thirdParty/CMakeLists.txt
index 826179d78ee4..97ed4d5f9911 100644
--- a/thirdParty/CMakeLists.txt
+++ b/thirdParty/CMakeLists.txt
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-if(BUILD_TESTING)
+if(BUILD_TESTING OR alpaka_BUILD_BENCHMARKS)
     if(alpaka_USE_INTERNAL_CATCH2)
         message(STATUS "Catch2: Using INTERNAL version 3.5.2")
         # Force Catch2's CMake to pick up the variables we set below