diff --git a/CMakeLists.txt b/CMakeLists.txt index 00fe0d134a2f..fcaae9721f26 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,8 +47,9 @@ endif() option(alpaka_INSTALL_TEST_HEADER "Install headers of the namespace alpaka::test. Attention, headers are not designed for production code, see documentation." OFF) include(CMakeDependentOption) + cmake_dependent_option(alpaka_CHECK_HEADERS "Check all alpaka headers as part of the tests whether they can be compiled standalone." OFF BUILD_TESTING OFF) -cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON BUILD_TESTING OFF) +cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON "BUILD_TESTING OR alpaka_BUILD_BENCHMARKS" OFF) ################################################################################ # Internal variables. diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 3ae15f537d4f..6a8da0ef7a89 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -15,4 +15,9 @@ project("alpakaBenchmarks" LANGUAGES CXX) # Add subdirectories. ################################################################################ +if(NOT BUILD_TESTING) + # Testing is not enabled therefore CATCH2 which is part of common must be pulled. + add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../test/common "${CMAKE_BINARY_DIR}/test/common") +endif() + add_subdirectory("babelstream/") diff --git a/benchmarks/babelstream/CMakeLists.txt b/benchmarks/babelstream/CMakeLists.txt index e2382a1139d8..5deb1a3e2a88 100644 --- a/benchmarks/babelstream/CMakeLists.txt +++ b/benchmarks/babelstream/CMakeLists.txt @@ -19,9 +19,31 @@ if(NOT TARGET alpaka::alpaka) endif() endif() -alpaka_add_executable(${PROJECT_NAME} src/main.cpp src/Stream.h src/AlpakaStream.cpp src/AlpakaStream.h) -target_compile_definitions(${PROJECT_NAME} PUBLIC ALPAKA) -target_link_libraries(${PROJECT_NAME} PUBLIC alpaka::alpaka) -set_target_properties(${PROJECT_NAME} PROPERTIES FOLDER benchmarks) -# add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME}) +set(_TARGET_NAME "babelstream") +append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE) + +alpaka_add_executable( + ${_TARGET_NAME} + ${_FILES_SOURCE}) + +target_include_directories( + ${_TARGET_NAME} + PRIVATE "src") + +target_link_libraries( + ${_TARGET_NAME} + PRIVATE common) + +set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER benchmarks/babelstream) + +#Run as a ctest +if(alpaka_CI) + # Only run for release builds since this is a benchmark + if(CMAKE_BUILD_TYPE STREQUAL "Release") + add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME}) + endif() +else() + # For a normal benchmark test, number of samples should be equal to the default value. 
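+    # Hypothetical example (not part of this change set): a longer benchmarking run could be
+    # registered as an additional test by forwarding the custom flags documented in src/README.md, e.g.
+    #   add_test(NAME ${_TARGET_NAME}_full COMMAND ${_TARGET_NAME} --array-size=33554432 --number-runs=100)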
+ add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME}) +endif() diff --git a/benchmarks/babelstream/src/AlpakaStream.cpp b/benchmarks/babelstream/src/AlpakaStream.cpp deleted file mode 100644 index 9c8f3043ea68..000000000000 --- a/benchmarks/babelstream/src/AlpakaStream.cpp +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code -// -// Cupla version created by Jeff Young in 2021 -// Ported from cupla to alpaka by Bernhard Manfred Gruber in 2022 - -#include "AlpakaStream.h" - -#include - -namespace -{ - constexpr auto blockSize = 1024; - constexpr auto dotBlockSize = 256; -} // namespace - -template -AlpakaStream::AlpakaStream(Idx arraySize, Idx deviceIndex) - : arraySize(arraySize) - , devHost(alpaka::getDevByIdx(platformHost, 0)) - , devAcc(alpaka::getDevByIdx(platformAcc, deviceIndex)) - , sums(alpaka::allocBuf(devHost, dotBlockSize)) - , d_a(alpaka::allocBuf(devAcc, arraySize)) - , d_b(alpaka::allocBuf(devAcc, arraySize)) - , d_c(alpaka::allocBuf(devAcc, arraySize)) - , d_sum(alpaka::allocBuf(devAcc, dotBlockSize)) - , queue(devAcc) -{ - if(arraySize % blockSize != 0) - throw std::runtime_error("Array size must be a multiple of " + std::to_string(blockSize)); - std::cout << "Using alpaka device " << alpaka::getName(devAcc) << std::endl; -} - -struct InitKernel -{ - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA, T initB, T initC) const - { - auto const [i] = alpaka::getIdx(acc); - a[i] = initA; - b[i] = initB; - c[i] = initC; - } -}; - -template -void AlpakaStream::init_arrays(T initA, T initB, T initC) -{ - auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; - - alpaka::exec< - Acc>(queue, workdiv, InitKernel{}, std::data(d_a), std::data(d_b), std::data(d_c), initA, initB, initC); - alpaka::wait(queue); -} - -template -void AlpakaStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) -{ - alpaka::memcpy(queue, alpaka::createView(devHost, a), d_a); - alpaka::memcpy(queue, alpaka::createView(devHost, b), d_b); - alpaka::memcpy(queue, alpaka::createView(devHost, c), d_c); -} - -struct CopyKernel -{ - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* c) const - { - auto const [i] = alpaka::getIdx(acc); - c[i] = a[i]; - } -}; - -template -void AlpakaStream::copy() -{ - auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; - - alpaka::exec(queue, workdiv, CopyKernel{}, std::data(d_a), std::data(d_c)); - alpaka::wait(queue); -} - -struct MulKernel -{ - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T* b, T const* c) const - { - const T scalar = startScalar; - auto const [i] = alpaka::getIdx(acc); - b[i] = scalar * c[i]; - } -}; - -template -void AlpakaStream::mul() -{ - auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; - - alpaka::exec(queue, workdiv, MulKernel{}, std::data(d_b), std::data(d_c)); - alpaka::wait(queue); -} - -struct AddKernel -{ - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const - { - auto const [i] = alpaka::getIdx(acc); - c[i] = a[i] + b[i]; - } -}; - -template -void AlpakaStream::add() -{ - auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; - - alpaka::exec(queue, workdiv, AddKernel{}, std::data(d_a), std::data(d_b), std::data(d_c)); - alpaka::wait(queue); -} - -struct TriadKernel -{ - template - ALPAKA_FN_ACC void 
operator()(TAcc const& acc, T* a, T const* b, T const* c) const - { - const T scalar = startScalar; - auto const [i] = alpaka::getIdx(acc); - a[i] = b[i] + scalar * c[i]; - } -}; - -template -void AlpakaStream::triad() -{ - auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; - - alpaka::exec(queue, workdiv, TriadKernel{}, std::data(d_a), std::data(d_b), std::data(d_c)); - alpaka::wait(queue); -} - -struct NstreamKernel -{ - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const - { - const T scalar = startScalar; - auto const [i] = alpaka::getIdx(acc); - a[i] += b[i] + scalar * c[i]; - } -}; - -template -void AlpakaStream::nstream() -{ - auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; - - alpaka::exec(queue, workdiv, NstreamKernel{}, std::data(d_a), std::data(d_b), std::data(d_c)); - alpaka::wait(queue); -} - -struct DotKernel -{ - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, int arraySize) const - { - // TODO(Jeff Young) - test if sharedMem bug is affecting performance here - auto& tbSum = alpaka::declareSharedVar(acc); - - auto [i] = alpaka::getIdx(acc); - auto const [local_i] = alpaka::getIdx(acc); - auto const [totalThreads] = alpaka::getWorkDiv(acc); - - T threadSum = 0; - for(; i < arraySize; i += totalThreads) // NOLINT(bugprone-infinite-loop) - threadSum += a[i] * b[i]; - tbSum[local_i] = threadSum; - - auto const [blockDim] = alpaka::getWorkDiv(acc); - for(int offset = blockDim / 2; offset > 0; offset /= 2) - { - alpaka::syncBlockThreads(acc); - if(local_i < offset) - tbSum[local_i] += tbSum[local_i + offset]; - } - - auto const [blockIdx] = alpaka::getIdx(acc); - if(local_i == 0) - sum[blockIdx] = tbSum[local_i]; - } -}; - -template -auto AlpakaStream::dot() -> T -{ - auto const workdiv = WorkDiv{dotBlockSize, blockSize, 1}; - alpaka::exec(queue, workdiv, DotKernel{}, std::data(d_a), std::data(d_b), std::data(d_sum), arraySize); - alpaka::wait(queue); - - alpaka::memcpy(queue, sums, d_sum); - T const* sumPtr = std::data(sums); - // TODO(bgruber): replace by std::reduce, when gcc 9.3 is the baseline - return std::accumulate(sumPtr, sumPtr + dotBlockSize, T{0}); -} - -void listDevices() -{ - auto const platform = alpaka::Platform{}; - auto const count = alpaka::getDevCount(platform); - std::cout << "Devices:" << std::endl; - for(int i = 0; i < count; i++) - std::cout << i << ": " << getDeviceName(i) << std::endl; -} - -auto getDeviceName(int deviceIndex) -> std::string -{ - auto const platform = alpaka::Platform{}; - return alpaka::getName(alpaka::getDevByIdx(platform, deviceIndex)); -} - -auto getDeviceDriver([[maybe_unused]] int device) -> std::string -{ - return "Not supported"; -} - -template class AlpakaStream; -template class AlpakaStream; diff --git a/benchmarks/babelstream/src/AlpakaStream.h b/benchmarks/babelstream/src/AlpakaStream.h deleted file mode 100644 index ba556b028dba..000000000000 --- a/benchmarks/babelstream/src/AlpakaStream.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code -// -// Cupla version created by Jeff Young in 2021 -// Ported from cupla to alpaka by Bernhard Manfred Gruber in 2022 - -#pragma once - -#include "Stream.h" - -#include -#include - -#include - -inline constexpr auto IMPLEMENTATION_STRING = "alpaka"; - -using Dim = alpaka::DimInt<1>; -using Idx = int; -using 
Vec = alpaka::Vec; -using Acc = alpaka::ExampleDefaultAcc; - -template -struct AlpakaStream : Stream -{ - AlpakaStream(Idx arraySize, Idx deviceIndex); - - void copy() override; - void add() override; - void mul() override; - void triad() override; - void nstream() override; - auto dot() -> T override; - - void init_arrays(T initA, T initB, T initC) override; - void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - - using PlatformHost = alpaka::PlatformCpu; - using DevHost = alpaka::Dev; - using PlatformAcc = alpaka::Platform; - using DevAcc = alpaka::Dev; - using BufHost = alpaka::Buf; - using BufAcc = alpaka::Buf; - using Queue = alpaka::Queue; - - using WorkDiv = alpaka::WorkDivMembers; - -private: - Idx arraySize; - PlatformHost platformHost; - DevHost devHost; - PlatformAcc platformAcc; - DevAcc devAcc; - BufHost sums; - BufAcc d_a; - BufAcc d_b; - BufAcc d_c; - BufAcc d_sum; - Queue queue; -}; diff --git a/benchmarks/babelstream/src/README.md b/benchmarks/babelstream/src/README.md index 781cdf31039f..cd3eee701166 100644 --- a/benchmarks/babelstream/src/README.md +++ b/benchmarks/babelstream/src/README.md @@ -1,6 +1,101 @@ -This is a port of [BabelStream](https://github.com/UoB-HPC/BabelStream) to alpaka. -This work is based on the [cupla port of BabelStream](https://github.com/jyoung3131/BabelStream) from Jeff Young. -The benchmark driver (`main.cpp` and `Stream.h`) is taken from BabelStream. -No other backends are available, only alpaka. -Thus, there is no need to select a backend, just run the executable. -Please refer to the BabelStream documentation of more information on how to run the benchmark. +This work was initially based on the [cupla port of BabelStream](https://github.com/jyoung3131/BabelStream) from Jeff Young. Then refactored. +The benchmark BabelStream is developed by Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC; based on John D. 
McCalpin's original STREAM benchmark for CPUs.
+Other BabelStream implementations and their documentation are available at https://github.com/UoB-HPC
+
+# Example Run
+The benchmark can be run with custom arguments as well as Catch2 arguments.
+
+With custom arguments:
+    ./babelstream --array-size=1280000 --number-runs=10
+With Catch2 arguments:
+    ./babelstream --success
+With custom and Catch2 arguments together:
+    ./babelstream --success --array-size=1280000 --number-runs=10
+
+# Command for a benchmarking run
+    ./babelstream --array-size=33554432 --number-runs=100
+Output is below:
+
+'''Array size provided: 33554432
+Number of runs provided: 100
+Randomness seeded to: 2775986196
+
+
+AcceleratorType:AccCpuSerial<1,unsigned int>
+NumberOfRuns:100
+Precision:single
+DataSize(items):33554432
+DeviceName:13th Gen Intel(R) Core(TM) i7-1360P
+WorkDivInit :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivAdd :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+Kernels Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB)
+ InitKernel 12.2133 0.0219789 0.0244341 0.0234795 268.435
+ CopyKernel 20.8898 0.01285 0.0141298 0.0130288 268.435
+ MultKernel 20.9943 0.0127861 0.0161767 0.0129707 268.435
+ AddKernel 24.4181 0.01649 0.0178725 0.0166714 402.653
+ TriadKernel 24.44 0.0164751 0.0182611 0.0166579 402.653
+
+
+
+AcceleratorType:AccGpuCudaRt<1,unsigned int>
+NumberOfRuns:100
+Precision:single
+DataSize(items):33554432
+DeviceName:NVIDIA RTX A500 Laptop GPU
+WorkDivInit :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivAdd :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivDot :{gridBlockExtent: (256), blockThreadExtent: (1024), threadElemExtent: (1)}
+Kernels Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB)
+ InitKernel 62.3725 0.00430374 0.00434411 0.00433501 268.435
+ CopyKernel 90.2948 0.00297288 0.00302862 0.00300712 268.435
+ MultKernel 90.3858 0.00296988 0.00302989 0.00300866 268.435
+ AddKernel 90.947 0.00442734 0.00448436 0.00446751 402.653
+ TriadKernel 90.88 0.0044306 0.00447952 0.00446739 402.653
+ DotKernel 93.369 0.002875 0.00291691 0.0029106 268.435
+
+
+
+AcceleratorType:AccCpuSerial<1,unsigned int>
+NumberOfRuns:100
+Precision:double
+DataSize(items):33554432
+DeviceName:13th Gen Intel(R) Core(TM) i7-1360P
+WorkDivInit :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivAdd :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivDot :{gridBlockExtent: (256), blockThreadExtent: (1024), threadElemExtent: (1)}
+Kernels Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB)
+
InitKernel 12.2326 0.0438886 0.0543366 0.0463925 536.871 + CopyKernel 20.8888 0.0257014 0.0272265 0.0260267 536.871 + MultKernel 21.0395 0.0255173 0.0292734 0.0262349 536.871 + AddKernel 24.6628 0.0326527 0.0383083 0.0334047 805.306 + TriadKernel 24.5604 0.0327888 0.0494151 0.0335766 805.306 + + + +AcceleratorType:AccGpuCudaRt<1,unsigned int> +NumberOfRuns:100 +Precision:double +DataSize(items):33554432 +DeviceName:NVIDIA RTX A500 Laptop GPU +WorkDivInit :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)} +WorkDivCopy :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)} +WorkDivMult :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)} +WorkDivAdd :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)} +WorkDivTriad:{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)} +WorkDivDot :{gridBlockExtent: (256), blockThreadExtent: (1024), threadElemExtent: (1)} +Kernels Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB) + InitKernel 62.4307 0.00859947 0.00864104 0.00862767 536.871 + CopyKernel 89.4157 0.00600421 0.00607738 0.00604754 536.871 + MultKernel 89.2831 0.00601313 0.00606791 0.0060488 536.871 + AddKernel 90.5499 0.00889351 0.00895834 0.00893668 805.306 + TriadKernel 90.5685 0.00889168 0.00897055 0.00893744 805.306 + DotKernel 93.2451 0.00575763 0.00581312 0.00579143 536.871 +''' diff --git a/benchmarks/babelstream/src/Stream.h b/benchmarks/babelstream/src/Stream.h deleted file mode 100644 index d4548428f0bb..000000000000 --- a/benchmarks/babelstream/src/Stream.h +++ /dev/null @@ -1,48 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -// NOLINTBEGIN - -#pragma once - -#include -#include - -// Array values -#define startA (0.1) -#define startB (0.2) -#define startC (0.0) -#define startScalar (0.4) - -template -class Stream -{ -public: - virtual ~Stream() - { - } - - // Kernels - // These must be blocking calls - virtual void copy() = 0; - virtual void mul() = 0; - virtual void add() = 0; - virtual void triad() = 0; - virtual void nstream() = 0; - virtual T dot() = 0; - - // Copy memory between host and device - virtual void init_arrays(T initA, T initB, T initC) = 0; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; -}; - -// Implementation specific device functions -void listDevices(void); -std::string getDeviceName(int const); -std::string getDeviceDriver(int const); - -// NOLINTEND diff --git a/benchmarks/babelstream/src/babelStreamCommon.hpp b/benchmarks/babelstream/src/babelStreamCommon.hpp new file mode 100644 index 000000000000..a22f7d032d31 --- /dev/null +++ b/benchmarks/babelstream/src/babelStreamCommon.hpp @@ -0,0 +1,440 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace +{ + // Default array size, can be changed from command line arguments. + // To display cmd line args use ./babelstream --help or -? + // According to tests, 2^25 or larger values are needed for proper benchmarking: + // ./babelstream --array-size=33554432 --number-runs=100 + // To prevent timeouts in CI, a smaller default value is used. + [[maybe_unused]] auto arraySizeMain = 1024 * 1024; + + // Minimum array size to be used. 
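+    // Values passed via --array-size that are not larger than this bound are rejected by
+    // handleCustomArguments() below and the default array size is kept instead.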
+ [[maybe_unused]] constexpr auto minArrSize = 1024 * 128; + + // Scalar value for Mul and Triad kernel parameters. + [[maybe_unused]] constexpr auto scalarVal = 2.0f; + + // Block thread extent for DotKernel test work division parameters. + [[maybe_unused]] constexpr auto blockThreadExtentMain = 1024; + + // Number of runs for each kernel, can be changed by command line arguments. + // At least 100 runs are recommended for good benchmarking. + // To prevent timeouts in CI, a small value is used. + [[maybe_unused]] auto numberOfRuns = 2; + + // Data input value for babelstream. + [[maybe_unused]] constexpr auto valA = 1.0f; + + //! handleCustomArguments Gets custom cmd line arguments from the all arguments. + //! Namely gets --array-size=1234 and --number-runs=1234 and keeps the others which are + //! command line args for Catch2 session. + [[maybe_unused]] static void handleCustomArguments(int& argc, char* argv[]) + { + std::vector newArgv; + newArgv.push_back(argv[0]); // Keep the program name + + for(int i = 1; i < argc; ++i) + { + std::string arg = argv[i]; + if(arg.rfind("--array-size=", 0) == 0) + { + auto const arrSize = std::stoi(arg.substr(13)); // Convert to integer + if(arrSize > minArrSize) + { + arraySizeMain = arrSize; + std::cout << "Array size provided(items): " << arraySizeMain << std::endl; + } + else + { + std::cout << "Too small array size given. Must be at least " << minArrSize << std::endl; + std::cout << "Using default array size(number of items): " << arraySizeMain << std::endl; + } + } + else if(arg.rfind("--number-runs=", 0) == 0) + { + auto const numRuns = std::stoi(arg.substr(14)); // Convert to integer + if(numRuns > 0) + { + numberOfRuns = numRuns; + std::cout << "Number of runs provided: " << numberOfRuns << std::endl; + } + else + { + std::cout << "Using default number of runs: " << numberOfRuns << std::endl; + } + } + else + { + // If it's not a custom argument, keep it for Catch2 + newArgv.push_back(argv[i]); + } + if(arg.rfind("-?", 0) == 0 || arg.rfind("--help", 0) == 0 || arg.rfind("-h", 0) == 0) + { + std::cout << "Usage of custom arguments (arguments which are not Catch2): --array-size=33554432 and " + "--number-runs=100" + << std::endl; + } + } + + // Update argc and argv to exclude custom arguments + argc = static_cast(newArgv.size()); + for(int i = 0; i < argc; ++i) + { + argv[i] = newArgv[static_cast(i)]; + } + } + + //! FuzzyEqual compares two floating-point or integral type values. + //! \tparam T Type of the values to compare. + //! \param a First value to compare. + //! \param b Second value to compare. + //! \return Returns true if the values are approximately equal (for floating-point types) or exactly equal (for + //! integral types). + template + [[maybe_unused]] bool FuzzyEqual(T a, T b) + { + if constexpr(std::is_floating_point_v) + { + return std::fabs(a - b) < std::numeric_limits::epsilon() * static_cast(100.0); + } + else if constexpr(std::is_integral_v) + { + return a == b; + } + else + { + static_assert( + std::is_floating_point_v || std::is_integral_v, + "FuzzyEqual is only supported for integral or floating-point types."); + } + } + + //! Gets the current timestamp and returns it as a string. + //! \return A string representation of the current timestamp in the format "YYYY-MM-DD HH:MM:SS". 
+ [[maybe_unused]] static std::string getCurrentTimestamp() + { + auto now = std::chrono::system_clock::now(); + auto now_c = std::chrono::system_clock::to_time_t(now); + std::stringstream ss; + ss << std::put_time(std::localtime(&now_c), "%Y-%m-%d %X"); + return ss.str(); + } + + //! joinElements Joins the elements of a vector into a string, separated by a specified delimiter. + //! \tparam T Type of the elements in the vector. + //! \param vec The vector of elements to join. + //! \param delim The delimiter to separate the elements in the resulting string. + //! \return A string with the vector elements separated by the specified delimiter. + template + [[maybe_unused]] static std::string joinElements(std::vector const& vec, std::string const& delim) + { + return std::accumulate( + vec.begin(), + vec.end(), + std::string(), + [&delim](std::string const& a, T const& b) + { + std::ostringstream oss; + if(!a.empty()) + oss << a << delim; + oss << std::setprecision(5) << b; + return oss.str(); + }); + } + + //! findMinMax Finds the minimum and maximum elements in a container. + //! \tparam Container The type of the container. + //! \param times The container from which to find the minimum and maximum elements. + //! \return A pair containing the minimum and maximum values in the container. + //! \note The first element is omitted if the container size is larger than 1, as the result is used in time + //! measurement for benchmarking. + template + [[maybe_unused]] static auto findMinMax(Container const& times) + -> std::pair + { + if(times.empty()) + return std::make_pair(typename Container::value_type{}, typename Container::value_type{}); + + // Default to min and max being the same element for single element containers + auto minValue = *std::min_element(times.begin(), times.end()); + auto maxValue = minValue; + + if(times.size() > 1) + { + // Calculate min and max ignoring the first element + minValue = *std::min_element(times.begin() + 1, times.end()); + maxValue = *std::max_element(times.begin() + 1, times.end()); + } + + return std::make_pair(minValue, maxValue); + } + + //! findAverage Calculates the average value of elements in a container, does not take into account the first one. + //! \tparam Container The type of the container. + //! \param elements The container from which to calculate the average. + //! \return The average value of the elements in the container without considering the first element. + template + [[maybe_unused]] static auto findAverage(Container const& elements) -> typename Container::value_type + { + if(elements.empty()) + return typename Container::value_type{}; + + if(elements.size() == 1) + return elements.front(); // Only one element, return it as the average + + // Calculate the sum of the elements, start from the second one + auto sum = std::accumulate(elements.begin() + 1, elements.end(), typename Container::value_type{}); + + // Calculate and return the average, take into account that one element is not used + return sum / static_cast(elements.size() - 1); + } + + //! Enum class representing benchmark information data types. + enum class BMInfoDataType + { + AcceleratorType, + TimeStamp, + NumRuns, + DataSize, + DataType, + WorkDivInit, + WorkDivCopy, + WorkDivAdd, + WorkDivTriad, + WorkDivMult, + WorkDivDot, + DeviceName, + TimeUnit, + KernelNames, + KernelBandwidths, + KernelDataUsageValues, + KernelMinTimes, + KernelMaxTimes, + KernelAvgTimes + }; + + //! typeToTypeStr Converts BMInfoDataType enum values to their corresponding string representations. 
+ //! \param item The BMInfoDataType enum type value to convert to a more explicit string with units. + //! \return A string representation of the given BMInfoDataType enum value. +#if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wswitch-default" +# pragma clang diagnostic ignored "-Wcovered-switch-default" +#endif + static std::string typeToTypeStr(BMInfoDataType item) + { + switch(item) + { + case BMInfoDataType::AcceleratorType: + return "AcceleratorType"; + case BMInfoDataType::TimeStamp: + return "TimeStamp"; + case BMInfoDataType::NumRuns: + return "NumberOfRuns"; + case BMInfoDataType::DataSize: + return "DataSize(items)"; + case BMInfoDataType::DataType: + return "Precision"; + case BMInfoDataType::DeviceName: + return "DeviceName"; + case BMInfoDataType::TimeUnit: + return "TimeUnitForXMLReport"; + case BMInfoDataType::KernelNames: + return "Kernels"; + case BMInfoDataType::KernelDataUsageValues: + return "DataUsage(MB)"; + case BMInfoDataType::KernelBandwidths: + return "Bandwidths(GB/s)"; + case BMInfoDataType::KernelMinTimes: + return "MinTime(s)"; + case BMInfoDataType::KernelMaxTimes: + return "MaxTime(s)"; + case BMInfoDataType::KernelAvgTimes: + return "AvgTime(s)"; + case BMInfoDataType::WorkDivInit: + return "WorkDivInit "; + case BMInfoDataType::WorkDivCopy: + return "WorkDivCopy "; + case BMInfoDataType::WorkDivAdd: + return "WorkDivAdd "; + case BMInfoDataType::WorkDivTriad: + return "WorkDivTriad"; + case BMInfoDataType::WorkDivMult: + return "WorkDivMult "; + case BMInfoDataType::WorkDivDot: + return "WorkDivDot "; + default: + return ""; + } + } +#if defined(__clang__) +# pragma clang diagnostic pop +#endif + //! getDataThroughput Calculates the data throughput for processing the entire array. + //! \tparam DataType The type of the data. + //! \tparam T The type of the parameters. + //! \param readsWrites The number of read/write operations. + //! \param arraySize The size of the array. + //! \return The calculated data throughput in MB. + template + [[maybe_unused]] static double getDataThroughput(T readsWrites, T arraySize) + { + auto throughput = readsWrites * sizeof(DataType) * arraySize; + // convert to MB (not MiB) + return static_cast(throughput) * 1.0E-6; + } + + //! calculateBandwidth Calculates the bandwidth in GB/sec. + //! \tparam T The type of bytesReadWriteMB. + //! \tparam U The type of runTimeSeconds (e.g., double). + //! \param bytesReadWriteMB The amount of data read/write in MB. + //! \param runTimeSeconds The runtime in seconds. + //! \return The calculated bandwidth in GB/sec. + template + [[maybe_unused]] static double calculateBandwidth(T bytesReadWriteMB, U runTimeSeconds) + { + // Divide by 1.0E+3 to convert from MB to GB (not GiB) + auto bytesReadWriteGB = static_cast(bytesReadWriteMB) * (1.0E-3); + return bytesReadWriteGB / static_cast(runTimeSeconds); + } + + //! MetaData class to store and serialize benchmark information. + //! \details The MetaData class includes a single map to keep all benchmark information and provides serialization + //! methods for generating output. + class MetaData + { + public: + //! setItem Sets an item in the metadata map. + //! \tparam T The type of the value to store. + //! \param key The BMInfoDataType key. + //! \param value The value to store associated with the key. + template + [[maybe_unused]] void setItem(BMInfoDataType key, T const& value) + { + std::ostringstream oss; + oss << value; + metaDataMap[key] = oss.str(); + } + + //! 
serialize Serializes the entire metadata to a string. + //! \return A string containing the serialized metadata. + //! \details This is standard serialization and produces output that can be post-processed easily. + [[maybe_unused]] std::string serialize() const + { + std::stringstream ss; + for(auto const& pair : metaDataMap) + { + ss << "\n" << typeToTypeStr(pair.first) << ":" << pair.second; + } + return ss.str(); + } + + //! serializeAsTable Serializes the metadata into a more structured format for easy visual inspection. + //! \return A string containing the serialized metadata as a table. + //! \details The method first serializes general information, then creates a summary as a table where each row + //! represents a kernel. + [[maybe_unused]] std::string serializeAsTable() const + { + std::stringstream ss; + // define lambda to add values to a string stream created already + auto addItemValue = [&, this](BMInfoDataType item) { + ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item); + }; + + // Initially chose some data to serialize + ss << "\n"; + addItemValue(BMInfoDataType::AcceleratorType); + addItemValue(BMInfoDataType::NumRuns); + addItemValue(BMInfoDataType::DataType); + addItemValue(BMInfoDataType::DataSize); + addItemValue(BMInfoDataType::DeviceName); + addItemValue(BMInfoDataType::WorkDivInit); + addItemValue(BMInfoDataType::WorkDivCopy); + addItemValue(BMInfoDataType::WorkDivMult); + addItemValue(BMInfoDataType::WorkDivAdd); + addItemValue(BMInfoDataType::WorkDivTriad); + if(metaDataMap.count(BMInfoDataType::WorkDivDot) != 0) + addItemValue(BMInfoDataType::WorkDivDot); + + auto getItemFromStrList = [this](BMInfoDataType item, int index) -> std::string + { + std::string const str = metaDataMap.at(item); + + if(index < 1) + { + throw std::invalid_argument("Index must be 1 or greater."); + } + + std::istringstream iss(str); + std::string token; + int current_index = 1; // Start at 1 for 1-based indexing + + // Using ", " as the delimiter, we handle the token extraction manually + while(std::getline(iss, token, ',')) + { + // Remove any leading spaces that may be left by `getline` + size_t start = token.find_first_not_of(' '); + if(start != std::string::npos) + { + token = token.substr(start); + } + + if(current_index == index) + { + return token; + } + ++current_index; + } + + throw std::out_of_range("Index out of range"); + }; + + // Prepare Table + // Table column names + ss << std::endl; + ss << std::left << std::setw(15) << typeToTypeStr(BMInfoDataType::KernelNames) << " " << std::left + << std::setw(15) << typeToTypeStr(BMInfoDataType::KernelBandwidths) << " " << std::left << std::setw(10) + << typeToTypeStr(BMInfoDataType::KernelMinTimes) << " " << std::left << std::setw(10) + << typeToTypeStr(BMInfoDataType::KernelMaxTimes) << " " << std::left << std::setw(10) + << typeToTypeStr(BMInfoDataType::KernelAvgTimes) << " " << std::left << std::setw(6) + << typeToTypeStr(BMInfoDataType::KernelDataUsageValues) << " "; + ss << std::endl; + auto const kernelNamesStr = metaDataMap.at(BMInfoDataType::KernelNames); + auto numberOfKernels = std::count(kernelNamesStr.begin(), kernelNamesStr.end(), ',') + 1; + + // Table rows. 
Print test results for each kernel line by line + for(auto i = 1; i <= numberOfKernels; i++) + { + // Print the row for the kernel i + ss << " " << std::left << std::setw(15) << getItemFromStrList(BMInfoDataType::KernelNames, i) << " "; + ss << std::left << std::setw(15) << getItemFromStrList(BMInfoDataType::KernelBandwidths, i) << " "; + ss << std::left << std::setw(8) << getItemFromStrList(BMInfoDataType::KernelMinTimes, i) << " "; + ss << std::left << std::setw(8) << getItemFromStrList(BMInfoDataType::KernelMaxTimes, i) << " "; + ss << std::left << std::setw(8) << getItemFromStrList(BMInfoDataType::KernelAvgTimes, i) << " "; + ss << std::left << std::setw(6) << getItemFromStrList(BMInfoDataType::KernelDataUsageValues, i) << " " + << std::endl; + } + + return ss.str(); + } + + private: + std::map metaDataMap; + }; +} // namespace diff --git a/benchmarks/babelstream/src/babelStreamMainTest.cpp b/benchmarks/babelstream/src/babelStreamMainTest.cpp new file mode 100644 index 000000000000..79ec62165084 --- /dev/null +++ b/benchmarks/babelstream/src/babelStreamMainTest.cpp @@ -0,0 +1,478 @@ + +#include "babelStreamCommon.hpp" +#include "catch2/catch_session.hpp" + +#include +#include + +#include +#include +#include + +#include + +/** + * Babelstream benchmarking example. Babelstream has 5 kernels. Add, Multiply, Copy, Triad and Dot. + * Babelstream is a memory-bound benchmark since the main operation in the kernels has high Code Balance (bytes/FLOP) + * value. For example c[i] = a[i] + b[i]; has 2 reads 1 writes and has one FLOP operation. For double precision each + * read-write is 8 bytes. Hence Code Balance (3*8 / 1) = 24 bytes/FLOP. + * + * Some implementations and the documents are accessible through https://github.com/UoB-HPC + * + * Can be run with custom arguments as well as catch2 arguments + * Run with Custom arguments: + * ./babelstream --array-size=33554432 --number-runs=100 + * Runt with default array size and num runs: + * ./babelstream + * Run with Catch2 arguments and defaul arrary size and num runs: + * ./babelstream --success + * ./babelstream -r a.xml + * Run with Custom and catch2 arguments together: + * ./babelstream --success --array-size=1280000 --number-runs=10 + * Help to list custom and catch2 arguments + * ./babelstream -? + * ./babelstream --help + * According to tests, 2^25 or larger data size values are needed for proper benchmarking: + * ./babelstream --array-size=33554432 --number-runs=100 + */ + +// Main function that integrates Catch2 and custom argument handling +int main(int argc, char* argv[]) +{ + // Handle custom arguments + handleCustomArguments(argc, argv); + + // Initialize Catch2 and pass the command-line arguments to it + int result = Catch::Session().run(argc, argv); + + // Return the result of the tests + return result; +} + +//! Initialization kernel +struct InitKernel +{ + //! The kernel entry point + //! \tparam TAcc The accelerator environment to be executed on. + //! \tparam T The data type + //! \param acc The accelerator to be executed on. + //! \param a Pointer for vector a + //! \param initA the value to set all items in the vector + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA) const + { + auto const [i] = alpaka::getIdx(acc); + a[i] = initA; + b[i] = static_cast(0.0); + c[i] = static_cast(0.0); + } +}; + +//! Vector copying kernel +struct CopyKernel +{ + //! The kernel entry point + //! \tparam TAcc The accelerator environment to be executed on. + //! \tparam T The data type + //! 
\param acc The accelerator to be executed on. + //! \param a Pointer for vector a + //! \param b Pointer for vector b + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* b) const + { + auto const [index] = alpaka::getIdx(acc); + b[index] = a[index]; + } +}; + +//! Kernel multiplies the vector with a scalar, scaling or multiplication kernel +struct MultKernel +{ + //! The kernel entry point + //! \tparam TAcc The accelerator environment to be executed on. + //! \tparam T The data type + //! \param acc The accelerator to be executed on. + //! \param a Pointer for vector a + //! \param b Pointer for result vector b + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const a, T* b) const + { + const T scalar = static_cast(scalarVal); + auto const [i] = alpaka::getIdx(acc); + b[i] = scalar * a[i]; + } +}; + +//! Vector summation kernel +struct AddKernel +{ + //! The kernel entry point + //! \tparam TAcc The accelerator environment to be executed on. + //! \tparam T The data type + //! \param acc The accelerator to be executed on. + //! \param a Pointer for vector a + //! \param b Pointer for vector b + //! \param c Pointer for result vector c + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const + { + auto const [i] = alpaka::getIdx(acc); + c[i] = a[i] + b[i]; + } +}; + +//! Kernel to find the linear combination of 2 vectors by initially scaling one of them +struct TriadKernel +{ + //! The kernel entry point + //! \tparam TAcc The accelerator environment to be executed on. + //! \tparam T The data type + //! \param acc The accelerator to be executed on. + //! \param a Pointer for vector a + //! \param b Pointer for vector b + //! \param c Pointer for result vector c + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const + { + const T scalar = static_cast(scalarVal); + auto const [i] = alpaka::getIdx(acc); + c[i] = a[i] + scalar * b[i]; + } +}; + +//! Dot product of two vectors. The result is not a scalar but a vector of block-level dot products. For the +//! BabelStream implementation and documentation: https://github.com/UoB-HPC +struct DotKernel +{ + //! The kernel entry point + //! \tparam TAcc The accelerator environment to be executed on. + //! \tparam T The data type + //! \param acc The accelerator to be executed on. + //! \param a Pointer for vector a + //! \param b Pointer for vector b + //! \param sum Pointer for result vector consisting sums for each block + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, alpaka::Idx arraySize) const + { + using Idx = alpaka::Idx; + auto& tbSum = alpaka::declareSharedVar(acc); + + auto i = alpaka::getIdx(acc)[0]; + auto const local_i = alpaka::getIdx(acc)[0]; + auto const totalThreads = alpaka::getWorkDiv(acc)[0]; + + T threadSum = 0; + for(; i < arraySize; i += totalThreads) + threadSum += a[i] * b[i]; + tbSum[local_i] = threadSum; + + auto const blockSize = alpaka::getWorkDiv(acc)[0]; + for(Idx offset = blockSize / 2; offset > 0; offset /= 2) + { + alpaka::syncBlockThreads(acc); + if(local_i < offset) + tbSum[local_i] += tbSum[local_i + offset]; + } + + auto const gridBlockIndex = alpaka::getIdx(acc)[0]; + if(local_i == 0) + sum[gridBlockIndex] = tbSum[local_i]; + } +}; + +//! \brief The Function for testing babelstream kernels for given Acc type and data type. +//! \tparam TAcc the accelerator type +//! \tparam DataType The data type to differentiate single or double data type based tests. 
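+//! \note Every kernel is launched numberOfRuns times and each launch is timed individually; the first
+//! measurement is treated as a warm-up and excluded from the min/max/average statistics
+//! (see findMinMax and findAverage in babelStreamCommon.hpp).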
+template +void testKernels() +{ + using Acc = TAcc; + // Define the index domain + // Set the number of dimensions as an integral constant. Set to 1 for 1D. + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + + // Meta data + // A MetaData class instance to keep the problem and results to print later + MetaData metaData; + std::string dataTypeStr; + if(std::is_same::value) + { + dataTypeStr = "single"; + } + else if(std::is_same::value) + { + dataTypeStr = "double"; + } + + using QueueAcc = alpaka::Queue; + + // Select a device + auto const platform = alpaka::Platform{}; + auto const devAcc = alpaka::getDevByIdx(platform, 0); + + // Create a queue on the device + QueueAcc queue(devAcc); + + // Get the host device for allocating memory on the host. + auto const platformHost = alpaka::PlatformCpu{}; + auto const devHost = alpaka::getDevByIdx(platformHost, 0); + + // Create vectors + Idx arraySize = static_cast(arraySizeMain); + + // Acc buffers + auto bufAccInputA = alpaka::allocBuf(devAcc, arraySize); + auto bufAccInputB = alpaka::allocBuf(devAcc, arraySize); + auto bufAccOutputC = alpaka::allocBuf(devAcc, arraySize); + + // Host buffer as the result + auto bufHostOutputA = alpaka::allocBuf(devHost, arraySize); + auto bufHostOutputB = alpaka::allocBuf(devHost, arraySize); + auto bufHostOutputC = alpaka::allocBuf(devHost, arraySize); + + // Grid size and elems per thread will be used to get the work division + using Vec = alpaka::Vec; + auto const elementsPerThread = Vec::all(static_cast(1)); + auto const elementsPerGrid = Vec::all(arraySize); + + // Create pointer variables for buffer access + auto bufAccInputAPtr = std::data(bufAccInputA); + auto bufAccInputBPtr = std::data(bufAccInputB); + auto bufAccOutputCPtr = std::data(bufAccOutputC); + + // Bind gridsize and elements per thread together + alpaka::KernelCfg const kernelCfg = {elementsPerGrid, elementsPerThread}; + // Let alpaka calculate good work division (namely the block and grid sizes) given our full problem extent + auto const workDivInit = alpaka::getValidWorkDiv( + kernelCfg, + devAcc, + InitKernel(), + bufAccInputAPtr, + bufAccInputBPtr, + bufAccOutputCPtr, + static_cast(valA)); + auto const workDivCopy + = alpaka::getValidWorkDiv(kernelCfg, devAcc, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); + auto const workDivMult + = alpaka::getValidWorkDiv(kernelCfg, devAcc, MultKernel(), bufAccInputAPtr, bufAccInputBPtr); + auto const workDivAdd + = alpaka::getValidWorkDiv(kernelCfg, devAcc, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); + + auto const workDivTriad = alpaka::getValidWorkDiv( + kernelCfg, + devAcc, + TriadKernel(), + bufAccInputAPtr, + bufAccInputBPtr, + bufAccOutputCPtr); + + // Vector of average run-times of babelstream kernels + std::vector avgExecTimesOfKernels; + std::vector minExecTimesOfKernels; + std::vector maxExecTimesOfKernels; + std::vector kernelLabels; + // Vector for collecting successive run-times of a single kernel in benchmark macro + std::vector times; + + // Lambda for measuring run-time + auto measureKernelExec = [&](auto&& kernelFunc, [[maybe_unused]] auto&& kernelLabel) + { + for(auto i = 0; i < numberOfRuns; i++) + { + double runtime = 0.0; + auto start = std::chrono::high_resolution_clock::now(); + kernelFunc(); + alpaka::wait(queue); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = end - start; + runtime = duration.count(); + times.push_back(runtime); + } + + // find the minimum of the durations array. 
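+        // Note: the first measurement typically includes one-time setup costs (e.g. lazy device or
+        // runtime initialization), which is why it is treated as a warm-up run.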
+ // In benchmarking the first item of the runtimes array is not included in calculations. + const auto minmaxPair = findMinMax(times); + minExecTimesOfKernels.push_back(minmaxPair.first); + maxExecTimesOfKernels.push_back(minmaxPair.second); + avgExecTimesOfKernels.push_back(findAverage(times)); + kernelLabels.push_back(kernelLabel); + times.clear(); + }; + + // Run kernels one by one + // Test the init-kernel. + measureKernelExec( + [&]() + { + alpaka::exec( + queue, + workDivInit, + InitKernel(), + bufAccInputAPtr, + bufAccInputBPtr, + bufAccOutputCPtr, + static_cast(valA)); + }, + "InitKernel"); + + // Test the copy-kernel. Copy A one by one to B. + measureKernelExec( + [&]() { alpaka::exec(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); }, + "CopyKernel"); + + // Test the scaling-kernel. Calculate B=scalar*A. + measureKernelExec( + [&]() { alpaka::exec(queue, workDivMult, MultKernel(), bufAccInputAPtr, bufAccInputBPtr); }, + "MultKernel"); + + // Test the addition-kernel. Calculate C=A+B. Where B=scalar*A. + measureKernelExec( + [&]() + { alpaka::exec(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); }, + "AddKernel"); + + // Test the Triad-kernel. Calculate C=A+scalar*B where B=scalar*A. + measureKernelExec( + [&]() + { alpaka::exec(queue, workDivTriad, TriadKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); }, + "TriadKernel"); + + + // Copy arrays back to host + alpaka::memcpy(queue, bufHostOutputC, bufAccOutputC, arraySize); + alpaka::memcpy(queue, bufHostOutputB, bufAccInputB, arraySize); + alpaka::memcpy(queue, bufHostOutputA, bufAccInputA, arraySize); + + // Verify the results + // + // Find sum of the errors as sum of the differences from expected values + DataType initVal{static_cast(0.0)}; + DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal}; + + auto const expectedC = static_cast(valA + scalarVal * scalarVal * valA); + auto const expectedB = static_cast(scalarVal * valA); + auto const expectedA = static_cast(valA); + + // sum of the errors for each array + for(Idx i = 0; i < arraySize; ++i) + { + sumErrC += bufHostOutputC[static_cast(i)] - expectedC; + sumErrB += bufHostOutputB[static_cast(i)] - expectedB; + sumErrA += bufHostOutputA[static_cast(i)] - expectedA; + } + + // Normalize and compare sum of the errors + REQUIRE(FuzzyEqual(sumErrC / static_cast(arraySize) / expectedC, static_cast(0.0))); + REQUIRE(FuzzyEqual(sumErrB / static_cast(arraySize) / expectedB, static_cast(0.0))); + REQUIRE(FuzzyEqual(sumErrA / static_cast(arraySize) / expectedA, static_cast(0.0))); + alpaka::wait(queue); + + // Test Dot kernel with specific blocksize which is larger than 1 + if constexpr(alpaka::accMatchesTags) + { + using WorkDiv = alpaka::WorkDivMembers; + // Threads per block for Dot kernel + constexpr Idx blockThreadExtent = blockThreadExtentMain; + // Blocks per grid for Dot kernel + constexpr Idx gridBlockExtent = static_cast(256); + // Vector of sums of each block + auto bufAccSumPerBlock = alpaka::allocBuf(devAcc, gridBlockExtent); + auto bufHostSumPerBlock = alpaka::allocBuf(devHost, gridBlockExtent); + // A specific work-division is used for dotKernel + auto const workDivDot = WorkDiv{Vec{gridBlockExtent}, Vec{blockThreadExtent}, Vec::all(1)}; + + measureKernelExec( + [&]() + { + alpaka::exec( + queue, + workDivDot, + DotKernel(), // Dot kernel + alpaka::getPtrNative(bufAccInputA), + alpaka::getPtrNative(bufAccInputB), + alpaka::getPtrNative(bufAccSumPerBlock), + static_cast>(arraySize)); + }, + 
"DotKernel"); + + alpaka::memcpy(queue, bufHostSumPerBlock, bufAccSumPerBlock, gridBlockExtent); + alpaka::wait(queue); + + DataType const* sumPtr = std::data(bufHostSumPerBlock); + auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0}); + // Since vector values are 1, dot product should be identical to arraySize + REQUIRE(FuzzyEqual(static_cast(result), static_cast(arraySize * 2))); + // Add workdiv to the list of workdivs to print later + metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot); + } + + + // + // Calculate and Display Benchmark Results + // + std::vector bytesReadWriteMB = { + getDataThroughput(2u, static_cast(arraySize)), + getDataThroughput(2u, static_cast(arraySize)), + getDataThroughput(2u, static_cast(arraySize)), + getDataThroughput(3u, static_cast(arraySize)), + getDataThroughput(3u, static_cast(arraySize)), + getDataThroughput(2u, static_cast(arraySize)), + }; + + // calculate the bandwidth as throughput per seconds + std::vector bandwidthsPerKernel; + if(minExecTimesOfKernels.size() == kernelLabels.size()) + { + for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i) + { + bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i))); + } + } + + // Setting fields of Benchmark Info map. All information about benchmark and results are stored in a single map + metaData.setItem(BMInfoDataType::TimeStamp, getCurrentTimestamp()); + metaData.setItem(BMInfoDataType::NumRuns, std::to_string(numberOfRuns)); + metaData.setItem(BMInfoDataType::DataSize, std::to_string(arraySizeMain)); + metaData.setItem(BMInfoDataType::DataType, dataTypeStr); + + metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit); + metaData.setItem(BMInfoDataType::WorkDivCopy, workDivCopy); + metaData.setItem(BMInfoDataType::WorkDivAdd, workDivAdd); + metaData.setItem(BMInfoDataType::WorkDivMult, workDivMult); + metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad); + + // Device and accelerator + metaData.setItem(BMInfoDataType::DeviceName, alpaka::getName(devAcc)); + metaData.setItem(BMInfoDataType::AcceleratorType, alpaka::getAccName()); + // XML reporter of catch2 always converts to Nano Seconds + metaData.setItem(BMInfoDataType::TimeUnit, "Nano Seconds"); + // Join elements and create a comma separated string + metaData.setItem(BMInfoDataType::KernelNames, joinElements(kernelLabels, ", ")); + metaData.setItem(BMInfoDataType::KernelDataUsageValues, joinElements(bytesReadWriteMB, ", ")); + metaData.setItem(BMInfoDataType::KernelBandwidths, joinElements(bandwidthsPerKernel, ", ")); + metaData.setItem(BMInfoDataType::KernelMinTimes, joinElements(minExecTimesOfKernels, ", ")); + metaData.setItem(BMInfoDataType::KernelMaxTimes, joinElements(maxExecTimesOfKernels, ", ")); + metaData.setItem(BMInfoDataType::KernelAvgTimes, joinElements(avgExecTimesOfKernels, ", ")); + + // Print the summary as a table, if a standard serialization is needed other functions of the class can be used + std::cout << metaData.serializeAsTable() << std::endl; +} + +using TestAccs1D = alpaka::test::EnabledAccs, std::uint32_t>; + +// Run for all Accs given by the argument +TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels", "[benchmark-test]", TestAccs1D) +{ + using Acc = TestType; + // Run tests for the float data type + testKernels(); +} + +// Run for all Accs given by the argument +TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels", "[benchmark-test]", TestAccs1D) +{ + using Acc = TestType; + // Run tests for the double data type + 
testKernels(); +} diff --git a/benchmarks/babelstream/src/main.cpp b/benchmarks/babelstream/src/main.cpp deleted file mode 100644 index acef1c33a60c..000000000000 --- a/benchmarks/babelstream/src/main.cpp +++ /dev/null @@ -1,588 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -// NOLINTBEGIN - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define VERSION_STRING "4.0" - -#include "Stream.h" - -#if defined(CUDA) -# include "CUDAStream.h" -#elif defined(STD_DATA) -# include "STDDataStream.h" -#elif defined(STD_INDICES) -# include "STDIndicesStream.h" -#elif defined(STD_RANGES) -# include "STDRangesStream.hpp" -#elif defined(TBB) -# include "TBBStream.hpp" -#elif defined(THRUST) -# include "ThrustStream.h" -#elif defined(HIP) -# include "HIPStream.h" -#elif defined(HC) -# include "HCStream.h" -#elif defined(OCL) -# include "OCLStream.h" -#elif defined(USE_RAJA) -# include "RAJAStream.hpp" -#elif defined(KOKKOS) -# include "KokkosStream.hpp" -#elif defined(ACC) -# include "ACCStream.h" -#elif defined(SYCL) -# include "SYCLStream.h" -#elif defined(SYCL2020) -# include "SYCLStream2020.h" -#elif defined(OMP) -# include "OMPStream.h" -#elif defined(ALPAKA) -# include "AlpakaStream.h" -#endif - -// Default size of 2^25 -int ARRAY_SIZE = 33'554'432; -unsigned int num_times = 100; -unsigned int deviceIndex = 0; -bool use_float = false; -bool output_as_csv = false; -bool mibibytes = false; -std::string csv_separator = ","; - -template -void check_solution(unsigned int const ntimes, std::vector& a, std::vector& b, std::vector& c, T& sum); - -template -void run(); - -// Options for running the benchmark: -// - All 5 kernels (Copy, Add, Mul, Triad, Dot). -// - Triad only. -// - Nstream only. -enum class Benchmark -{ - All, - Triad, - Nstream -}; - -// Selected run options. 
-Benchmark selection = Benchmark::All; - -void parseArguments(int argc, char* argv[]); - -int main(int argc, char* argv[]) -{ - parseArguments(argc, argv); - - if(!output_as_csv) - { - std::cout << "BabelStream" << std::endl - << "Version: " << VERSION_STRING << std::endl - << "Implementation: " << IMPLEMENTATION_STRING << std::endl; - } - - if(use_float) - run(); - else - run(); -} - -// Run the 5 main kernels -template -std::vector> run_all(Stream* stream, T& sum) -{ - // List of times - std::vector> timings(5); - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Main loop - for(unsigned int k = 0; k < num_times; k++) - { - // Execute Copy - t1 = std::chrono::high_resolution_clock::now(); - stream->copy(); - t2 = std::chrono::high_resolution_clock::now(); - timings[0].push_back(std::chrono::duration_cast>(t2 - t1).count()); - - // Execute Mul - t1 = std::chrono::high_resolution_clock::now(); - stream->mul(); - t2 = std::chrono::high_resolution_clock::now(); - timings[1].push_back(std::chrono::duration_cast>(t2 - t1).count()); - - // Execute Add - t1 = std::chrono::high_resolution_clock::now(); - stream->add(); - t2 = std::chrono::high_resolution_clock::now(); - timings[2].push_back(std::chrono::duration_cast>(t2 - t1).count()); - - // Execute Triad - t1 = std::chrono::high_resolution_clock::now(); - stream->triad(); - t2 = std::chrono::high_resolution_clock::now(); - timings[3].push_back(std::chrono::duration_cast>(t2 - t1).count()); - - // Execute Dot - t1 = std::chrono::high_resolution_clock::now(); - sum = stream->dot(); - t2 = std::chrono::high_resolution_clock::now(); - timings[4].push_back(std::chrono::duration_cast>(t2 - t1).count()); - } - - // Compiler should use a move - return timings; -} - -// Run the Triad kernel -template -std::vector> run_triad(Stream* stream) -{ - std::vector> timings(1); - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Run triad in loop - t1 = std::chrono::high_resolution_clock::now(); - for(unsigned int k = 0; k < num_times; k++) - { - stream->triad(); - } - t2 = std::chrono::high_resolution_clock::now(); - - double runtime = std::chrono::duration_cast>(t2 - t1).count(); - timings[0].push_back(runtime); - - return timings; -} - -// Run the Nstream kernel -template -std::vector> run_nstream(Stream* stream) -{ - std::vector> timings(1); - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Run nstream in loop - for(int k = 0; k < num_times; k++) - { - t1 = std::chrono::high_resolution_clock::now(); - stream->nstream(); - t2 = std::chrono::high_resolution_clock::now(); - timings[0].push_back(std::chrono::duration_cast>(t2 - t1).count()); - } - - return timings; -} - -// Generic run routine -// Runs the kernel(s) and prints output. 
-template -void run() -{ - std::streamsize ss = std::cout.precision(); - - if(!output_as_csv) - { - if(selection == Benchmark::All) - std::cout << "Running kernels " << num_times << " times" << std::endl; - else if(selection == Benchmark::Triad) - { - std::cout << "Running triad " << num_times << " times" << std::endl; - std::cout << "Number of elements: " << ARRAY_SIZE << std::endl; - } - - - if(sizeof(T) == sizeof(float)) - std::cout << "Precision: float" << std::endl; - else - std::cout << "Precision: double" << std::endl; - - - if(mibibytes) - { - // MiB = 2^20 - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE * sizeof(T) * pow(2.0, -20.0) << " MiB" - << " (=" << ARRAY_SIZE * sizeof(T) * pow(2.0, -30.0) << " GiB)" << std::endl; - std::cout << "Total size: " << 3.0 * ARRAY_SIZE * sizeof(T) * pow(2.0, -20.0) << " MiB" - << " (=" << 3.0 * ARRAY_SIZE * sizeof(T) * pow(2.0, -30.0) << " GiB)" << std::endl; - } - else - { - // MB = 10^6 - std::cout << std::setprecision(1) << std::fixed << "Array size: " << ARRAY_SIZE * sizeof(T) * 1.0E-6 - << " MB" - << " (=" << ARRAY_SIZE * sizeof(T) * 1.0E-9 << " GB)" << std::endl; - std::cout << "Total size: " << 3.0 * ARRAY_SIZE * sizeof(T) * 1.0E-6 << " MB" - << " (=" << 3.0 * ARRAY_SIZE * sizeof(T) * 1.0E-9 << " GB)" << std::endl; - } - std::cout.precision(ss); - } - - Stream* stream; - -#if defined(CUDA) - // Use the CUDA implementation - stream = new CUDAStream(ARRAY_SIZE, deviceIndex); - -#elif defined(HIP) - // Use the HIP implementation - stream = new HIPStream(ARRAY_SIZE, deviceIndex); - -#elif defined(HC) - // Use the HC implementation - stream = new HCStream(ARRAY_SIZE, deviceIndex); - -#elif defined(OCL) - // Use the OpenCL implementation - stream = new OCLStream(ARRAY_SIZE, deviceIndex); - -#elif defined(USE_RAJA) - // Use the RAJA implementation - stream = new RAJAStream(ARRAY_SIZE, deviceIndex); - -#elif defined(KOKKOS) - // Use the Kokkos implementation - stream = new KokkosStream(ARRAY_SIZE, deviceIndex); - -#elif defined(STD_DATA) - // Use the C++ STD data-oriented implementation - stream = new STDDataStream(ARRAY_SIZE, deviceIndex); - -#elif defined(STD_INDICES) - // Use the C++ STD index-oriented implementation - stream = new STDIndicesStream(ARRAY_SIZE, deviceIndex); - -#elif defined(STD_RANGES) - // Use the C++ STD ranges implementation - stream = new STDRangesStream(ARRAY_SIZE, deviceIndex); - -#elif defined(TBB) - // Use the C++20 implementation - stream = new TBBStream(ARRAY_SIZE, deviceIndex); - -#elif defined(THRUST) - // Use the Thrust implementation - stream = new ThrustStream(ARRAY_SIZE, deviceIndex); - -#elif defined(ACC) - // Use the OpenACC implementation - stream = new ACCStream(ARRAY_SIZE, deviceIndex); - -#elif defined(SYCL) || defined(SYCL2020) - // Use the SYCL implementation - stream = new SYCLStream(ARRAY_SIZE, deviceIndex); - -#elif defined(OMP) - // Use the OpenMP implementation - stream = new OMPStream(ARRAY_SIZE, deviceIndex); - -#elif defined(ALPAKA) - // Use the alpaka implementation - stream = new AlpakaStream(ARRAY_SIZE, deviceIndex); - -#endif - - stream->init_arrays(startA, startB, startC); - - // Result of the Dot kernel, if used. 
-    T sum = 0.0;
-
-    std::vector<std::vector<double>> timings;
-
-    switch(selection)
-    {
-    case Benchmark::All:
-        timings = run_all<T>(stream, sum);
-        break;
-    case Benchmark::Triad:
-        timings = run_triad<T>(stream);
-        break;
-    case Benchmark::Nstream:
-        timings = run_nstream<T>(stream);
-        break;
-    };
-
-    // Check solutions
-    // Create host vectors
-    std::vector<T> a(ARRAY_SIZE);
-    std::vector<T> b(ARRAY_SIZE);
-    std::vector<T> c(ARRAY_SIZE);
-
-
-    stream->read_arrays(a, b, c);
-    check_solution<T>(num_times, a, b, c, sum);
-
-    // Display timing results
-    if(output_as_csv)
-    {
-        std::cout << "function" << csv_separator << "num_times" << csv_separator << "n_elements" << csv_separator
-                  << "sizeof" << csv_separator << ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec")
-                  << csv_separator << "min_runtime" << csv_separator << "max_runtime" << csv_separator << "avg_runtime"
-                  << std::endl;
-    }
-    else
-    {
-        std::cout << std::left << std::setw(12) << "Function" << std::left << std::setw(12)
-                  << ((mibibytes) ? "MiBytes/sec" : "MBytes/sec") << std::left << std::setw(12) << "Min (sec)"
-                  << std::left << std::setw(12) << "Max" << std::left << std::setw(12) << "Average" << std::endl
-                  << std::fixed;
-    }
-
-
-    if(selection == Benchmark::All || selection == Benchmark::Nstream)
-    {
-        std::vector<std::string> labels;
-        std::vector<size_t> sizes;
-
-        if(selection == Benchmark::All)
-        {
-            labels = {"Copy", "Mul", "Add", "Triad", "Dot"};
-            sizes
-                = {2 * sizeof(T) * ARRAY_SIZE,
-                   2 * sizeof(T) * ARRAY_SIZE,
-                   3 * sizeof(T) * ARRAY_SIZE,
-                   3 * sizeof(T) * ARRAY_SIZE,
-                   2 * sizeof(T) * ARRAY_SIZE};
-        }
-        else if(selection == Benchmark::Nstream)
-        {
-            labels = {"Nstream"};
-            sizes = {4 * sizeof(T) * ARRAY_SIZE};
-        }
-
-        for(int i = 0; i < timings.size(); ++i)
-        {
-            // Get min/max; ignore the first result
-            auto minmax = std::minmax_element(timings[i].begin() + 1, timings[i].end());
-
-            // Calculate average; ignore the first result
-            double average = std::accumulate(timings[i].begin() + 1, timings[i].end(), 0.0) / (double) (num_times - 1);
-
-            // Display results
-            if(output_as_csv)
-            {
-                std::cout << labels[i] << csv_separator << num_times << csv_separator << ARRAY_SIZE << csv_separator
-                          << sizeof(T) << csv_separator
-                          << ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator
-                          << *minmax.first << csv_separator << *minmax.second << csv_separator << average << std::endl;
-            }
-            else
-            {
-                std::cout << std::left << std::setw(12) << labels[i] << std::left << std::setw(12)
-                          << std::setprecision(3)
-                          << ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << std::left
-                          << std::setw(12) << std::setprecision(5) << *minmax.first << std::left << std::setw(12)
-                          << std::setprecision(5) << *minmax.second << std::left << std::setw(12)
-                          << std::setprecision(5) << average << std::endl;
-            }
-        }
-    }
-    else if(selection == Benchmark::Triad)
-    {
-        // Display timing results
-        double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times;
-        double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]);
-
-        if(output_as_csv)
-        {
-            std::cout << "function" << csv_separator << "num_times" << csv_separator << "n_elements" << csv_separator
-                      << "sizeof" << csv_separator << ((mibibytes) ? "gibytes_per_sec" : "gbytes_per_sec")
-                      << csv_separator << "runtime" << std::endl;
-            std::cout << "Triad" << csv_separator << num_times << csv_separator << ARRAY_SIZE << csv_separator
-                      << sizeof(T) << csv_separator << bandwidth << csv_separator << timings[0][0] << std::endl;
-        }
-        else
-        {
-            std::cout << "--------------------------------" << std::endl
-                      << std::fixed << "Runtime (seconds): " << std::left << std::setprecision(5) << timings[0][0]
-                      << std::endl
-                      << "Bandwidth (" << ((mibibytes) ? "GiB/s" : "GB/s") << "): " << std::left
-                      << std::setprecision(3) << bandwidth << std::endl;
-        }
-    }
-
-    delete stream;
-}
-
-template<typename T>
-void check_solution(unsigned int const ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
-{
-    // Generate correct solution
-    T goldA = startA;
-    T goldB = startB;
-    T goldC = startC;
-    T goldSum = 0.0;
-
-    const T scalar = startScalar;
-
-    for(unsigned int i = 0; i < ntimes; i++)
-    {
-        // Do STREAM!
-        if(selection == Benchmark::All)
-        {
-            goldC = goldA;
-            goldB = scalar * goldC;
-            goldC = goldA + goldB;
-            goldA = goldB + scalar * goldC;
-        }
-        else if(selection == Benchmark::Triad)
-        {
-            goldA = goldB + scalar * goldC;
-        }
-        else if(selection == Benchmark::Nstream)
-        {
-            goldA += goldB + scalar * goldC;
-        }
-    }
-
-    // Do the reduction
-    goldSum = goldA * goldB * ARRAY_SIZE;
-
-    // Calculate the average error
-    double errA
-        = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val) { return sum + fabs(val - goldA); });
-    errA /= a.size();
-    double errB
-        = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val) { return sum + fabs(val - goldB); });
-    errB /= b.size();
-    double errC
-        = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val) { return sum + fabs(val - goldC); });
-    errC /= c.size();
-    double errSum = fabs((sum - goldSum) / goldSum);
-
-    double epsi = std::numeric_limits<T>::epsilon() * 100.0;
-
-    if(errA > epsi)
-        std::cerr << "Validation failed on a[]. Average error " << errA << std::endl;
-    if(errB > epsi)
-        std::cerr << "Validation failed on b[]. Average error " << errB << std::endl;
-    if(errC > epsi)
-        std::cerr << "Validation failed on c[]. Average error " << errC << std::endl;
-    // Check sum to 8 decimal places
-    if(selection == Benchmark::All && errSum > 1.0E-8)
-        std::cerr << "Validation failed on sum. Error " << errSum << std::endl
-                  << std::setprecision(15) << "Sum was " << sum << " but should be " << goldSum << std::endl;
-}
-
-int parseUInt(char const* str, unsigned int* output)
-{
-    char* next;
-    *output = strtoul(str, &next, 10);
-    return !strlen(next);
-}
-
-int parseInt(char const* str, int* output)
-{
-    char* next;
-    *output = strtol(str, &next, 10);
-    return !strlen(next);
-}
-
-void parseArguments(int argc, char* argv[])
-{
-    for(int i = 1; i < argc; i++)
-    {
-        if(!std::string("--list").compare(argv[i]))
-        {
-            listDevices();
-            exit(EXIT_SUCCESS);
-        }
-        else if(!std::string("--device").compare(argv[i]))
-        {
-            if(++i >= argc || !parseUInt(argv[i], &deviceIndex))
-            {
-                std::cerr << "Invalid device index." << std::endl;
-                exit(EXIT_FAILURE);
-            }
-        }
-        else if(!std::string("--arraysize").compare(argv[i]) || !std::string("-s").compare(argv[i]))
-        {
-            if(++i >= argc || !parseInt(argv[i], &ARRAY_SIZE) || ARRAY_SIZE <= 0)
-            {
-                std::cerr << "Invalid array size." << std::endl;
-                exit(EXIT_FAILURE);
-            }
-        }
-        else if(!std::string("--numtimes").compare(argv[i]) || !std::string("-n").compare(argv[i]))
-        {
-            if(++i >= argc || !parseUInt(argv[i], &num_times))
-            {
-                std::cerr << "Invalid number of times." << std::endl;
-                exit(EXIT_FAILURE);
-            }
-            if(num_times < 2)
-            {
-                std::cerr << "Number of times must be 2 or more" << std::endl;
-                exit(EXIT_FAILURE);
-            }
-        }
-        else if(!std::string("--float").compare(argv[i]))
-        {
-            use_float = true;
-        }
-        else if(!std::string("--triad-only").compare(argv[i]))
-        {
-            selection = Benchmark::Triad;
-        }
-        else if(!std::string("--nstream-only").compare(argv[i]))
-        {
-            selection = Benchmark::Nstream;
-        }
-        else if(!std::string("--csv").compare(argv[i]))
-        {
-            output_as_csv = true;
-        }
-        else if(!std::string("--mibibytes").compare(argv[i]))
-        {
-            mibibytes = true;
-        }
-        else if(!std::string("--help").compare(argv[i]) || !std::string("-h").compare(argv[i]))
-        {
-            std::cout << std::endl;
-            std::cout << "Usage: " << argv[0] << " [OPTIONS]" << std::endl << std::endl;
-            std::cout << "Options:" << std::endl;
-            std::cout << "  -h  --help               Print the message" << std::endl;
-            std::cout << "      --list               List available devices" << std::endl;
-            std::cout << "      --device     INDEX   Select device at INDEX" << std::endl;
-            std::cout << "  -s  --arraysize  SIZE    Use SIZE elements in the array" << std::endl;
-            std::cout << "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)" << std::endl;
-            std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
-            std::cout << "      --triad-only         Only run triad" << std::endl;
-            std::cout << "      --nstream-only       Only run nstream" << std::endl;
-            std::cout << "      --csv                Output as csv table" << std::endl;
-            std::cout << "      --mibibytes          Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
-                      << std::endl;
-            std::cout << std::endl;
-            exit(EXIT_SUCCESS);
-        }
-        else
-        {
-            std::cerr << "Unrecognized argument '" << argv[i] << "' (try '--help')" << std::endl;
-            exit(EXIT_FAILURE);
-        }
-    }
-}
-
-// NOLINTEND
diff --git a/thirdParty/CMakeLists.txt b/thirdParty/CMakeLists.txt
index 826179d78ee4..97ed4d5f9911 100644
--- a/thirdParty/CMakeLists.txt
+++ b/thirdParty/CMakeLists.txt
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-if(BUILD_TESTING)
+if(BUILD_TESTING OR alpaka_BUILD_BENCHMARKS)
     if(alpaka_USE_INTERNAL_CATCH2)
         message(STATUS "Catch2: Using INTERNAL version 3.5.2")
         # Force Catch2's CMake to pick up the variables we set below