From 66cb5015f0b6debcd2dfbfedb790f51b7b709997 Mon Sep 17 00:00:00 2001
From: JhaShweta1
Date: Wed, 17 Jul 2024 22:47:19 -0500
Subject: [PATCH 1/3] added changes to support NUMA

---
 example.cpp                       | 46 +++++++++++++++++++++++++
 include/oneapi/tbb/parallel_for.h | 57 ++++++++++++++++++++++++++++++-
 include/oneapi/tbb/partitioner.h  | 34 +++++++++++++++++-
 p_for.cpp                         | 44 ++++++++++++++++++++++++
 4 files changed, 179 insertions(+), 2 deletions(-)
 create mode 100644 example.cpp
 create mode 100644 p_for.cpp

diff --git a/example.cpp b/example.cpp
new file mode 100644
index 0000000000..2a3b5206b9
--- /dev/null
+++ b/example.cpp
@@ -0,0 +1,46 @@
+#include <iostream>
+#include <vector>
+#include <numeric>
+#include <atomic>
+#include <chrono>
+#include <oneapi/tbb.h>
+
+int main() {
+    // Initialize a vector with some values
+    size_t data_size = 1 << 20; // Example: ~1 million elements
+    std::vector<float> data(data_size);
+    std::iota(data.begin(), data.end(), 1.0f); // Fill the vector with values 1, 2, ..., data_size
+
+    // Atomic variable to hold the total sum
+    std::atomic<float> total_sum(0.0f);
+
+    // Lambda function to add local sums to the total sum atomically
+    auto atomic_add = [](std::atomic<float>& target, float value) {
+        float current = target.load();
+        while (!target.compare_exchange_weak(current, current + value));
+    };
+
+    // Create a static partitioner
+    tbb::static_partitioner partitioner;
+
+    // Start the timer
+    auto start_time = std::chrono::high_resolution_clock::now();
+
+    // Parallel sum using tbb::parallel_for with the NUMA-aware partitioner
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, data.size()),
+        [&data, &total_sum, &atomic_add](const tbb::blocked_range<size_t>& range) {
+            float local_sum = 0;
+            for (size_t i = range.begin(); i != range.end(); ++i) {
+                local_sum += data[i];
+            }
+            atomic_add(total_sum, local_sum);
+        }, tbb::numa_partitioner<tbb::static_partitioner>());
+
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
+
+    // Print the total sum and the elapsed time
+    std::cout << "Total sum: " << total_sum.load() << std::endl;
+    std::cout << "Execution time: " << duration << " milliseconds" << std::endl;
+    return 0;
+}
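Note (not part of the patch): example.cpp funnels every chunk's result through a compare-exchange loop on one shared atomic. For comparison, a minimal sketch of the same sum written with the stock tbb::parallel_reduce API, which combines per-chunk partial sums without any shared atomic; the function name parallel_sum is illustrative only:

    // Sketch: the reduction from example.cpp expressed with tbb::parallel_reduce.
    // Each chunk builds a local sum; std::plus merges the partial results.
    #include <oneapi/tbb/blocked_range.h>
    #include <oneapi/tbb/parallel_reduce.h>
    #include <functional>
    #include <vector>

    float parallel_sum(const std::vector<float>& data) {
        return tbb::parallel_reduce(
            tbb::blocked_range<size_t>(0, data.size()), 0.0f,
            [&](const tbb::blocked_range<size_t>& r, float local_sum) {
                for (size_t i = r.begin(); i != r.end(); ++i)
                    local_sum += data[i];   // per-chunk partial sum
                return local_sum;
            },
            std::plus<float>());            // combine partial sums
    }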
diff --git a/include/oneapi/tbb/parallel_for.h b/include/oneapi/tbb/parallel_for.h
index 37a2613508..a89e8fe2a5 100644
--- a/include/oneapi/tbb/parallel_for.h
+++ b/include/oneapi/tbb/parallel_for.h
@@ -30,6 +30,7 @@
 #include <cstddef>
 #include <new>
+#include "task_group.h"
 
 namespace tbb {
 namespace detail {
@@ -261,7 +262,61 @@ void parallel_for( const Range& range, const Body& body, affinity_partitioner& p
     start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner);
 }
 
-//! Parallel iteration over range with default partitioner and user-supplied context.
+template<typename Range, typename Body, typename T>
+__TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
+void parallel_for(const Range& range, const Body& body, const T& n_partitioner) {
+
+    size_t data_size = range.size();
+    size_t partition_size = data_size / n_partitioner.num_numa_nodes;
+    std::vector<tbb::task_group> task_groups(n_partitioner.num_numa_nodes);
+
+    // Allocate memory for data partitions with NUMA affinity
+    n_partitioner.data_partitions.resize(n_partitioner.num_numa_nodes);
+
+    /* for (size_t i = 0; i < n_partitioner.num_numa_nodes; ++i) {
+        n_partitioner.data_partitions[i].resize(partition_size);
+        // Set NUMA affinity (Linux specific)
+        numa_set_preferred(n_partitioner.numa_nodes[i]);
+    }*/
+
+    // Initialize the data in partitions
+    auto initialize_data = [&](std::vector<float>& partition, const tbb::blocked_range<size_t>& range) {
+        for (size_t j = range.begin(); j != range.end(); ++j) {
+            partition[j] = static_cast<float>(j);
+        }
+    };
+
+    // Initialize each node's partition inside its NUMA-bound arena
+    for (size_t i = 0; i < n_partitioner.num_numa_nodes; ++i) {
+        n_partitioner.arenas[i].execute([&task_groups, &n_partitioner, &range, &initialize_data, i]() {
+            task_groups[i].run([&, i] {
+                parallel_for(tbb::blocked_range<size_t>(0, n_partitioner.data_partitions[i].size()),
+                    [&](const tbb::blocked_range<size_t>& r) {
+                        initialize_data(n_partitioner.data_partitions[i], r);
+                }, n_partitioner.inner_partitioner);
+            });
+        });
+    }
+
+    // Run the user-supplied body inside each NUMA-bound arena
+    for (size_t i = 0; i < n_partitioner.num_numa_nodes; ++i) {
+        n_partitioner.arenas[i].execute([&task_groups, &n_partitioner, &range, &body, i]() {
+            task_groups[i].run([&, i] {
+                parallel_for(range,
+                    [&](const tbb::blocked_range<size_t>& r) {
+                        body(r);
+                    }, n_partitioner.inner_partitioner);
+            });
+        });
+    }
+
+    // Block until every per-node task group has finished
+    for (size_t i = 0; i < n_partitioner.num_numa_nodes; ++i) {
+        n_partitioner.arenas[i].execute([&task_groups, i]() {
+            task_groups[i].wait();
+        });
+    }
+}
+
+
+ //! Parallel iteration over range with default partitioner and user-supplied context.
 /** @ingroup algorithms **/
 template<typename Range, typename Body>
 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
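Aside (not part of the patch): the overload above is a variant of the arena-per-NUMA-node idiom from the oneTBB documentation. A self-contained sketch of that idiom using only public API follows; the iteration bound and loop body are placeholders, and numa_nodes() only reports real topology when the TBBbind/hwloc support library is available at run time:

    // Sketch: one task_arena pinned to each NUMA node, with a task_group per
    // arena so work can be submitted to all nodes before waiting on any of them.
    #include <oneapi/tbb/info.h>
    #include <oneapi/tbb/parallel_for.h>
    #include <oneapi/tbb/task_arena.h>
    #include <oneapi/tbb/task_group.h>
    #include <vector>

    void run_on_all_numa_nodes() {
        std::vector<tbb::numa_node_id> nodes = tbb::info::numa_nodes();
        std::vector<tbb::task_arena> arenas(nodes.size());
        std::vector<tbb::task_group> groups(nodes.size());

        // Bind one arena to each NUMA node.
        for (size_t i = 0; i < nodes.size(); ++i)
            arenas[i].initialize(tbb::task_arena::constraints(nodes[i]));

        // Submit work to every arena without blocking between submissions.
        for (size_t i = 0; i < nodes.size(); ++i)
            arenas[i].execute([&, i] {
                groups[i].run([&, i] {
                    tbb::parallel_for(0, 1000, [](int) { /* per-node work */ });
                });
            });

        // Wait inside each arena for its task group to drain.
        for (size_t i = 0; i < nodes.size(); ++i)
            arenas[i].execute([&, i] { groups[i].wait(); });
    }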
diff --git a/include/oneapi/tbb/partitioner.h b/include/oneapi/tbb/partitioner.h
index f09786c022..d0640e34af 100644
--- a/include/oneapi/tbb/partitioner.h
+++ b/include/oneapi/tbb/partitioner.h
@@ -67,7 +67,8 @@ class static_partitioner;
 class affinity_partitioner;
 class affinity_partition_type;
 class affinity_partitioner_base;
-
+template <typename T> class numa_partitioner;
+
 inline std::size_t get_initial_auto_partitioner_divisor() {
     const std::size_t factor = 4;
     return factor * static_cast<std::size_t>(max_concurrency());
@@ -654,6 +655,35 @@ class affinity_partitioner : affinity_partitioner_base {
     typedef affinity_partition_type::split_type split_type;
 };
 
+template <typename T>
+class numa_partitioner{
+public:
+    numa_partitioner() {
+        printf("Creating Numa partitioner\n");
+        numa_nodes = oneapi::tbb::info::numa_nodes();
+        num_numa_nodes = numa_nodes.size();
+
+        // Create a task arena for each valid NUMA node
+        for (int node : numa_nodes) {
+            arenas.emplace_back(tbb::task_arena::constraints().set_numa_id(node));
+        }
+
+    }
+
+    mutable std::vector<int> numa_nodes;
+    mutable std::vector<tbb::task_arena> arenas;
+    mutable std::vector<std::vector<float>> data_partitions;
+    T inner_partitioner;
+    //std::vector<std::vector<float>>& get_data_partitions() {
+    //    return data_partitions;
+    //}
+    size_t num_numa_nodes;
+private:
+    template<typename Range, typename Body, typename Partitioner> friend struct start_for;
+    //typedef hierarchical_partition_type task_partition_type;
+    typedef detail::proportional_split split_type;
+};
+
 } // namespace d1
 } // namespace detail
@@ -663,6 +693,8 @@ using detail::d1::auto_partitioner;
 using detail::d1::simple_partitioner;
 using detail::d1::static_partitioner;
 using detail::d1::affinity_partitioner;
+using detail::d1::numa_partitioner;
+
 // Split types
 using detail::split;
 using detail::proportional_split;
diff --git a/p_for.cpp b/p_for.cpp
new file mode 100644
index 0000000000..28b07acb6f
--- /dev/null
+++ b/p_for.cpp
@@ -0,0 +1,44 @@
+#include <iostream>
+#include <vector>
+#include <numeric>
+
+#include <oneapi/tbb.h>
+
+int main(int argc, char **argv)
+{
+    auto values = std::vector<double>(10000);
+
+    tbb::parallel_for( tbb::blocked_range<size_t>(0,values.size()),
+        [&](tbb::blocked_range<size_t> r)
+        {
+            for (size_t i=r.begin(); i<r.end(); ++i)
+            {
+                values[i] = double(i);
+            }
+        }, tbb::numa_partitioner<tbb::static_partitioner>());
+    double total = 0;
+
+    for (double value : values)
+    {
+        total += value;
+    }
+
+    std::cout << total << std::endl;
+
+    total = 0;
+    tbb::parallel_for( tbb::blocked_range<size_t>(0,values.size()),
+        [&](tbb::blocked_range<size_t> r)
+        {
+            for (size_t i=r.begin(); i<r.end(); ++i)
+            {
+                values[i] = double(i);
+            }
+        }, tbb::static_partitioner());
+
+    for (double value : values)
+    {
+        total += value;
+    }
+
+    std::cout << total << std::endl;
+    return 0;
+}

From: JhaShweta1
Date: Thu, 18 Jul 2024 11:06:08 -0500
Subject: [PATCH 2/3] minor changes, started 2D add

---
 include/oneapi/tbb/parallel_for.h | 21 +++++++++++++--------
 include/oneapi/tbb/partitioner.h  |  2 +-
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/include/oneapi/tbb/parallel_for.h b/include/oneapi/tbb/parallel_for.h
index a89e8fe2a5..3f5a2899c0 100644
--- a/include/oneapi/tbb/parallel_for.h
+++ b/include/oneapi/tbb/parallel_for.h
@@ -262,23 +262,28 @@ void parallel_for( const Range& range, const Body& body, affinity_partitioner& p
     start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner);
 }
 
+template <typename R, typename = void>
+struct is_2d_range : std::false_type {};
+
+template <typename R>
+struct is_2d_range<R, std::void_t<decltype(std::declval<R>().rows()), decltype(std::declval<R>().cols())>> : std::true_type {};
+
 template<typename Range, typename Body, typename T>
 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
 void parallel_for(const Range& range, const Body& body, const T& n_partitioner) {
 
-    size_t data_size = range.size();
+    //size_t data_size = range.size();
+    size_t data_size;
+    if constexpr (is_2d_range<Range>::value) {
+        data_size = (range.rows().end() - range.rows().begin()) * (range.cols().end() - range.cols().begin());
+    } else {
+        data_size = range.end() - range.begin();
+    }
     size_t partition_size = data_size / n_partitioner.num_numa_nodes;
     std::vector<tbb::task_group> task_groups(n_partitioner.num_numa_nodes);
 
-    // Allocate memory for data partitions with NUMA affinity
     n_partitioner.data_partitions.resize(n_partitioner.num_numa_nodes);
 
-    /* for (size_t i = 0; i < n_partitioner.num_numa_nodes; ++i) {
-        n_partitioner.data_partitions[i].resize(partition_size);
-        // Set NUMA affinity (Linux specific)
-        numa_set_preferred(n_partitioner.numa_nodes[i]);
-    }*/
-
     // Initialize the data in partitions
     auto initialize_data = [&](std::vector<float>& partition, const tbb::blocked_range<size_t>& range) {
         for (size_t j = range.begin(); j != range.end(); ++j) {
diff --git a/include/oneapi/tbb/partitioner.h b/include/oneapi/tbb/partitioner.h
index d0640e34af..5cca8166fc 100644
--- a/include/oneapi/tbb/partitioner.h
+++ b/include/oneapi/tbb/partitioner.h
@@ -679,7 +679,7 @@ class numa_partitioner{
     //}
     size_t num_numa_nodes;
 private:
-    template<typename Range, typename Body, typename Partitioner> friend struct start_for;
+    //template<typename Range, typename Body, typename Partitioner> friend struct start_for;
     //typedef hierarchical_partition_type task_partition_type;
     typedef detail::proportional_split split_type;
 };

From 0a6e91beecda061d018ae430ed511b54ee4f079b Mon Sep 17 00:00:00 2001
From: JhaShweta1
Date: Thu, 18 Jul 2024 13:53:22 -0500
Subject: [PATCH 3/3] changes to add benchmark-type in config

---
 include/oneapi/tbb/parallel_for.h | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/include/oneapi/tbb/parallel_for.h b/include/oneapi/tbb/parallel_for.h
index 3f5a2899c0..5d0829759e 100644
--- a/include/oneapi/tbb/parallel_for.h
+++ b/include/oneapi/tbb/parallel_for.h
@@ -30,7 +30,7 @@
 #include <cstddef>
 #include <new>
-#include "task_group.h"
+
 namespace tbb {
 namespace detail {
@@ -292,23 +292,16 @@ void parallel_for(const Range& range, const Body& body, const T& n_partitioner)
     };
 
     for (size_t i = 0; i < n_partitioner.num_numa_nodes; ++i) {
-        n_partitioner.arenas[i].execute([&task_groups, &n_partitioner, &range, &initialize_data, i]() {
+        n_partitioner.arenas[i].execute([&task_groups, &n_partitioner, &range, &initialize_data, &body, i]() {
             task_groups[i].run([&, i] {
                 parallel_for(tbb::blocked_range<size_t>(0, n_partitioner.data_partitions[i].size()),
                     [&](const tbb::blocked_range<size_t>& r) {
                         initialize_data(n_partitioner.data_partitions[i], r);
-                }, n_partitioner.inner_partitioner);
-            });
-        });
-    }
-
-    for (size_t i = 0; i < n_partitioner.num_numa_nodes; ++i) {
-        n_partitioner.arenas[i].execute([&task_groups, &n_partitioner, &range, &body, i]() {
-            task_groups[i].run([&, i] {
-                parallel_for(range,
-                    [&](const tbb::blocked_range<size_t>& r) {
-                        body(r);
-                    }, n_partitioner.inner_partitioner);
+                    }, n_partitioner.inner_partitioner);
+                parallel_for(range,
+                    [&](const tbb::blocked_range<size_t>& r) {
+                        body(r);
+                    }, n_partitioner.inner_partitioner);
             });
         });
     }
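Aside (not part of the patch): a compile-time check of the is_2d_range trait introduced in PATCH 2/3, restated locally so the sketch stands alone. tbb::blocked_range2d exposes rows() and cols() and should match; one-dimensional tbb::blocked_range should not:

    // Sketch: detection of 2D ranges via the rows()/cols() members,
    // mirroring the is_2d_range trait added to parallel_for.h.
    #include <oneapi/tbb/blocked_range.h>
    #include <oneapi/tbb/blocked_range2d.h>
    #include <type_traits>
    #include <utility>

    template <typename R, typename = void>
    struct is_2d_range : std::false_type {};

    template <typename R>
    struct is_2d_range<R, std::void_t<decltype(std::declval<R>().rows()),
                                      decltype(std::declval<R>().cols())>> : std::true_type {};

    static_assert(is_2d_range<tbb::blocked_range2d<int>>::value,
                  "blocked_range2d has rows() and cols()");
    static_assert(!is_2d_range<tbb::blocked_range<int>>::value,
                  "blocked_range is one-dimensional");

    int main() { return 0; }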