diff --git a/example.cpp b/example.cpp new file mode 100644 index 0000000000..2a3b5206b9 --- /dev/null +++ b/example.cpp @@ -0,0 +1,46 @@ +#include +#include +#include +#include +#include +#include + +int main() { + // Initialize a vector with some values + size_t data_size = 1 << 20; // Example: 1 million elements + std::vector data(data_size); + std::iota(data.begin(), data.end(), 1.0f); // Fill the vector with values 1, 2, ..., data_size + + // Atomic variable to hold the total sum + std::atomic total_sum(0.0f); + + // Lambda function to add local sums to the total sum atomically + auto atomic_add = [](std::atomic& target, float value) { + float current = target.load(); + while (!target.compare_exchange_weak(current, current + value)); + }; + + // Create a static partitioner + tbb::static_partitioner partitioner; + + // Start the timer + auto start_time = std::chrono::high_resolution_clock::now(); + + // Parallel sum using tbb::parallel_for with static_partitioner + tbb::parallel_for(tbb::blocked_range(0, data.size()), + [&data, &total_sum, &atomic_add](const tbb::blocked_range& range) { + float local_sum = 0; + for (size_t i = range.begin(); i != range.end(); ++i) { + local_sum += data[i]; + } + atomic_add(total_sum, local_sum); + }, tbb::numa_partitioner()); + + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end_time - start_time).count(); + + // Print the total sum + std::cout << "Total sum: " << total_sum.load() << std::endl; + std::cout << "Execution time: " << duration << " milliseconds" << std::endl; + return 0; +} diff --git a/include/oneapi/tbb/parallel_for.h b/include/oneapi/tbb/parallel_for.h index 37a2613508..5d0829759e 100644 --- a/include/oneapi/tbb/parallel_for.h +++ b/include/oneapi/tbb/parallel_for.h @@ -31,6 +31,7 @@ #include #include + namespace tbb { namespace detail { #if __TBB_CPP20_CONCEPTS_PRESENT @@ -261,7 +262,59 @@ void parallel_for( const Range& range, const Body& body, affinity_partitioner& p start_for::run(range,body,partitioner); } -//! Parallel iteration over range with default partitioner and user-supplied context. +template +struct is_2d_range : std::false_type {}; + +template +struct is_2d_range().rows()), decltype(std::declval().cols())>> : std::true_type {}; + +template +__TBB_requires(tbb_range && parallel_for_body) +void parallel_for(const Range& range, const Body& body, const T& n_partitioner) { + + //size_t data_size = range.size(); + size_t data_size; + if constexpr (is_2d_range::value) { + data_size = (range.rows().end() - range.rows().begin()) * (range.cols().end() - range.cols().begin()); + } else { + data_size = range.end() - range.begin(); + } + size_t partition_size = data_size / n_partitioner.num_numa_nodes; + std::vector task_groups(n_partitioner.num_numa_nodes); + + n_partitioner.data_partitions.resize(n_partitioner.num_numa_nodes); + + // Initialize the data in partitions + auto initialize_data = [&](std::vector& partition, const tbb::blocked_range& range) { + for (size_t j = range.begin(); j != range.end(); ++j) { + partition[j] = static_cast(j); + } + }; + + for (size_t i = 0; i < n_partitioner.num_numa_nodes; ++i) { + n_partitioner.arenas[i].execute([&task_groups, &n_partitioner, &range, &initialize_data, &body, i]() { + task_groups[i].run([&, i] { + parallel_for(tbb::blocked_range(0, n_partitioner.data_partitions[i].size()), + [&](const tbb::blocked_range& r) { + initialize_data(n_partitioner.data_partitions[i], r); + }, n_partitioner.inner_partitioner); + parallel_for(range, + [&](const tbb::blocked_range& r) { + body(r); + }, n_partitioner.inner_partitioner); + }); + }); + } + + for (size_t i = 0; i < n_partitioner.num_numa_nodes; ++i) { + n_partitioner.arenas[i].execute([&task_groups, i]() { + task_groups[i].wait(); + }); + } +} + + + //! Parallel iteration over range with default partitioner and user-supplied context. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_for_body) diff --git a/include/oneapi/tbb/partitioner.h b/include/oneapi/tbb/partitioner.h index f09786c022..5cca8166fc 100644 --- a/include/oneapi/tbb/partitioner.h +++ b/include/oneapi/tbb/partitioner.h @@ -67,7 +67,8 @@ class static_partitioner; class affinity_partitioner; class affinity_partition_type; class affinity_partitioner_base; - +template class numa_partitioner; + inline std::size_t get_initial_auto_partitioner_divisor() { const std::size_t factor = 4; return factor * static_cast(max_concurrency()); @@ -654,6 +655,35 @@ class affinity_partitioner : affinity_partitioner_base { typedef affinity_partition_type::split_type split_type; }; +template +class numa_partitioner{ +public: + numa_partitioner() { + printf("Creating Numa partitioner\n"); + numa_nodes = oneapi::tbb::info::numa_nodes(); + num_numa_nodes = numa_nodes.size(); + + // Create a task arena for each valid NUMA node + for (int node : numa_nodes) { + arenas.emplace_back(tbb::task_arena::constraints().set_numa_id(node)); + } + + } + + mutable std::vector numa_nodes; + mutable std::vector arenas; + mutable std::vector> data_partitions; + T inner_partitioner; + //std::vector>& get_data_partitions() { + //return data_partitions; + //} + size_t num_numa_nodes; +private: + //template friend struct start_for; + //typedef hierarchical_partition_type task_partition_type; + typedef detail::proportional_split split_type; +}; + } // namespace d1 } // namespace detail @@ -663,6 +693,8 @@ using detail::d1::auto_partitioner; using detail::d1::simple_partitioner; using detail::d1::static_partitioner; using detail::d1::affinity_partitioner; +using detail::d1::numa_partitioner; + // Split types using detail::split; using detail::proportional_split; diff --git a/p_for.cpp b/p_for.cpp new file mode 100644 index 0000000000..28b07acb6f --- /dev/null +++ b/p_for.cpp @@ -0,0 +1,44 @@ +#include +#include +#include + +#include + +int main(int argc, char **argv) +{ + auto values = std::vector(10000); + + tbb::parallel_for( tbb::blocked_range(0,values.size()), + [&](tbb::blocked_range r) + { + for (size_t i=r.begin(); i()); + double total = 0; + + for (double value : values) + { + total += value; + } + + std::cout << total << std::endl; + + total = 0; + tbb::parallel_for( tbb::blocked_range(0,values.size()), + [&](tbb::blocked_range r) + { + for (size_t i=r.begin(); i