From 16d87cb23e4102439ed1fbd3565ec147ed71fcc3 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 7 Nov 2024 18:34:34 -0600 Subject: [PATCH 01/34] Fixed copyrights --- new_examples/CMakeLists.txt | 2 +- .../performance_tuning/CMakeLists.txt | 11 +- new_examples/performance_tuning/arena_trace.h | 167 +++++++++++++++ .../blocked_ranges_trivial.cpp | 43 +--- .../performance_tuning/constraints.cpp | 122 +++++++++++ .../global_control_and_explicit_arena.cpp | 1 - .../parallel_for_addition_partitioners.cpp | 135 ++++++++++++ .../parallel_for_spin_partitioners.cpp | 101 +++++++++ .../parallel_for_spin_partitioners_timed.cpp | 117 +++++++++++ .../parallel_for_transpose_partitioners.cpp | 194 ++++++++++++++++++ 10 files changed, 853 insertions(+), 40 deletions(-) create mode 100644 new_examples/performance_tuning/arena_trace.h create mode 100644 new_examples/performance_tuning/constraints.cpp create mode 100644 new_examples/performance_tuning/parallel_for_addition_partitioners.cpp create mode 100644 new_examples/performance_tuning/parallel_for_spin_partitioners.cpp create mode 100644 new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp create mode 100644 new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp diff --git a/new_examples/CMakeLists.txt b/new_examples/CMakeLists.txt index c2d8215fd5..6651073a6e 100644 --- a/new_examples/CMakeLists.txt +++ b/new_examples/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required (VERSION 3.4) project(tbb_tutorials LANGUAGES CXX) -set(CMAKE_CXX_COMPILER "icpx") +set(CMAKE_CXX_COMPILER "icx-cl") set(CMAKE_LINKER "icpx") set(CMAKE_CXX_STANDARD 20) diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt index ee9b133a2a..ff880d251b 100644 --- a/new_examples/performance_tuning/CMakeLists.txt +++ b/new_examples/performance_tuning/CMakeLists.txt @@ -1,12 +1,17 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb -pthread") -set(CMAKE_CXX_LINKER_FLAGS "-tbb -pthread") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb") +set(CMAKE_CXX_LINKER_FLAGS "-Qtbb") foreach(tpp global_control_and_implicit_arena.cpp global_control_and_explicit_arena.cpp global_control_and_implicit_conflict.cpp global_control_and_explicit_conflict.cpp priorities_and_conflict.cpp - blocked_ranges_trivial.cpp) + blocked_ranges_trivial.cpp + constraints.cpp + parallel_for_spin_partitioners.cpp + parallel_for_spin_partitioners_timed.cpp + parallel_for_addition_partitioners.cpp + parallel_for_transpose_partitioners.cpp) string(REPLACE ".cpp" "" texe ${tpp}) add_executable(${texe} ${tpp}) target_include_directories(${texe} PUBLIC diff --git a/new_examples/performance_tuning/arena_trace.h b/new_examples/performance_tuning/arena_trace.h new file mode 100644 index 0000000000..2a680ed8e5 --- /dev/null +++ b/new_examples/performance_tuning/arena_trace.h @@ -0,0 +1,167 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ARENA_TRACE_H +#define ARENA_TRACE_H 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class arena_tracer { +public: + arena_tracer(const std::string& name) : m_t0(std::chrono::steady_clock::now()), m_name(name) { } + ~arena_tracer() { dump_trace(); } + + void add_arena(const std::string& n, tbb::task_arena& a) { + m_observers.local().emplace_back( std::make_shared(m_events, m_arena_id++, m_t0, n, a )); + } + +private: + using time_point_type = std::chrono::time_point; + time_point_type m_t0; + const std::string m_name; + std::atomic m_arena_id = 0; + + struct trace_event { + time_point_type t; + char ph; + bool is_worker; + std::thread::id tid; + int slot; + int pid; + trace_event( time_point_type t_, char ph_, + bool is_worker_, std::thread::id tid_, + int slot_, int pid_ ) : t(t_), ph(ph_), is_worker(is_worker_), tid(tid_), slot(slot_), pid(pid_) {} + }; + + using ets_vector_type = std::vector; + using ets_value_type = ets_vector_type; + using ets_type = tbb::combinable; + ets_type m_events; + + // observer class + class tracing_observer : public oneapi::tbb::task_scheduler_observer { + ets_type& m_events; + int m_arena_id; + time_point_type m_t; + const std::string m_name; + + public: + tracing_observer( ets_type& e, int arena_id, time_point_type t0, + const std::string& fn, oneapi::tbb::task_arena &a ) + : oneapi::tbb::task_scheduler_observer(a), m_events(e), m_arena_id(arena_id), m_t(t0), m_name(fn) { + observe(true); // activate the observer + } + + ~tracing_observer() { } + + int get_arena_id() { return m_arena_id; } + const std::string& get_arena_name() { return m_name; } + + void on_scheduler_entry( bool worker ) override { + auto& l = m_events.local(); + std::thread::id tid = std::this_thread::get_id(); + + l.emplace_back( + std::chrono::steady_clock::now(), + 'B', // ph + worker, // is_worker + tid, + oneapi::tbb::this_task_arena::current_thread_index(), + m_arena_id + ); + } + void on_scheduler_exit( bool worker ) override { + auto& l = m_events.local(); + std::thread::id tid = std::this_thread::get_id(); + + l.emplace_back( + std::chrono::steady_clock::now(), + 'E', // ph + worker, // is_worker + tid, + oneapi::tbb::this_task_arena::current_thread_index(), + m_arena_id + ); + } + }; + + void dump_trace() { + std::unordered_set was_seen; + + std::ofstream out(m_name); + out << "["; + + bool first = true; + for (auto& v : m_observers.range()) { + for (auto& obs : v) { + if (!first) out << ","; + first = false; + out << "\n{\"name\": \"process_name\", \"ph\" : \"M\"" + << ", \"pid\" : " << obs->get_arena_id() + << ", \"args\" : { \"name\" : \"" << obs->get_arena_name() << "\" }" + << "}"; + } + } + + ets_vector_type r; + m_events.combine_each( + [&r](const ets_vector_type& v) { + r.insert(r.end(), v.begin(), v.end()); + }); + + std::sort(r.begin(), r.end(), [](const trace_event& a, const trace_event& b) { return a.t < b.t; }); + + for (auto& e : r) { + out << ",\n{" + << "\"name\": \"" << e.tid << "\"" + << ", \"cat\": \"arena\"" + << ", \"ph\": \"" << e.ph << "\"" + << ", \"ts\": " << std::chrono::duration_cast(e.t-m_t0).count() + << ", \"pid\": " << e.pid + << ", \"tid\": " << e.slot + << "}"; + if (e.ph == 'B') { + out << ",\n{" + << "\"name\": \"flow\"" + << ", \"id\": " << e.tid + << ", \"cat\": \"arena\"" + << ", \"ts\": " << std::chrono::duration_cast(e.t-m_t0).count() + << ", \"pid\": " << e.pid + << ", \"tid\": " << e.slot; + if (was_seen.find(e.tid) != was_seen.end()) { + out << ", \"ph\": \"t\", \"bp\": \"e\" }"; + } else { + was_seen.insert(e.tid); + out << ", \"ph\": \"s\", \"bp\": \"e\" }"; + } + } + } + + out << "\n]\n"; + } + + tbb::enumerable_thread_specific>> m_observers; +}; + +#endif diff --git a/new_examples/performance_tuning/blocked_ranges_trivial.cpp b/new_examples/performance_tuning/blocked_ranges_trivial.cpp index 8515b1ce57..145f795575 100644 --- a/new_examples/performance_tuning/blocked_ranges_trivial.cpp +++ b/new_examples/performance_tuning/blocked_ranges_trivial.cpp @@ -16,9 +16,7 @@ #include #include - -#define TBB_PREVIEW_BLOCKED_RANGE_ND 1 -#include +#include "tbb/tbb.h" double f(double v); @@ -64,26 +62,10 @@ void example3d(int Z, int P, int N, int M, double* a) { ); } -void exampleNd(int Z, int P, int N, int M, double* a) { - tbb::parallel_for(tbb::blocked_rangeNd{{0,Z},{0,P},{0,N}, {0,M}}, - [=](const tbb::blocked_rangeNd& r_ijkl) { - const auto& r_i = r_ijkl.dim(0); - const auto& r_j = r_ijkl.dim(1); - const auto& r_k = r_ijkl.dim(2); - const auto& r_l = r_ijkl.dim(3); - for (int i = r_i.begin(); i < r_i.end(); ++i) - for (int j = r_j.begin(); j < r_j.end(); ++j) - for (int k = r_k.begin(); k < r_k.end(); ++k) - for (int l = r_l.begin(); l < r_l.end(); ++l) - a[i*P*N*M+j*N*M+k*M+l] = f(a[i*P*N*M+j*N*M+k*M+l]); - } - ); -} - #include static void warmupTBB(); -static bool resultsAreValid(int, const double*, const double*, const double*, const double*, const double*); +static bool resultsAreValid(int, const double*, const double*, const double*, const double*); static void serialDouble(int, double*); int main() { @@ -98,9 +80,9 @@ int main() { double* a1 = new double[Size]; double* a2 = new double[Size]; double* a3 = new double[Size]; - double* aN = new double[Size]; + for (int i = 0; i < Size; ++i) - a0[i] = a1[i] = a2[i] = a3[i] = aN[i] = 1.0; + a0[i] = a1[i] = a2[i] = a3[i] = 1.0; // Perform serial double tbb::tick_count t0 = tbb::tick_count::now(); @@ -127,21 +109,14 @@ int main() { double tbb3d_time = (tbb::tick_count::now() - t0).seconds(); std::printf("3d done\n"); - t0 = tbb::tick_count::now(); - exampleNd(Z, P, N, M, aN); - double tbbNd_time = (tbb::tick_count::now() - t0).seconds(); - std::printf("Nd done\n"); - - if (resultsAreValid(Size, a0, a1, a2, a3, aN)) { + if (resultsAreValid(Size, a0, a1, a2, a3)) { std::cout << "serial_time == " << serial_time << " seconds\n" << "tbb1d_time == " << tbb1d_time << " seconds\n" << "speedup == " << serial_time/tbb1d_time << "\n" << "tbb2d_time == " << tbb2d_time << " seconds\n" << "speedup == " << serial_time/tbb2d_time << "\n" << "tbb3d_time == " << tbb3d_time << " seconds\n" - << "speedup == " << serial_time/tbb3d_time << "\n" - << "tbbNd_time == " << tbbNd_time << " seconds\n" - << "speedup == " << serial_time/tbbNd_time << "\n"; + << "speedup == " << serial_time/tbb3d_time << "\n"; return 0; } else { std::cout << "ERROR: invalid results!\n"; @@ -181,14 +156,12 @@ double f(double v) { static bool resultsAreValid(int N, const double* a0, const double* a1, - const double* a2, const double* a3, - const double* aN) { + const double* a2, const double* a3) { for (int i = 0; i < N; ++i) { if (a0[i] != 2.0 || a1[i] != 2.0 || a2[i] != 2.0 - || a3[i] != 2.0 - || aN[i] != 2.0) { + || a3[i] != 2.0) { std::printf("%d: %f, %f, %f, %f\n", i, a0[i], a1[i], a2[i], a3[i]); std::cerr << "Invalid results" << std::endl; return false; diff --git a/new_examples/performance_tuning/constraints.cpp b/new_examples/performance_tuning/constraints.cpp new file mode 100644 index 0000000000..42800c6205 --- /dev/null +++ b/new_examples/performance_tuning/constraints.cpp @@ -0,0 +1,122 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "tbb/tbb.h" +#include + +#define USE_ARENA_TRACE 1 +#if USE_ARENA_TRACE +#include "arena_trace.h" +#endif + +int N = 1000; +double w = 0.01; +double f(double v); + +void constrain_for_numa_nodes() { +#if USE_ARENA_TRACE + arena_tracer t{"numa_trace.json"}; +#endif + + std::vector numa_nodes = tbb::info::numa_nodes(); + std::vector arenas(numa_nodes.size()); + std::vector task_groups(numa_nodes.size()); + + for (int i = 0; i < numa_nodes.size(); i++) { + arenas[i].initialize(tbb::task_arena::constraints(numa_nodes[i]), 0); + #if USE_ARENA_TRACE + t.add_arena(std::to_string(i), arenas[i]); + #endif + } + for (int i = 0; i < numa_nodes.size(); i++) { + arenas[i].execute([&task_groups, i] { + task_groups[i].run([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }); + }); + } + for (int i = 0; i < numa_nodes.size(); i++) { + arenas[i].execute([&task_groups, i] { + task_groups[i].wait(); + }); + } +} + +void constrain_for_core_type() { + + std::vector core_types = tbb::info::core_types(); + tbb::task_arena arena( + tbb::task_arena::constraints{}.set_core_type(core_types.back()) + ); + + #if USE_ARENA_TRACE + arena_tracer t{"core_trace.json"}; + t.add_arena("pcores", arena); + #endif + + arena.execute([] { + tbb::parallel_for(0, N, [](int) { f(w); }); + }); +} + +void constrain_for_no_hyperthreading() { + tbb::task_arena::constraints c; + std::vector core_types = tbb::info::core_types(); + c.set_core_type(core_types.back()); + c.set_max_threads_per_core(1); + tbb::task_arena no_ht_arena(c); + + #if USE_ARENA_TRACE + arena_tracer t{"no_ht_constraints_trace.json"}; + t.add_arena("no_ht_arena", no_ht_arena); + #endif + + no_ht_arena.execute( [] { + tbb::parallel_for(0, N, [](int) { f(w); }); + }); +} + +void limit_concurrency_for_no_hyperthreading() { + tbb::task_arena::constraints c; + std::vector core_types = tbb::info::core_types(); + c.set_core_type(core_types.back()); + c.set_max_threads_per_core(1); + int no_ht_concurrency = tbb::info::default_concurrency(c); + tbb::task_arena arena( no_ht_concurrency ); + + #if USE_ARENA_TRACE + arena_tracer t{"no_ht_concurrency_trace.json"}; + t.add_arena("no_ht_concurrency", arena); + #endif + + arena.execute( [] { + tbb::parallel_for(0, N, [](int) { f(w); }); + }); +} + +int main() { + constrain_for_numa_nodes(); + constrain_for_core_type(); + constrain_for_no_hyperthreading(); + limit_concurrency_for_no_hyperthreading(); + return 0; +} + +double f(double v) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + return 2*v; +} diff --git a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp index 3bf5e90c14..93a8520b7e 100644 --- a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp +++ b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp @@ -65,7 +65,6 @@ void clearParticipation() { } void dumpParticipation(int p) { - int end = next_tid; int sum = tid_participation[0]; std::cout << "[" << tid_participation[0]; for (int i = 1; i < p; ++i) { diff --git a/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp new file mode 100644 index 0000000000..49333b9bc6 --- /dev/null +++ b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp @@ -0,0 +1,135 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include +#include +#include + +template +void parForAdd(double v, int N, double *a, const Partitioner& p) { + tbb::parallel_for( tbb::blocked_range(0, N, 1), + [v, a](const tbb::blocked_range& r) { + int ie = r.end(); + for (int i = r.begin(); i < ie; ++i) { + a[i] += v; + } + }, p + ); +} + +template +void parForAdd(double v, int N, double *a, Partitioner& p) { + tbb::parallel_for( tbb::blocked_range(0, N, 1), + [v, a](const tbb::blocked_range& r) { + int ie = r.end(); + for (int i = r.begin(); i < ie; ++i) { + a[i] += v; + } + }, p + ); +} + +void resetV(int N, double *v); +void resetA(int N, double *a); +static void warmupTBB(); + +int main(int argc, char *argv[]) { + int M = 10000; + int N = 100000; + + std::cout << "P = " << tbb::info::default_concurrency() + << std::endl << "N = " << N + << std::endl << "M = " << M << std::endl; + + #define CONSTRAIN_TO_FEWER_CORES 0 + #if CONSTRAIN_TO_FEWER_CORES + tbb::task_arena::constraints c; + c.set_max_concurrency(tbb::info::default_concurrency() - 2); + tbb::task_arena cores_arena(c); + std::cout << "Using arena with " << cores_arena.max_concurrency() << " slots\n"; + cores_arena.execute([&]() { + #endif + + double *v = new double[M]; + double *a = new double[N]; + + warmupTBB(); + resetV(M, v); + resetA(N, a); + tbb::tick_count t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + parForAdd(v[i], N, a, tbb::auto_partitioner{}); + } + double auto_time = (tbb::tick_count::now() - t0).seconds(); + + warmupTBB(); + resetA(N, a); + tbb::affinity_partitioner aff_p; + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + parForAdd(v[i], N, a, aff_p); + } + double affinity_time = (tbb::tick_count::now() - t0).seconds(); + + warmupTBB(); + resetA(N, a); + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + parForAdd(v[i], N, a, tbb::static_partitioner{}); + } + double static_time = (tbb::tick_count::now() - t0).seconds(); + + std::cout << "auto_partitioner = " << auto_time << std::endl + << "affinity_partitioner = " << affinity_time << std::endl + << "static_partitioner = " << static_time << std::endl; + + delete [] v; + delete [] a; + + #if CONSTRAIN_TO_FEWER_CORES + }); + #endif + + return 0; +} + + +void resetV(int N, double *v) { + for (int i = 0; i < N; ++i) { + v[i] = i; + } + std::shuffle(v, v+N, std::random_device{}); +} + +void resetA(int N, double *a) { + for (int i = 0; i < N; ++i) { + a[i] = 0; + } +} + +static void warmupTBB() { + // This is a simple loop that should get workers started. + // oneTBB creates workers lazily on first use of the library + // so this hides the startup time when looking at trivial + // examples that do little real work. + tbb::parallel_for(0, tbb::info::default_concurrency(), + [=](int) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + } + ); +} \ No newline at end of file diff --git a/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp b/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp new file mode 100644 index 0000000000..5de939e84a --- /dev/null +++ b/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp @@ -0,0 +1,101 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include + +void doWork(double sec); + +template +void pforWork(int N, const Partitioner& p) { + tbb::parallel_for( tbb::blocked_range(0, N, 1), + [](const tbb::blocked_range& r) { + int ie = r.end(); + for (int i = r.begin(); i < ie; ++i) { + doWork(i); + } + }, p + ); +} + +template +void pforWork(int N, Partitioner& p) { + tbb::parallel_for( tbb::blocked_range(0, N, 1), + [](const tbb::blocked_range& r) { + int ie = r.end(); + for (int i = r.begin(); i < ie; ++i) { + doWork(i); + } + }, p + ); +} + +static void warmupTBB(); + +int main(int argc, char *argv[]) { + int N = 1000; + int M = 10; + + std::cout << "P = " << tbb::info::default_concurrency() + << std::endl << "M = " << M + << std::endl << "N = " << N << std::endl; + + warmupTBB(); + tbb::tick_count t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + pforWork(N, tbb::auto_partitioner{}); + } + double auto_time = (tbb::tick_count::now() - t0).seconds(); + + warmupTBB(); + tbb::affinity_partitioner aff_p; + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + pforWork(N, aff_p); + } + double affinity_time = (tbb::tick_count::now() - t0).seconds(); + + warmupTBB(); + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + pforWork(N, tbb::static_partitioner{}); + } + double static_time = (tbb::tick_count::now() - t0).seconds(); + + std::cout << "auto_partitioner = " << auto_time << " seconds" << std::endl + << "affinity_partitioner = " << affinity_time << " seconds" << std::endl + << "static_partitioner = " << static_time << " seconds" << std::endl; + return 0; +} + +static void warmupTBB() { + // This is a simple loop that should get workers started. + // oneTBB creates workers lazily on first use of the library + // so this hides the startup time when looking at trivial + // examples that do little real work. + tbb::parallel_for(0, tbb::info::default_concurrency(), + [=](int) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + } + ); +} + +void doWork(double usec) { + double sec = usec*1e-06; + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() <= sec); +} \ No newline at end of file diff --git a/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp b/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp new file mode 100644 index 0000000000..64272005af --- /dev/null +++ b/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp @@ -0,0 +1,117 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include +#include + +static inline void spinWaitForAtLeast(double sec=0.0); + +static inline double executeFor(int num_trials, int N, double tpi) { + tbb::tick_count t0; + for (int t = -1; t < num_trials; ++t) { + if (!t) t0 = tbb::tick_count::now(); + for (int i = 0; i < N; ++i) { + spinWaitForAtLeast(tpi); + } + } + tbb::tick_count t1 = tbb::tick_count::now(); + return (t1 - t0).seconds()/num_trials; +} + +template< typename P > +static inline double executePfor(int num_trials, int N, + int gs, P& p, double tpi) { + tbb::tick_count t0; + for (int t = -1; t < num_trials; ++t) { + if (!t) t0 = tbb::tick_count::now(); + tbb::parallel_for ( + tbb::blocked_range{0, N, static_cast(gs)}, + [tpi](const tbb::blocked_range& r) { + int e = r.end(); + for (int i = r.begin(); i < e; ++i) { + spinWaitForAtLeast(tpi); + } + }, + p + ); + } + tbb::tick_count t1 = tbb::tick_count::now(); + return (t1 - t0).seconds()/num_trials; +} + +#define CONSTRAIN_TO_ECORES 1 + +int main() { + tbb::auto_partitioner auto_p; + tbb::simple_partitioner simple_p; + tbb::static_partitioner static_p; + const std::string pname[4] = {"simple", "auto", "affinity", "static"}; + + const int N = 262144; + const int T = 20; + const double ten_ns = 0.00000001; + const double twenty_us = 0.00002; + double timing[4][19]; + + #if CONSTRAIN_TO_ECORES + std::vector core_types = tbb::info::core_types(); + tbb::task_arena::constraints c; + c.set_core_type(core_types.front()); + c.set_max_concurrency(tbb::info::default_concurrency(c) - 2); + tbb::task_arena ecore_arena(c); + std::cout << "Using arena with " << ecore_arena.max_concurrency() << " slots\n"; + ecore_arena.execute([&]() { + #endif + + for (double tpi = ten_ns; tpi < twenty_us; tpi *= 10) { + std::cout << "Speedups for " << tpi << " seconds per iteration" << std::endl + << "partitioner"; + for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) + std::cout << ", " << gs; + std::cout << std::endl; + + double serial_time = executeFor(T, N, tpi); + + for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) { + tbb::affinity_partitioner affinity_p; + spinWaitForAtLeast(0.001); + timing[0][i] = executePfor(T, N, gs, simple_p, tpi); + timing[1][i] = executePfor(T, N, gs, auto_p, tpi); + timing[2][i] = executePfor(T, N, gs, affinity_p, tpi); + timing[3][i] = executePfor(T, N, gs, static_p, tpi); + } + for (int p = 0; p < 4; ++p) { + std::cout << pname[p]; + for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) + std::cout << ", " << serial_time/timing[p][i]; + std::cout << std::endl; + } + std::cout << std::endl; + } + + #if CONSTRAIN_TO_ECORES + }); + #endif + + return 0; +} + +static inline void spinWaitForAtLeast(double sec) { + if (sec == 0.0) return; + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < sec); +} diff --git a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp new file mode 100644 index 0000000000..9400e077ef --- /dev/null +++ b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp @@ -0,0 +1,194 @@ +/* +Copyright (C) 2019 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom +the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES +OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +OR OTHER DEALINGS IN THE SOFTWARE. + +SPDX-License-Identifier: MIT +*/ + +#include +#include + +double serialTranspose(int N, double *a, double *b) { + tbb::tick_count t0 = tbb::tick_count::now(); + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + b[j*N+i] = a[i*N+j]; + } + } + tbb::tick_count t1 = tbb::tick_count::now(); + + return (t1-t0).seconds(); +} + +void obliviousTranspose(int N, int ib, int ie, int jb, int je, + double *a, double *b, int gs) { + int ilen = ie-ib; + int jlen = je-jb; + if (ilen > gs || jlen > gs) { + if ( ilen > jlen ) { + int imid = (ib+ie)/2; + obliviousTranspose(N, ib, imid, jb, je, a, b, gs); + obliviousTranspose(N, imid, ie, jb, je, a, b, gs); + } else { + int jmid = (jb+je)/2; + obliviousTranspose(N, ib, ie, jb, jmid, a, b, gs); + obliviousTranspose(N, ib, ie, jmid, je, a, b, gs); + } + } else { + for (int i = ib; i < ie; ++i) { + for (int j = jb; j < je; ++j) { + b[j*N+i] = a[i*N+j]; + } + } + } +} + +double serialObliviousTranspose(int N, double *a, double *b, int gs) { + tbb::tick_count t0 = tbb::tick_count::now(); + obliviousTranspose(N, 0, N, 0, N, a, b, gs); + tbb::tick_count t1 = tbb::tick_count::now(); + return (t1-t0).seconds(); +} + +template< typename P > +double pforTranspose(int N, double *a, double *b, int gs) { + tbb::tick_count t0 = tbb::tick_count::now(); + tbb::parallel_for( tbb::blocked_range(0, N, gs), + [N, a, b](const tbb::blocked_range& r) { + int ie = r.end(); + for (int i = r.begin(); i < ie; ++i) { + for (int j = 0; j < N; ++j) { + b[j*N+i] = a[i*N+j]; + } + } + }, P() + ); + tbb::tick_count t1 = tbb::tick_count::now(); + return (t1-t0).seconds(); +} + +template +double pforTranspose2d(int N, double *a, double *b, int gs) { + tbb::tick_count t0 = tbb::tick_count::now(); + tbb::parallel_for( tbb::blocked_range2d{ + 0, N, static_cast(gs), 0, + N, static_cast(gs)}, + [N, a, b](const tbb::blocked_range2d& r) { + int ie = r.rows().end(); + int je = r.cols().end(); + for (int i = r.rows().begin(); i < ie; ++i) { + for (int j = r.cols().begin(); j < je; ++j) { + b[j*N+i] = a[i*N+j]; + } + } + }, P() + ); + tbb::tick_count t1 = tbb::tick_count::now(); + return (t1-t0).seconds(); +} + +void setArray(int N, double *a); +void checkTranspose(int N, double *a); + +#define CONSTRAIN_TO_ECORES 1 + +int main() { + int N = 2<<12; // 8192 + double *a = new double[N*N]; + double *b = new double[N*N]; + setArray(N, a); + setArray(N, b); + + #if CONSTRAIN_TO_ECORES + std::vector core_types = tbb::info::core_types(); + tbb::task_arena::constraints c; + c.set_core_type(core_types.front()); + c.set_max_concurrency(tbb::info::default_concurrency(c) - 2); + tbb::task_arena ecore_arena(c); + std::cout << "Using arena with " << ecore_arena.max_concurrency() << " slots\n"; + ecore_arena.execute([&]() { + #endif + + serialTranspose(N, a, b); + double ts = serialTranspose(N, a, b); + checkTranspose(N, b); + std::cout << "Serial Time = " << ts << std::endl; + + std::cout << "Parallel Times:" << std::endl + << "grainsize, oblivious, 1d auto, 1d simple, 2d auto, 2d simple" << std::endl; + for (int gs = 1; gs <= N; gs *= 2) { + setArray(N, a); + setArray(N, b); + serialObliviousTranspose(N, a, b, gs); + double to = serialObliviousTranspose(N, a, b, gs); + checkTranspose(N, b); + + setArray(N, a); + setArray(N, b); + pforTranspose(N, a, b, gs); + double t1d_auto = pforTranspose(N, a, b, gs); + + setArray(N, a); + setArray(N, b); + pforTranspose(N, a, b, gs); + double t1d_simple = pforTranspose(N, a, b, gs); + + setArray(N, a); + setArray(N, b); + pforTranspose2d(N, a, b, gs); + double t2d_auto = pforTranspose2d(N, a, b, gs); + + setArray(N, a); + setArray(N, b); + pforTranspose2d(N, a, b, gs); + double t2d_simple = pforTranspose2d(N, a, b, gs); + + std::cout << gs + << ", " << to + << ", " << t1d_auto + << ", " << t1d_simple + << ", " << t2d_auto + << ", " << t2d_simple << std::endl; + } + + #if CONSTRAIN_TO_ECORES + }); + #endif + + return 0; +} + +void setArray(int N, double *a) { + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + a[i*N+j] = i; + } + } +} + +void checkTranspose(int N, double *a) { + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + if (a[i*N+j] != j) { + std::cout << "Transpose failed" << std::endl; + } + } + } +} From 315e18839eaafbe8b87cadbaf56e1895c8003ec8 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 7 Nov 2024 18:35:20 -0600 Subject: [PATCH 02/34] Added transpose example --- .../parallel_for_transpose_partitioners.cpp | 34 +++++++------------ 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp index 9400e077ef..45f68f4391 100644 --- a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp +++ b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp @@ -1,25 +1,17 @@ /* -Copyright (C) 2019 Intel Corporation - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), -to deal in the Software without restriction, including without limitation -the rights to use, copy, modify, merge, publish, distribute, sublicense, -and/or sell copies of the Software, and to permit persons to whom -the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES -OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE -OR OTHER DEALINGS IN THE SOFTWARE. - -SPDX-License-Identifier: MIT + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ #include From 3bd28aacca90ecdc393e701bcee7f3cb02da0864 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 14 Nov 2024 17:37:30 -0600 Subject: [PATCH 03/34] Fixed comments, added example --- new_examples/graph/graph_loops.cpp | 3 +- new_examples/graph/graph_stereoscopic_3d.cpp | 227 +++++++++++++++++++ new_examples/graph/graph_two_nodes.cpp | 2 +- new_examples/graph/graph_with_join.cpp | 2 +- 4 files changed, 230 insertions(+), 4 deletions(-) create mode 100644 new_examples/graph/graph_stereoscopic_3d.cpp diff --git a/new_examples/graph/graph_loops.cpp b/new_examples/graph/graph_loops.cpp index b1dea8cb85..847cc544e6 100644 --- a/new_examples/graph/graph_loops.cpp +++ b/new_examples/graph/graph_loops.cpp @@ -27,8 +27,7 @@ void tryPutLoop() { } }; for (int count = 0; count < limit; ++count) { - int value = count; - my_node.try_put(value); + my_node.try_put(count); } g.wait_for_all(); } diff --git a/new_examples/graph/graph_stereoscopic_3d.cpp b/new_examples/graph/graph_stereoscopic_3d.cpp new file mode 100644 index 0000000000..35bb571b93 --- /dev/null +++ b/new_examples/graph/graph_stereoscopic_3d.cpp @@ -0,0 +1,227 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#define NOMINMAX + +#include +#include +#include +#include +#include +#include "../common/lodepng.h" +#include + +class Image { +public: + uint64_t frameNumber = -1; + unsigned int width = 0, height = 0; + std::shared_ptr> buffer; + static const int numChannels = 4; + static const int redOffset = 0; + static const int greenOffset = 1; + static const int blueOffset = 2; + + Image() {} + Image(uint64_t frame_number, const std::string& file_name); + Image(const Image& p); + virtual ~Image() {} + void write() const; +}; + +int getNextFrameNumber(); +Image getLeftImage(uint64_t frameNumber); +Image getRightImage(uint64_t frameNumber); +void increasePNGChannel(Image& image, int channel_offset, int increase); +void mergeImages(Image& right, const Image& left); + +void stereo3D() { + using Image = Image; + // step 1: create graph object + tbb::flow::graph g; + + // step 2: create nodes + tbb::flow::input_node frame_no_node{g, + []( tbb::flow_control &fc ) -> uint64_t { + uint64_t frame_number = getNextFrameNumber(); + if (frame_number) + return frame_number; + else { + fc.stop(); + return frame_number; + } + + } + }; + tbb::flow::function_node get_left_node{g, + /* concurrency */ tbb::flow::serial, + [] (uint64_t frame_number) -> Image { + return getLeftImage(frame_number); + } + }; + tbb::flow::function_node get_right_node{g, + /* concurrency */ tbb::flow::serial, + [] (uint64_t frame_number) -> Image { + return getRightImage(frame_number); + } + }; + tbb::flow::function_node increase_left_node{g, + /* concurrency */ tbb::flow::unlimited, + [] (Image left) -> Image { + increasePNGChannel(left, Image::redOffset, 10); + return left; + } + }; + tbb::flow::function_node increase_right_node{g, + /* concurrency */ tbb::flow::unlimited, + [] (Image right) -> Image { + increasePNGChannel(right, Image::blueOffset, 10); + return right; + } + }; + tbb::flow::join_node, tbb::flow::tag_matching > + join_images_node(g, [] (Image left) { return left.frameNumber; }, + [] (Image right) { return right.frameNumber; } ); + tbb::flow::function_node, Image> merge_images_node{g, + /* concurrency */ tbb::flow::unlimited, + [] (std::tuple t) -> Image { + auto& l = std::get<0>(t); + auto& r = std::get<1>(t); + mergeImages(r, l); + return r; + } + }; + tbb::flow::function_node write_node{g, + /* concurrency */ tbb::flow::unlimited, + [] (Image img) { + img.write(); + } + }; + + // step 3: add edges + tbb::flow::make_edge(frame_no_node, get_left_node); + tbb::flow::make_edge(frame_no_node, get_right_node); + tbb::flow::make_edge(get_left_node, increase_left_node); + tbb::flow::make_edge(get_right_node, increase_right_node); + tbb::flow::make_edge(increase_left_node, + tbb::flow::input_port<0>(join_images_node)); + tbb::flow::make_edge(increase_right_node, + tbb::flow::input_port<1>(join_images_node)); + tbb::flow::make_edge(join_images_node, merge_images_node); + tbb::flow::make_edge(merge_images_node, write_node); + + // step 4: send messages in to the graph + frame_no_node.activate(); + // step 5: wait for graph to complete + g.wait_for_all(); +} + +Image::Image(uint64_t frame_number, const std::string& file_name) : + frameNumber{frame_number}, buffer{std::make_shared< std::vector >()} { + if (lodepng::decode(*buffer, width, height, file_name)) { + std::cerr << "Error: could not read PNG file!" << std::endl; + width = height = 0; + } +}; + +Image::Image(const Image& p) : frameNumber{p.frameNumber}, + width{p.width}, height{p.height}, + buffer{p.buffer} {} + +void Image::write() const { + std::string file_name = std::string("out") + std::to_string(frameNumber) + ".png"; + if (lodepng::encode(file_name, *buffer, width, height)) { + std::cerr << "Error: could not write PNG file!" << std::endl; + } +} + +static int stereo3DFrameCounter = 0; +static int stero3DNumImages = 0; + +void initStereo3D(int num_images) { + stereo3DFrameCounter = 0; + stero3DNumImages = num_images; +} + +int getNextFrameNumber() { + if ( stereo3DFrameCounter < stero3DNumImages ) { + return ++stereo3DFrameCounter; + } else { + return 0; + } +} + +Image getLeftImage(uint64_t frameNumber) { + return Image(frameNumber, "input1.png"); +} + +Image getRightImage(uint64_t frameNumber) { + return Image(frameNumber, "input2.png"); +} + +void increasePNGChannel(Image& image, int channel_offset, int increase) { + const int height_base = Image::numChannels * image.width; + std::vector& buffer = *image.buffer; + + // Increase selected color channel by a predefined value + for (unsigned int y = 0; y < image.height; y++) { + const int height_offset = height_base * y; + for (unsigned int x = 0; x < image.width; x++) { + int pixel_offset = height_offset + Image::numChannels * x + channel_offset; + buffer[pixel_offset] = static_cast(std::min(buffer[pixel_offset] + increase, 255)); + } + } +} + +void mergeImages(Image& right, const Image& left) { + const int channels_per_pixel = Image::numChannels; + const int height_base = channels_per_pixel * right.width; + std::vector& left_buffer = *left.buffer; + std::vector& right_buffer = *right.buffer; + + for (unsigned int y = 0; y < right.height; y++) { + const int height_offset = height_base * y; + for (unsigned int x = 0; x < right.width; x++) { + const int pixel_offset = height_offset + channels_per_pixel * x; + const int red_index = pixel_offset + Image::redOffset; + right_buffer[red_index] = left_buffer[red_index]; + } + } +} + +static void warmupTBB() { + tbb::parallel_for(0, tbb::this_task_arena::max_concurrency(), [](int) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + }); +} + +int main(int argc, char *argv[]) { + int num_images = 3; + + initStereo3D(num_images); + + warmupTBB(); + double parallel_time = 0.0; + { + tbb::tick_count t0 = tbb::tick_count::now(); + stereo3D(); + parallel_time = (tbb::tick_count::now() - t0).seconds(); + } + + std::cout << "parallel_time == " << parallel_time << " seconds" << std::endl; + return 0; +} + diff --git a/new_examples/graph/graph_two_nodes.cpp b/new_examples/graph/graph_two_nodes.cpp index 1a3d647744..481667e714 100644 --- a/new_examples/graph/graph_two_nodes.cpp +++ b/new_examples/graph/graph_two_nodes.cpp @@ -40,7 +40,7 @@ void graphTwoNodes() { // step 3: add edges tbb::flow::make_edge(my_first_node, my_second_node); - // step 4: send messages + // step 4: send message that eagerly starts graph execution my_first_node.try_put(10); // step 5: wait for graph to complete diff --git a/new_examples/graph/graph_with_join.cpp b/new_examples/graph/graph_with_join.cpp index 88ad9639cb..2efc46212a 100644 --- a/new_examples/graph/graph_with_join.cpp +++ b/new_examples/graph/graph_with_join.cpp @@ -55,7 +55,7 @@ void graphJoin() { make_edge(my_other_node, tbb::flow::input_port<1>(my_join_node)); make_edge(my_join_node, my_final_node); - // step 4: send messages + // step 4: send messages that eagerly start graph execution my_node.try_put(1); my_other_node.try_put(2); // step 5: wait for the graph to complete From 884582ef972f8b1c1ce3b41f9ac7e6f67f25ab08 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 14 Nov 2024 21:34:41 -0600 Subject: [PATCH 04/34] Updated execute_while_building example --- .../graph/graph_execute_while_building.cpp | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/new_examples/graph/graph_execute_while_building.cpp b/new_examples/graph/graph_execute_while_building.cpp index 7a977221df..387bb9a9f9 100644 --- a/new_examples/graph/graph_execute_while_building.cpp +++ b/new_examples/graph/graph_execute_while_building.cpp @@ -16,6 +16,7 @@ #include #include +#include #include struct config_t { @@ -23,16 +24,17 @@ struct config_t { int predecessor; }; -config_t configuration[] = { { 0, 3 }, - { 1, 4 }, - { 2, 5 }, - { 3, -1 }, - { 4, -1 }, - { 5, 3 }, - { 6, 4 }, - { 7, 1 } }; +// each element defines a node and what other node it must wait for +std::vector configuration = { { 0, 3 }, + { 1, 4 }, + { 2, 5 }, + { 3, -1 }, + { 4, -1 }, + { 5, 3 }, + { 6, 4 }, + { 7, 1 } }; -int num_nodes = sizeof(configuration) / sizeof(config_t); +const int num_nodes = configuration.size(); int main() { tbb::flow::graph g; @@ -57,14 +59,18 @@ int main() { return m; } }); - // connect the new node to its future + // connect the new node to its "future" tbb::flow::make_edge(*work_nodes[c.id], future_nodes[c.id]); // start the node or link it to predecessor's promise if (c.predecessor != -1) { + // must connect to predecessor's "future" + // if the future is already written to this will start the node + // otherwise it will be started when the future is written std::printf("new %d with %d -> %d\n", c.id, c.predecessor, c.id); tbb::flow::make_edge(future_nodes[c.predecessor], *work_nodes[c.id]); } else { + // does not need to wait and can be started immediately std::printf("starting %d from main\n", c.id); work_nodes[c.id]->try_put(tbb::flow::continue_msg{}); } From 27d71ea301578b142b1de5983d7d39776103bc44 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 14 Nov 2024 22:32:20 -0600 Subject: [PATCH 05/34] Added noexcept to lightweight samples --- new_examples/graph/graph_small_nodes.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/new_examples/graph/graph_small_nodes.cpp b/new_examples/graph/graph_small_nodes.cpp index 345ff196e6..6e2c9f5051 100644 --- a/new_examples/graph/graph_small_nodes.cpp +++ b/new_examples/graph/graph_small_nodes.cpp @@ -57,9 +57,9 @@ void small_nodes_lightweight() { tbb::flow::function_node< int, int > add( g, tbb::flow::unlimited, [](const int &v) { return v+1; } ); tbb::flow::function_node< int, int, tbb::flow::lightweight > multiply( g, tbb::flow::unlimited, - [](const int &v) { return v*2; } ); + [](const int &v) noexcept { return v*2; } ); tbb::flow::function_node< int, int, tbb::flow::lightweight > cube( g, tbb::flow::unlimited, - [](const int &v) { return v*v*v; } ); + [](const int &v) noexcept { return v*v*v; } ); tbb::flow::make_edge(add, multiply); tbb::flow::make_edge(multiply, cube); @@ -73,7 +73,7 @@ void small_nodes_combined_lightweight() { tbb::flow::graph g; tbb::flow::function_node< int, int, tbb::flow::lightweight > combined_node( g, tbb::flow::unlimited, - [](const int &v) { + [](const int &v) noexcept { auto v2 = (v+1)*2; return v2*v2*v2; }); From b8dda7f1b8698769e9ef3a43d45d6529a470d7ea Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Sat, 16 Nov 2024 13:31:10 -0600 Subject: [PATCH 06/34] Fixed minor typos --- new_examples/tasks/parallel_invoke_fib.cpp | 2 +- new_examples/tasks/resumable_tasks.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/new_examples/tasks/parallel_invoke_fib.cpp b/new_examples/tasks/parallel_invoke_fib.cpp index a5e2f7c584..903729fc0f 100644 --- a/new_examples/tasks/parallel_invoke_fib.cpp +++ b/new_examples/tasks/parallel_invoke_fib.cpp @@ -73,7 +73,7 @@ int main(int argc, char** argv) std::cout << "SerialFib: " << fib_s << " Time: " << t_s << "\n"; std::cout << "ParallelFib: " << fib_p << " Time: " << t_p << " Speedup: " << t_s/t_p << "\n"; - std::cout << "ParallelFibCutoff_30: " << fib_c << " Time: " << t_p << " Speedup: " << t_s/t_c << "\n"; + std::cout << "ParallelFibCutoff_30: " << fib_c << " Time: " << t_c << " Speedup: " << t_s/t_c << "\n"; return 0; } diff --git a/new_examples/tasks/resumable_tasks.cpp b/new_examples/tasks/resumable_tasks.cpp index 9cd3cc8982..c1bedaf921 100755 --- a/new_examples/tasks/resumable_tasks.cpp +++ b/new_examples/tasks/resumable_tasks.cpp @@ -32,7 +32,7 @@ int main() { tbb::task::suspend([=,&sycl_q](tbb::task::suspend_point tag) { auto sycl_event = sycl_q.fill(a, id, N); sycl_q.submit([=](sycl::handler& sycl_h) { - sycl_h.depends_on(sycl_event); // only run after e is done + sycl_h.depends_on(sycl_event); // run after sycl_event is done sycl_h.host_task([tag]() { tbb::task::resume(tag); }); From e1132a1fb2885d446f3d1d8e60962a86362e8f2e Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Sat, 16 Nov 2024 20:07:40 -0600 Subject: [PATCH 07/34] Added atomics migration example --- new_examples/migration/migrate_atomics.cpp | 79 +++++++++++++++++++ .../migration/migrate_task_scheduler_init.cpp | 1 - 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 new_examples/migration/migrate_atomics.cpp diff --git a/new_examples/migration/migrate_atomics.cpp b/new_examples/migration/migrate_atomics.cpp new file mode 100644 index 0000000000..822858a98c --- /dev/null +++ b/new_examples/migration/migrate_atomics.cpp @@ -0,0 +1,79 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include +#include +#include + +#include +#include + +#if TBB_VERSION_MAJOR > 2020 +#include +#else +#include +#endif + +int main(int argc, char** argv) { + long int n = 1000000000; + constexpr int num_bins = 256; + + // Initialize random number generator + std::random_device seed; // Random device seed + std::mt19937 mte{seed()}; // mersenne_twister_engine + std::uniform_int_distribution<> uniform{0,num_bins}; + // Initialize image + std::vector image; // empty vector + image.reserve(n); // image vector prealocated + std::generate_n(std::back_inserter(image), n, + [&] { return uniform(mte); } + ); + // Initialize histogram + std::vector hist(num_bins); + + // Serial execution + tbb::tick_count t0 = tbb::tick_count::now(); + std::for_each(image.begin(), image.end(), + [&](uint8_t i){hist[i]++;}); + tbb::tick_count t1 = tbb::tick_count::now(); + double t_serial = (t1 - t0).seconds(); + + // Parallel execution + #if TBB_VERSION_MAJOR > 2020 + std::vector> hist_p(num_bins); + #else + std::vector> hist_p(num_bins); + #endif + + t0 = tbb::tick_count::now(); + parallel_for(tbb::blocked_range{0, image.size()}, + [&](const tbb::blocked_range& r) + { + for (size_t i = r.begin(); i < r.end(); ++i) + hist_p[image[i]]++; + }); + t1 = tbb::tick_count::now(); + double t_parallel = (t1 - t0).seconds(); + + std::cout << "Serial: " << t_serial << ", "; + std::cout << "Parallel: " << t_parallel << ", "; + std::cout << "Speed-up: " << t_serial/t_parallel << std::endl; + + if (!std::equal(hist.begin(),hist.end(),hist_p.begin())) + std::cerr << "Parallel computation failed!!" << std::endl; + return 0; +} diff --git a/new_examples/migration/migrate_task_scheduler_init.cpp b/new_examples/migration/migrate_task_scheduler_init.cpp index 7021a38a31..4e1114c6fb 100644 --- a/new_examples/migration/migrate_task_scheduler_init.cpp +++ b/new_examples/migration/migrate_task_scheduler_init.cpp @@ -77,7 +77,6 @@ void clearParticipation() { } void dumpParticipation(int p) { - int end = next_tid; int sum = tid_participation[0]; std::cout << "[" << tid_participation[0]; for (int i = 1; i < p; ++i) { From 055781ad76569c631986a2a08652a47016a02a6b Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Sat, 16 Nov 2024 20:31:56 -0600 Subject: [PATCH 08/34] Added warnings for old TBB usage --- new_examples/migration/migrate_atomics.cpp | 1 + new_examples/migration/migrate_bypass_tasks.cpp | 1 + new_examples/migration/migrate_parallel_do.cpp | 1 + new_examples/migration/migrate_priorities.cpp | 1 + new_examples/migration/migrate_recycling_tasks.cpp | 5 +++-- new_examples/migration/migrate_task_blocking.cpp | 1 + new_examples/migration/migrate_task_scheduler_init.cpp | 1 + new_examples/migration/migrate_tasks_adding_work.cpp | 1 + 8 files changed, 10 insertions(+), 2 deletions(-) diff --git a/new_examples/migration/migrate_atomics.cpp b/new_examples/migration/migrate_atomics.cpp index 822858a98c..95c0c551b5 100644 --- a/new_examples/migration/migrate_atomics.cpp +++ b/new_examples/migration/migrate_atomics.cpp @@ -56,6 +56,7 @@ int main(int argc, char** argv) { #if TBB_VERSION_MAJOR > 2020 std::vector> hist_p(num_bins); #else + #warning Using tbb::atomic instead of std::atomic std::vector> hist_p(num_bins); #endif diff --git a/new_examples/migration/migrate_bypass_tasks.cpp b/new_examples/migration/migrate_bypass_tasks.cpp index 64ba8f6b81..8675252269 100644 --- a/new_examples/migration/migrate_bypass_tasks.cpp +++ b/new_examples/migration/migrate_bypass_tasks.cpp @@ -114,6 +114,7 @@ void parallelFwdSubTaskGroup(std::vector& x, } #if TBB_VERSION_MAJOR <= 2020 +#warning Using tbb::task directly using RootTask = tbb::empty_task; class FwdSubTask : public tbb::task { diff --git a/new_examples/migration/migrate_parallel_do.cpp b/new_examples/migration/migrate_parallel_do.cpp index 8672f9108d..335d94890d 100644 --- a/new_examples/migration/migrate_parallel_do.cpp +++ b/new_examples/migration/migrate_parallel_do.cpp @@ -87,6 +87,7 @@ void parallelFwdSub(std::vector& x, } ); #else +#warning Using tbb::parallel_do instead of tbb::parallel_for_each tbb::parallel_do( &top_left, &top_left+1, [&](const BlockIndex& bi, tbb::parallel_do_feeder& f) { auto [r, c] = bi; diff --git a/new_examples/migration/migrate_priorities.cpp b/new_examples/migration/migrate_priorities.cpp index 3b6cc43e51..e65bb4f5cf 100644 --- a/new_examples/migration/migrate_priorities.cpp +++ b/new_examples/migration/migrate_priorities.cpp @@ -71,6 +71,7 @@ void runParallelForWithHighPriority() { std::printf("\n"); } #else +#warning Using tbb::task directly auto P = tbb::task_scheduler_init::default_num_threads(); class MyTask : public tbb::task { diff --git a/new_examples/migration/migrate_recycling_tasks.cpp b/new_examples/migration/migrate_recycling_tasks.cpp index eae856a95b..f066c130a6 100644 --- a/new_examples/migration/migrate_recycling_tasks.cpp +++ b/new_examples/migration/migrate_recycling_tasks.cpp @@ -62,8 +62,8 @@ class FwdSubFunctor { const std::vector& a, std::vector& b, std::vector>& ref_count) : - my_tg(tg), my_N(N), my_num_blocks(num_blocks), - my_index(new BlockIndex{bi}), + my_tg(tg), my_index(new BlockIndex{bi}), + my_N(N), my_num_blocks(num_blocks), my_x(x), my_a(a), my_b(b), my_ref_count(ref_count) {} void operator()() const { @@ -123,6 +123,7 @@ void parallelFwdSub(std::vector& x, tg.wait(); } #else +#warning Using tbb::task directly with recycling using RootTask = tbb::empty_task; class FwdSubTask : public tbb::task { diff --git a/new_examples/migration/migrate_task_blocking.cpp b/new_examples/migration/migrate_task_blocking.cpp index b09edbaca3..0852e38108 100644 --- a/new_examples/migration/migrate_task_blocking.cpp +++ b/new_examples/migration/migrate_task_blocking.cpp @@ -28,6 +28,7 @@ void taskBlocking() { g.wait(); } #else +#warning Using tbb::task directly const int P = tbb::task_scheduler_init::default_num_threads(); class MyTask : public tbb::task { diff --git a/new_examples/migration/migrate_task_scheduler_init.cpp b/new_examples/migration/migrate_task_scheduler_init.cpp index 4e1114c6fb..3f2a54c80a 100644 --- a/new_examples/migration/migrate_task_scheduler_init.cpp +++ b/new_examples/migration/migrate_task_scheduler_init.cpp @@ -32,6 +32,7 @@ void setThreadsAndSlots() { }); } #else +#warning Using tbb::task_scheduler_init instead of tbb::global_control const int N = tbb::task_scheduler_init::default_num_threads(); void setThreadsAndSlots() { diff --git a/new_examples/migration/migrate_tasks_adding_work.cpp b/new_examples/migration/migrate_tasks_adding_work.cpp index 01686557e6..442b7eaed7 100644 --- a/new_examples/migration/migrate_tasks_adding_work.cpp +++ b/new_examples/migration/migrate_tasks_adding_work.cpp @@ -102,6 +102,7 @@ void parallelFwdSubTaskGroup(std::vector& x, } #if TBB_VERSION_MAJOR <= 2020 +#warning Using tbb::task directly using RootTask = tbb::empty_task; class FwdSubTask : public tbb::task { From 848c1e005f6cc0720758b2211aadb84076049047 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Sun, 17 Nov 2024 22:26:56 -0600 Subject: [PATCH 09/34] Changes based on reviews --- .../migration/migrate_bypass_tasks.cpp | 25 +++++++++++++++++++ .../migration/migrate_task_scheduler_init.cpp | 6 ++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/new_examples/migration/migrate_bypass_tasks.cpp b/new_examples/migration/migrate_bypass_tasks.cpp index 8675252269..fc35d6782b 100644 --- a/new_examples/migration/migrate_bypass_tasks.cpp +++ b/new_examples/migration/migrate_bypass_tasks.cpp @@ -54,6 +54,7 @@ void serialFwdSubTiled(std::vector& x, } } +#if TBB_VERSION_MAJOR > 2020 tbb::task_handle fwdSubTGBody(tbb::task_group& tg, int N, int num_blocks, const std::pair bi, @@ -86,6 +87,30 @@ tbb::task_handle fwdSubTGBody(tbb::task_group& tg, return deferred_task; } +#else +void fwdSubTGBody(tbb::task_group& tg, + int N, int num_blocks, + const std::pair bi, + std::vector& x, + const std::vector& a, + std::vector& b, + std::vector>& ref_count) { + auto [r, c] = bi; + computeBlock(N, r, c, x, a, b); + // add successor to right if ready + if (c + 1 <= r && --ref_count[r*num_blocks + c + 1] == 0) { + tg.run([&, N, num_blocks, r, c]() { + fwdSubTGBody(tg, N, num_blocks, BlockIndex(r, c+1), x, a, b, ref_count); + }); + } + // add succesor below if ready + if (r + 1 < (size_t)num_blocks && --ref_count[(r+1)*num_blocks + c] == 0) { + tg.run([&, N, num_blocks, r, c]() { + fwdSubTGBody(tg, N, num_blocks, BlockIndex(r+1, c), x, a, b, ref_count); + }); + } +} +#endif void parallelFwdSubTaskGroup(std::vector& x, const std::vector& a, diff --git a/new_examples/migration/migrate_task_scheduler_init.cpp b/new_examples/migration/migrate_task_scheduler_init.cpp index 3f2a54c80a..94db31548f 100644 --- a/new_examples/migration/migrate_task_scheduler_init.cpp +++ b/new_examples/migration/migrate_task_scheduler_init.cpp @@ -19,7 +19,7 @@ void doWork(double seconds); #if TBB_VERSION_MAJOR > 2020 -const int N = 2*tbb::info::default_concurrency(); +const int N = tbb::info::default_concurrency(); void setThreadsAndSlots() { tbb::global_control gc(tbb::global_control::max_allowed_parallelism, N); @@ -84,11 +84,11 @@ void dumpParticipation(int p) { sum += tid_participation[i]; std::cout << ", " << tid_participation[i]; } - for (int i = p; i < 2*N; ++i) + for (int i = p; i < N; ++i) std::cout << ", -"; std::cout << "]\n" << "sum == " << sum << "\n" - << "expected sum " << 2*10*N << "\n"; + << "expected sum " << 10*N << "\n"; clearParticipation(); } From e2e813ed5322bf8bd192daddc76d4f0bcd8dd88b Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Tue, 19 Nov 2024 21:04:18 -0600 Subject: [PATCH 10/34] Reordered example --- .../global_control_and_implicit_arena.cpp | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/new_examples/performance_tuning/global_control_and_implicit_arena.cpp b/new_examples/performance_tuning/global_control_and_implicit_arena.cpp index a1074dc147..e9a3dfc97a 100644 --- a/new_examples/performance_tuning/global_control_and_implicit_arena.cpp +++ b/new_examples/performance_tuning/global_control_and_implicit_arena.cpp @@ -14,11 +14,18 @@ limitations under the License. */ -#include #include const int default_P = tbb::info::default_concurrency(); -void doWork(double seconds); + +void noteParticipation(); /* record info for participation vector */ +void dumpParticipation(int p); /* display participation vector */ + +void doWork(double seconds) { + noteParticipation(); + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < seconds); +} void arenaGlobalControlImplicitArena(int p) { tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p); @@ -27,11 +34,19 @@ void arenaGlobalControlImplicitArena(int p) { [](int) { doWork(0.01); }); } + +int main() { + arenaGlobalControlImplicitArena(default_P); + dumpParticipation(default_P); + arenaGlobalControlImplicitArena(default_P/2); + dumpParticipation(default_P/2); + arenaGlobalControlImplicitArena(2*default_P); + dumpParticipation(2*default_P); + return 0; +} + #include -#include -#include -#include -#include +#include #include std::atomic next_tid; @@ -46,12 +61,6 @@ void noteParticipation() { ++tid_participation[t]; } -void doWork(double seconds) { - noteParticipation(); - tbb::tick_count t0 = tbb::tick_count::now(); - while ((tbb::tick_count::now() - t0).seconds() < seconds); -} - void clearParticipation() { next_tid = 0; my_tid.clear(); @@ -60,7 +69,6 @@ void clearParticipation() { } void dumpParticipation(int p) { - int end = next_tid; int sum = tid_participation[0]; std::cout << "[" << tid_participation[0]; for (int i = 1; i < std::min(p, default_P); ++i) { @@ -71,17 +79,9 @@ void dumpParticipation(int p) { std::cout << ", -"; std::cout << "]\n" << "sum == " << sum << "\n" - << "expected sum " << 10*default_P << "\n"; + << "expected sum " << 10*default_P << "\n\n"; clearParticipation(); } -int main() { - arenaGlobalControlImplicitArena(default_P); - dumpParticipation(default_P); - arenaGlobalControlImplicitArena(default_P/2); - dumpParticipation(default_P/2); - arenaGlobalControlImplicitArena(2*default_P); - dumpParticipation(2*default_P); - return 0; -} + From 971851a207288797f77668cc3f60a1d2cf971da7 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Tue, 19 Nov 2024 22:05:56 -0600 Subject: [PATCH 11/34] Modified bounds --- .../global_control_and_implicit_conflict.cpp | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp index bf3fa3b863..d05ac031d5 100644 --- a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp +++ b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp @@ -14,20 +14,29 @@ limitations under the License. */ -#include #include #include const int default_P = tbb::info::default_concurrency(); -void doWork(int inc, double seconds); + void waitUntil(int N); +void noteParticipation(int offset); +void dumpParticipation(int p); + +void doWork(int offset, double seconds) { + noteParticipation(offset); + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < seconds); +} -void arenaGlobalControlImplicitArena(int p, int inc) { +void arenaGlobalControlImplicitArena(int p, int offset) { tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p); tbb::parallel_for(0, 10*default_P, - [=](int) { doWork(inc, 0.01); }); + [=](int) { + doWork(offset, 0.01); + }); } void runTwoThreads(int p0, int p1) { @@ -43,29 +52,26 @@ void runTwoThreads(int p0, int p1) { t1.join(); } +int main() { + runTwoThreads(default_P/2, default_P); + dumpParticipation(default_P); + return 0; +} + #include -#include -#include -#include -#include +#include #include std::atomic next_tid; tbb::enumerable_thread_specific my_tid(-1); -std::vector> tid_participation(2*default_P); +std::vector> tid_participation(default_P); -void noteParticipation(int inc) { +void noteParticipation(int offset) { auto& t = my_tid.local(); if (t == -1) { t = next_tid++; } - tid_participation[t] += inc; -} - -void doWork(int inc, double seconds) { - noteParticipation(inc); - tbb::tick_count t0 = tbb::tick_count::now(); - while ((tbb::tick_count::now() - t0).seconds() < seconds); + tid_participation[t] += offset; } void clearParticipation() { @@ -76,10 +82,9 @@ void clearParticipation() { } void dumpParticipation(int p) { - int end = next_tid; int sum = tid_participation[0]; std::cout << "[" << tid_participation[0]; - for (int i = 1; i < 2*default_P + 1; ++i) { + for (int i = 1; i < default_P; ++i) { sum += tid_participation[i]; std::cout << ", " << tid_participation[i]; } @@ -96,9 +101,4 @@ void waitUntil(int N) { while (count_up != N); } -int main() { - runTwoThreads(default_P/2, default_P); - dumpParticipation(default_P); - return 0; -} From 58a111a384b4307497efabcde96b666f3b0475ae Mon Sep 17 00:00:00 2001 From: Michael Voss Date: Tue, 19 Nov 2024 22:17:07 -0600 Subject: [PATCH 12/34] Fixed vector size in example --- .../global_control_and_implicit_conflict.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp index d05ac031d5..ba72c3b654 100644 --- a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp +++ b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp @@ -21,7 +21,7 @@ const int default_P = tbb::info::default_concurrency(); void waitUntil(int N); void noteParticipation(int offset); -void dumpParticipation(int p); +void dumpParticipation(); void doWork(int offset, double seconds) { noteParticipation(offset); @@ -54,7 +54,7 @@ void runTwoThreads(int p0, int p1) { int main() { runTwoThreads(default_P/2, default_P); - dumpParticipation(default_P); + dumpParticipation(); return 0; } @@ -64,7 +64,7 @@ int main() { std::atomic next_tid; tbb::enumerable_thread_specific my_tid(-1); -std::vector> tid_participation(default_P); +std::vector> tid_participation(default_P+1); void noteParticipation(int offset) { auto& t = my_tid.local(); @@ -81,10 +81,10 @@ void clearParticipation() { p = 0; } -void dumpParticipation(int p) { +void dumpParticipation() { int sum = tid_participation[0]; std::cout << "[" << tid_participation[0]; - for (int i = 1; i < default_P; ++i) { + for (int i = 1; i < tid_participation.size(); ++i) { sum += tid_participation[i]; std::cout << ", " << tid_participation[i]; } From 90e2c48b8a400a63a86512629ded7769434fb8ed Mon Sep 17 00:00:00 2001 From: Michael Voss Date: Tue, 19 Nov 2024 22:56:54 -0600 Subject: [PATCH 13/34] Change barriers to ensure exact overlap of lifetimes --- .../global_control_and_implicit_conflict.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp index ba72c3b654..a51fe5de46 100644 --- a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp +++ b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp @@ -32,20 +32,24 @@ void doWork(int offset, double seconds) { void arenaGlobalControlImplicitArena(int p, int offset) { tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p); + // we use waitUntil to force overlap of the gc lifetimes + waitUntil(2); + tbb::parallel_for(0, 10*default_P, [=](int) { doWork(offset, 0.01); }); + + // we prevent either gc from being destroyed until both are done + waitUntil(2); } void runTwoThreads(int p0, int p1) { std::thread t0([=]() { - waitUntil(2); arenaGlobalControlImplicitArena(p0, 1); }); std::thread t1([=]() { - waitUntil(2); arenaGlobalControlImplicitArena(p1, 10000); }); t0.join(); @@ -99,6 +103,7 @@ std::atomic count_up = 0; void waitUntil(int N) { ++count_up; while (count_up != N); + count_up = 0; } From 4341288e8ae28d244c2eefa6d94059623a646c9e Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Wed, 20 Nov 2024 09:07:22 -0600 Subject: [PATCH 14/34] Fixed waitUntil --- .../global_control_and_implicit_conflict.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp index a51fe5de46..1ffe79901b 100644 --- a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp +++ b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp @@ -19,7 +19,8 @@ const int default_P = tbb::info::default_concurrency(); -void waitUntil(int N); +using counter_t = std::atomic; +void waitUntil(int N, counter_t& c); void noteParticipation(int offset); void dumpParticipation(); @@ -29,11 +30,13 @@ void doWork(int offset, double seconds) { while ((tbb::tick_count::now() - t0).seconds() < seconds); } +counter_t counter1 = 0, counter2 = 0; + void arenaGlobalControlImplicitArena(int p, int offset) { tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p); // we use waitUntil to force overlap of the gc lifetimes - waitUntil(2); + waitUntil(2, counter1); tbb::parallel_for(0, 10*default_P, @@ -42,7 +45,7 @@ void arenaGlobalControlImplicitArena(int p, int offset) { }); // we prevent either gc from being destroyed until both are done - waitUntil(2); + waitUntil(2, counter2); } void runTwoThreads(int p0, int p1) { @@ -96,14 +99,12 @@ void dumpParticipation() { << "sum == " << sum << "\n" << "expected sum == " << 10*default_P + 10*default_P*10000 << "\n"; clearParticipation(); + counter1 = 0; counter2 = 0; } -std::atomic count_up = 0; - -void waitUntil(int N) { - ++count_up; - while (count_up != N); - count_up = 0; +void waitUntil(int N, counter_t& c) { + ++c; + while (c != N); } From e254dea007e75947b3acb2674fd188a77f3a51ce Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Wed, 20 Nov 2024 09:18:43 -0600 Subject: [PATCH 15/34] Added whitespace in output --- .../performance_tuning/global_control_and_explicit_arena.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp index 93a8520b7e..6928dc6d70 100644 --- a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp +++ b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp @@ -75,7 +75,7 @@ void dumpParticipation(int p) { std::cout << ", -"; std::cout << "]\n" << "sum == " << sum << "\n" - << "expected sum " << 10*default_P << "\n"; + << "expected sum " << 10*default_P << "\n\n"; clearParticipation(); } From 3e6d92ba661f3166048475595d625831450a8260 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Wed, 20 Nov 2024 09:40:19 -0600 Subject: [PATCH 16/34] Made the conflict examples uniform --- .../global_control_and_explicit_conflict.cpp | 70 +++++++++---------- .../global_control_and_implicit_conflict.cpp | 10 +-- 2 files changed, 37 insertions(+), 43 deletions(-) diff --git a/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp index a46c9d1100..35f5b5022b 100644 --- a/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp +++ b/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp @@ -14,44 +14,55 @@ limitations under the License. */ -#include #include #include const int default_P = tbb::info::default_concurrency(); -void doWork(int inc, double seconds); -void waitUntil(int N); +using counter_t = std::atomic; +void waitUntil(int N, counter_t& c); +void noteParticipation(int offset); +void dumpParticipation(); -void arenaGlobalControlExplicitArena(int p, int inc) { +void doWork(int offset, double seconds) { + noteParticipation(offset); + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < seconds); +} + +counter_t counter1 = 0, counter2 = 0; + +void arenaGlobalControlExplicitArena(int p, int offset) { tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p); + // we use waitUntil to force overlap of the gc lifetimes + waitUntil(2, counter1); tbb::task_arena a{2*tbb::info::default_concurrency()}; a.execute([=]() { tbb::parallel_for(0, 10*tbb::info::default_concurrency(), - [=](int) { doWork(inc, 0.01); }); + [=](int) { doWork(offset, 0.01); }); }); + + // we prevent either gc from being destroyed until both are done + waitUntil(2, counter2); } void runTwoThreads(int p0, int p1) { - std::thread t0([=]() { - waitUntil(2); - arenaGlobalControlExplicitArena(p0, 1); - }); - std::thread t1([=]() { - waitUntil(2); - arenaGlobalControlExplicitArena(p1, 10000); - }); + std::thread t0([=]() { arenaGlobalControlExplicitArena(p0, 1); }); + std::thread t1([=]() { arenaGlobalControlExplicitArena(p1, 10000); }); t0.join(); t1.join(); } +int main() { + runTwoThreads(default_P/2, 2*default_P); + dumpParticipation(); + return 0; +} + #include -#include -#include -#include -#include +#include #include std::atomic next_tid; @@ -66,12 +77,6 @@ void noteParticipation(int inc) { tid_participation[t] += inc; } -void doWork(int inc, double seconds) { - noteParticipation(inc); - tbb::tick_count t0 = tbb::tick_count::now(); - while ((tbb::tick_count::now() - t0).seconds() < seconds); -} - void clearParticipation() { next_tid = 0; my_tid.clear(); @@ -79,11 +84,10 @@ void clearParticipation() { p = 0; } -void dumpParticipation(int p) { - int end = next_tid; +void dumpParticipation() { int sum = tid_participation[0]; std::cout << "[" << tid_participation[0]; - for (int i = 1; i < 2*default_P; ++i) { + for (int i = 1; i < tid_participation.size(); ++i) { sum += tid_participation[i]; std::cout << ", " << tid_participation[i]; } @@ -91,18 +95,12 @@ void dumpParticipation(int p) { << "sum == " << sum << "\n" << "expected sum == " << 10*default_P + 10*default_P*10000 << "\n"; clearParticipation(); + counter1 = 0; counter2 = 0; } -std::atomic count_up = 0; - -void waitUntil(int N) { - ++count_up; - while (count_up != N); +void waitUntil(int N, counter_t& c) { + ++c; + while (c != N); } -int main() { - runTwoThreads(default_P/2, 2*default_P); - dumpParticipation(2*default_P); - return 0; -} diff --git a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp index 1ffe79901b..8474c57de0 100644 --- a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp +++ b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp @@ -14,6 +14,7 @@ limitations under the License. */ +#include #include #include @@ -49,12 +50,8 @@ void arenaGlobalControlImplicitArena(int p, int offset) { } void runTwoThreads(int p0, int p1) { - std::thread t0([=]() { - arenaGlobalControlImplicitArena(p0, 1); - }); - std::thread t1([=]() { - arenaGlobalControlImplicitArena(p1, 10000); - }); + std::thread t0([=]() { arenaGlobalControlImplicitArena(p0, 1); }); + std::thread t1([=]() { arenaGlobalControlImplicitArena(p1, 10000); }); t0.join(); t1.join(); } @@ -65,7 +62,6 @@ int main() { return 0; } -#include #include #include From 8d0aff817127a51fcb0fdc7feb5a96e050ea707b Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Wed, 20 Nov 2024 09:52:56 -0600 Subject: [PATCH 17/34] Minor refactoring --- .../global_control_and_explicit_arena.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp index 6928dc6d70..65443ab64e 100644 --- a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp +++ b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp @@ -14,7 +14,6 @@ limitations under the License. */ -#include #include const int default_P = tbb::info::default_concurrency(); @@ -33,10 +32,7 @@ void arenaGlobalControlExplicitArena(int p) { } #include -#include -#include -#include -#include +#include #include std::atomic next_tid; From 84eaf9bf8eb9962d30b279f6843269d9faf3e966 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Wed, 20 Nov 2024 10:35:19 -0600 Subject: [PATCH 18/34] Cleaned up priority example output --- .../priorities_and_conflict.cpp | 56 ++++++++++--------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/new_examples/performance_tuning/priorities_and_conflict.cpp b/new_examples/performance_tuning/priorities_and_conflict.cpp index b8ae8117aa..c52f2b38e3 100644 --- a/new_examples/performance_tuning/priorities_and_conflict.cpp +++ b/new_examples/performance_tuning/priorities_and_conflict.cpp @@ -14,18 +14,24 @@ limitations under the License. */ -#include + #include #include void printArrival(tbb::task_arena::priority priority); -void waitUntil(int N); + +using counter_t = std::atomic; +counter_t counter = 0; +void waitUntil(int N, counter_t& c) { + ++c; + while (c != N); +} void explicitArenaWithPriority(tbb::task_arena::priority priority) { tbb::task_arena a{tbb::info::default_concurrency(), 1, priority}; a.execute([=]() { tbb::parallel_for(0, - 10*tbb::info::default_concurrency(), + 2*tbb::info::default_concurrency(), [=](int) { printArrival(priority); }); }); } @@ -33,11 +39,11 @@ void explicitArenaWithPriority(tbb::task_arena::priority priority) { void runTwoThreads(tbb::task_arena::priority priority0, tbb::task_arena::priority priority1) { std::thread t0([=]() { - waitUntil(2); + waitUntil(2, counter); explicitArenaWithPriority(priority0); }); std::thread t1([=]() { - waitUntil(2); + waitUntil(2, counter); explicitArenaWithPriority(priority1); }); t0.join(); @@ -46,16 +52,31 @@ void runTwoThreads(tbb::task_arena::priority priority0, #include +int main() { + counter = 0; + std::printf("\n\n\n\nrunTwoThreads with low (.) and high (|)\n"); + runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::high); + + counter = 0; + std::printf("\n\n\n\nrunTwoThreads with low (.) and normal (:)\n"); + runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::normal); + + counter = 0; + std::printf("\n\n\n\nrunTwoThreads with normal (:) and high (|)\n"); + runTwoThreads(tbb::task_arena::priority::normal, tbb::task_arena::priority::high); + return 0; +} + void printArrival(tbb::task_arena::priority priority) { switch (priority) { case tbb::task_arena::priority::low: - std::printf(" low "); + std::printf("."); break; case tbb::task_arena::priority::normal: - std::printf(" normal "); + std::printf(":"); break; case tbb::task_arena::priority::high: - std::printf(" high "); + std::printf("|"); break; default: break; @@ -65,24 +86,5 @@ void printArrival(tbb::task_arena::priority priority) { while ((tbb::tick_count::now() - t0).seconds() < 0.01); } -std::atomic count_up = 0; -void waitUntil(int N) { - ++count_up; - while (count_up != N); -} - -int main() { - count_up = 0; - std::printf("\n\n\n\nrunTwoThreads(low, high)"); - runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::high); - - count_up = 0; - std::printf("\n\n\n\nrunTwoThreads(low, normal)"); - runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::normal); - count_up = 0; - std::printf("\n\n\n\nrunTwoThreads(normal, high)"); - runTwoThreads(tbb::task_arena::priority::normal, tbb::task_arena::priority::high); - return 0; -} From 4c3d84f4ceadaebae7bd353a99bd3f7c10aabd93 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Wed, 20 Nov 2024 19:24:07 -0600 Subject: [PATCH 19/34] Fixed output for priorities example --- .../performance_tuning/priorities_and_conflict.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/new_examples/performance_tuning/priorities_and_conflict.cpp b/new_examples/performance_tuning/priorities_and_conflict.cpp index c52f2b38e3..e7a1771d37 100644 --- a/new_examples/performance_tuning/priorities_and_conflict.cpp +++ b/new_examples/performance_tuning/priorities_and_conflict.cpp @@ -54,16 +54,17 @@ void runTwoThreads(tbb::task_arena::priority priority0, int main() { counter = 0; - std::printf("\n\n\n\nrunTwoThreads with low (.) and high (|)\n"); + std::printf("\n\nrunTwoThreads with low (.) and high (|)\n"); runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::high); counter = 0; - std::printf("\n\n\n\nrunTwoThreads with low (.) and normal (:)\n"); + std::printf("\n\nrunTwoThreads with low (.) and normal (:)\n"); runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::normal); counter = 0; - std::printf("\n\n\n\nrunTwoThreads with normal (:) and high (|)\n"); + std::printf("\n\nrunTwoThreads with normal (:) and high (|)\n"); runTwoThreads(tbb::task_arena::priority::normal, tbb::task_arena::priority::high); + std::printf("\n"); return 0; } From 75f28906bc3b25e9593ec2339b2d57b5f9535f31 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Wed, 20 Nov 2024 21:38:03 -0600 Subject: [PATCH 20/34] Removed tracing and added output to constraints example --- .../performance_tuning/constraints.cpp | 43 +++++++------------ 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/new_examples/performance_tuning/constraints.cpp b/new_examples/performance_tuning/constraints.cpp index 42800c6205..02ed66e897 100644 --- a/new_examples/performance_tuning/constraints.cpp +++ b/new_examples/performance_tuning/constraints.cpp @@ -14,23 +14,15 @@ limitations under the License. */ -#include "tbb/tbb.h" #include +#include -#define USE_ARENA_TRACE 1 -#if USE_ARENA_TRACE -#include "arena_trace.h" -#endif int N = 1000; double w = 0.01; double f(double v); void constrain_for_numa_nodes() { -#if USE_ARENA_TRACE - arena_tracer t{"numa_trace.json"}; -#endif - std::vector numa_nodes = tbb::info::numa_nodes(); std::vector arenas(numa_nodes.size()); std::vector task_groups(numa_nodes.size()); @@ -41,14 +33,18 @@ void constrain_for_numa_nodes() { t.add_arena(std::to_string(i), arenas[i]); #endif } - for (int i = 0; i < numa_nodes.size(); i++) { - arenas[i].execute([&task_groups, i] { + for (int i = 1; i < numa_nodes.size(); i++) { + arenas[i].enqueue([&task_groups, i] { task_groups[i].run([] { tbb::parallel_for(0, N, [](int j) { f(w); }); }); }); } - for (int i = 0; i < numa_nodes.size(); i++) { + arenas[0].execute([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }); + + for (int i = 1; i < numa_nodes.size(); i++) { arenas[i].execute([&task_groups, i] { task_groups[i].wait(); }); @@ -56,17 +52,11 @@ void constrain_for_numa_nodes() { } void constrain_for_core_type() { - std::vector core_types = tbb::info::core_types(); tbb::task_arena arena( tbb::task_arena::constraints{}.set_core_type(core_types.back()) ); - #if USE_ARENA_TRACE - arena_tracer t{"core_trace.json"}; - t.add_arena("pcores", arena); - #endif - arena.execute([] { tbb::parallel_for(0, N, [](int) { f(w); }); }); @@ -79,11 +69,6 @@ void constrain_for_no_hyperthreading() { c.set_max_threads_per_core(1); tbb::task_arena no_ht_arena(c); - #if USE_ARENA_TRACE - arena_tracer t{"no_ht_constraints_trace.json"}; - t.add_arena("no_ht_arena", no_ht_arena); - #endif - no_ht_arena.execute( [] { tbb::parallel_for(0, N, [](int) { f(w); }); }); @@ -97,21 +82,23 @@ void limit_concurrency_for_no_hyperthreading() { int no_ht_concurrency = tbb::info::default_concurrency(c); tbb::task_arena arena( no_ht_concurrency ); - #if USE_ARENA_TRACE - arena_tracer t{"no_ht_concurrency_trace.json"}; - t.add_arena("no_ht_concurrency", arena); - #endif - arena.execute( [] { tbb::parallel_for(0, N, [](int) { f(w); }); }); } +#include + int main() { + std::cout << "Running numa node constraint example\n"; constrain_for_numa_nodes(); + std::cout << "Running core type constraint example\n"; constrain_for_core_type(); + std::cout << "Running one thread per core constraint example\n"; constrain_for_no_hyperthreading(); + std::cout << "Running limited concurrency example\n"; limit_concurrency_for_no_hyperthreading(); + std::cout << "done\n"; return 0; } From e810f96f6b1fec108d0ab9c736fda53ece4bb545 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 21 Nov 2024 09:07:49 -0600 Subject: [PATCH 21/34] Use defer for NUMA example --- .../performance_tuning/constraints.cpp | 49 +++++++++---------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/new_examples/performance_tuning/constraints.cpp b/new_examples/performance_tuning/constraints.cpp index 02ed66e897..6df33f9366 100644 --- a/new_examples/performance_tuning/constraints.cpp +++ b/new_examples/performance_tuning/constraints.cpp @@ -23,32 +23,31 @@ double w = 0.01; double f(double v); void constrain_for_numa_nodes() { - std::vector numa_nodes = tbb::info::numa_nodes(); - std::vector arenas(numa_nodes.size()); - std::vector task_groups(numa_nodes.size()); - - for (int i = 0; i < numa_nodes.size(); i++) { - arenas[i].initialize(tbb::task_arena::constraints(numa_nodes[i]), 0); - #if USE_ARENA_TRACE - t.add_arena(std::to_string(i), arenas[i]); - #endif - } - for (int i = 1; i < numa_nodes.size(); i++) { - arenas[i].enqueue([&task_groups, i] { - task_groups[i].run([] { - tbb::parallel_for(0, N, [](int j) { f(w); }); - }); - }); - } - arenas[0].execute([] { - tbb::parallel_for(0, N, [](int j) { f(w); }); - }); + std::vector numa_nodes = tbb::info::numa_nodes(); + std::vector arenas(numa_nodes.size()); + std::vector task_groups(numa_nodes.size()); + + // initialize each arena, each constrained to a different NUMA node + for (int i = 0; i < numa_nodes.size(); i++) + arenas[i].initialize(tbb::task_arena::constraints(numa_nodes[i]), 0); + + // enqueue work to all but the first arena, using the task_group to track work + // by using defer, the task_group reference count is incremented immediately + for (int i = 1; i < numa_nodes.size(); i++) + arenas[i].enqueue( + task_groups[i].defer([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }) + ); + + // directly execute the work to completion in the remaining arena + arenas[0].execute([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }); - for (int i = 1; i < numa_nodes.size(); i++) { - arenas[i].execute([&task_groups, i] { - task_groups[i].wait(); - }); - } + // join the other arenas to wait on their task_groups + for (int i = 1; i < numa_nodes.size(); i++) + arenas[i].execute([&task_groups, i] { task_groups[i].wait(); }); } void constrain_for_core_type() { From ab39fd31f052b0866cd3d6cd610a911fac6024ff Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 21 Nov 2024 10:36:44 -0600 Subject: [PATCH 22/34] Added observer example --- .../task_scheduler_observer.cpp | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 new_examples/performance_tuning/task_scheduler_observer.cpp diff --git a/new_examples/performance_tuning/task_scheduler_observer.cpp b/new_examples/performance_tuning/task_scheduler_observer.cpp new file mode 100644 index 0000000000..8fbb367146 --- /dev/null +++ b/new_examples/performance_tuning/task_scheduler_observer.cpp @@ -0,0 +1,113 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include +#include + +// these are placeholder for where we would put OS-specific types and calls +using affinity_mask_t = std::string; +void set_thread_affinity( int tid, const affinity_mask_t& mask ) { + std::ostringstream buffer; + buffer << std::this_thread::get_id() + << " -> (" << tid + << ", " << mask << ")\n"; + std::cout << buffer.str(); +} +void restore_thread_affinity() { + std::ostringstream buffer; + buffer << std::this_thread::get_id() + << " -> (restored)\n"; + std::cout << buffer.str(); +} + +// observer class +class PinningObserver : public tbb::task_scheduler_observer { +public: + // HW affinity mask to be used for threads in an arena + affinity_mask_t m_mask; + PinningObserver( oneapi::tbb::task_arena &a, const affinity_mask_t& mask ) + : tbb::task_scheduler_observer(a), m_mask(mask) { + observe(true); // activate the observer + } + void on_scheduler_entry( bool worker ) override { + set_thread_affinity( + tbb::this_task_arena::current_thread_index(), m_mask); + } + void on_scheduler_exit( bool worker ) override { + restore_thread_affinity(); + } +}; + +int N = 1000; +double w = 0.01; +double f(double v); + +using counter_t = std::atomic; +counter_t counter = 0; +void waitUntil(int N, counter_t& c) { + ++c; + while (c != N); +} + +void observeTwoArenas() { + int P = tbb::info::default_concurrency(); + + // two arenas, each with half the hw threads + tbb::task_arena a0(P/2); + tbb::task_arena a1(P/2); + + PinningObserver obs0(a0, "mask_zero"); + PinningObserver obs1(a1, "mask_one"); + + // Execute consecutive loops + std::cout << "Execute a0 loop\n"; + a0.execute([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }); + std::cout << "Execute a1 loop\n"; + a1.execute([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }); + + // Execute concurrent loops + std::cout << "Execute a0 and a1 concurrently\n"; + std::thread t0([&]() { + waitUntil(2, counter); + a0.execute([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }); + }); + std::thread t1([&]() { + waitUntil(2, counter); + a1.execute([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }); + }); + t0.join(); + t1.join(); +} + +int main() { + observeTwoArenas(); + return 0; +} + +double f(double v) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + return 2*v; +} From e6e5bf1ee09ffde01d12d678731feba9a4edfcd0 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 21 Nov 2024 10:48:15 -0600 Subject: [PATCH 23/34] Updated CMakeLists --- new_examples/CMakeLists.txt | 13 ++++++++-- new_examples/algorithms/CMakeLists.txt | 3 --- new_examples/cancellation/CMakeLists.txt | 3 --- new_examples/exception/CMakeLists.txt | 3 --- new_examples/graph/CMakeLists.txt | 5 ++-- new_examples/intro/CMakeLists.txt | 3 --- new_examples/migration/CMakeLists.txt | 6 ++--- .../performance_tuning/CMakeLists.txt | 26 ++++++++++++------- new_examples/tasks/CMakeLists.txt | 3 --- 9 files changed, 32 insertions(+), 33 deletions(-) diff --git a/new_examples/CMakeLists.txt b/new_examples/CMakeLists.txt index 6651073a6e..77b8cdbe93 100644 --- a/new_examples/CMakeLists.txt +++ b/new_examples/CMakeLists.txt @@ -2,8 +2,17 @@ cmake_minimum_required (VERSION 3.4) project(tbb_tutorials LANGUAGES CXX) -set(CMAKE_CXX_COMPILER "icx-cl") -set(CMAKE_LINKER "icpx") +if (WIN32) + set(CMAKE_CXX_COMPILER "icx-cl") + set(CMAKE_LINKER "icpx") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb") + set(CMAKE_CXX_LINKER_FLAGS "-Qtbb") +else() + set(CMAKE_CXX_COMPILER "icpx") + set(CMAKE_LINKER "icpx") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") + set(CMAKE_CXX_LINKER_FLAGS "-tbb -lpthread") +endif() set(CMAKE_CXX_STANDARD 20) include (CTest) diff --git a/new_examples/algorithms/CMakeLists.txt b/new_examples/algorithms/CMakeLists.txt index f4f0aa9eca..519baa1ae7 100644 --- a/new_examples/algorithms/CMakeLists.txt +++ b/new_examples/algorithms/CMakeLists.txt @@ -1,6 +1,3 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") -set(CMAKE_CXX_LINKER_FLAGS "-tbb") - foreach(tpp parallel_invoke_recursive_quicksort.cpp parallel_invoke_two_quicksorts.cpp parallel_for_trivial.cpp parallel_for_unoptimized_mxm.cpp parallel_reduce_max.cpp parallel_reduce_pi.cpp diff --git a/new_examples/cancellation/CMakeLists.txt b/new_examples/cancellation/CMakeLists.txt index 0f26004913..adebb4e100 100644 --- a/new_examples/cancellation/CMakeLists.txt +++ b/new_examples/cancellation/CMakeLists.txt @@ -1,6 +1,3 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") -set(CMAKE_CXX_LINKER_FLAGS "-tbb") - foreach(tpp cancel_group_execution1.cpp cancel_group_execution2.cpp cancel_group_execution3.cpp diff --git a/new_examples/exception/CMakeLists.txt b/new_examples/exception/CMakeLists.txt index 08105e42c9..444f2c077a 100644 --- a/new_examples/exception/CMakeLists.txt +++ b/new_examples/exception/CMakeLists.txt @@ -1,6 +1,3 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") -set(CMAKE_CXX_LINKER_FLAGS "-tbb") - foreach(tpp exception_catch1.cpp exception_catch2.cpp exception_catch3.cpp) diff --git a/new_examples/graph/CMakeLists.txt b/new_examples/graph/CMakeLists.txt index 1c971bd436..31ce92e2c7 100644 --- a/new_examples/graph/CMakeLists.txt +++ b/new_examples/graph/CMakeLists.txt @@ -1,5 +1,5 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") -set(CMAKE_CXX_LINKER_FLAGS "-tbb") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb") +set(CMAKE_CXX_LINKER_FLAGS "-Qtbb") foreach(tpp graph_composite_node.cpp graph_execute_while_building.cpp @@ -10,6 +10,7 @@ foreach(tpp graph_composite_node.cpp graph_node_priorities.cpp graph_reestablish_order.cpp graph_small_nodes.cpp + graph_stereoscopic_3d.cpp graph_two_nodes.cpp graph_two_nodes_deduced.cpp graph_with_join.cpp) diff --git a/new_examples/intro/CMakeLists.txt b/new_examples/intro/CMakeLists.txt index 1efda6b038..9e44001630 100644 --- a/new_examples/intro/CMakeLists.txt +++ b/new_examples/intro/CMakeLists.txt @@ -1,6 +1,3 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") -set(CMAKE_CXX_LINKER_FLAGS "-tbb") - foreach(tpp intro_pi.cpp intro_pi_timing.cpp intro_flowgraph.cpp diff --git a/new_examples/migration/CMakeLists.txt b/new_examples/migration/CMakeLists.txt index d52df94a67..f489f910cb 100644 --- a/new_examples/migration/CMakeLists.txt +++ b/new_examples/migration/CMakeLists.txt @@ -1,7 +1,5 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb -pthread") -set(CMAKE_CXX_LINKER_FLAGS "-tbb -pthread") - -foreach(tpp migrate_task_scheduler_init.cpp +foreach(tpp migrate_atomics.cpp + migrate_task_scheduler_init.cpp migrate_parallel_do.cpp migrate_priorities.cpp migrate_task_blocking.cpp diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt index ff880d251b..c87ef75921 100644 --- a/new_examples/performance_tuning/CMakeLists.txt +++ b/new_examples/performance_tuning/CMakeLists.txt @@ -1,17 +1,23 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb") -set(CMAKE_CXX_LINKER_FLAGS "-Qtbb") +if (WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb") + set(CMAKE_CXX_LINKER_FLAGS "-Qtbb") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") + set(CMAKE_CXX_LINKER_FLAGS "-tbb -lpthread") +endif() -foreach(tpp global_control_and_implicit_arena.cpp - global_control_and_explicit_arena.cpp - global_control_and_implicit_conflict.cpp - global_control_and_explicit_conflict.cpp - priorities_and_conflict.cpp - blocked_ranges_trivial.cpp +foreach(tpp blocked_ranges_trivial.cpp constraints.cpp + global_control_and_explicit_arena.cpp + global_control_and_explicit_conflict.cpp + global_control_and_implicit_arena.cpp + global_control_and_implicit_conflict.cpp + parallel_for_addition_partitioners.cpp parallel_for_spin_partitioners.cpp parallel_for_spin_partitioners_timed.cpp - parallel_for_addition_partitioners.cpp - parallel_for_transpose_partitioners.cpp) + parallel_for_transpose_partitioners.cpp + priorities_and_conflict.cpp + task_scheduler_observer.cpp) string(REPLACE ".cpp" "" texe ${tpp}) add_executable(${texe} ${tpp}) target_include_directories(${texe} PUBLIC diff --git a/new_examples/tasks/CMakeLists.txt b/new_examples/tasks/CMakeLists.txt index 79d468791d..3a1af18ea0 100644 --- a/new_examples/tasks/CMakeLists.txt +++ b/new_examples/tasks/CMakeLists.txt @@ -1,6 +1,3 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb -pthread") -set(CMAKE_CXX_LINKER_FLAGS "-tbb -pthread") - foreach(tpp parallel_invoke_fib.cpp task_group_fib.cpp task_group_poor_scaling.cpp From 20979fabe21d3a87b0558ff39d468545a1384fb6 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 21 Nov 2024 10:52:24 -0600 Subject: [PATCH 24/34] Fixed graph CMakeLists.txt --- new_examples/graph/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/new_examples/graph/CMakeLists.txt b/new_examples/graph/CMakeLists.txt index 31ce92e2c7..5f2114c42e 100644 --- a/new_examples/graph/CMakeLists.txt +++ b/new_examples/graph/CMakeLists.txt @@ -1,6 +1,3 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb") -set(CMAKE_CXX_LINKER_FLAGS "-Qtbb") - foreach(tpp graph_composite_node.cpp graph_execute_while_building.cpp graph_fwd_substitution.cpp From 56d7b5ff8c8950a7e89bd23bb0d8ef314aa204ae Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 21 Nov 2024 17:44:27 -0600 Subject: [PATCH 25/34] Updated partitioner examples --- .../parallel_for_spin_partitioners.cpp | 6 ++---- .../parallel_for_spin_partitioners_timed.cpp | 19 +------------------ 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp b/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp index 5de939e84a..b2d57de755 100644 --- a/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp +++ b/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp @@ -23,8 +23,7 @@ template void pforWork(int N, const Partitioner& p) { tbb::parallel_for( tbb::blocked_range(0, N, 1), [](const tbb::blocked_range& r) { - int ie = r.end(); - for (int i = r.begin(); i < ie; ++i) { + for (int i = r.begin(); i < r.end(); ++i) { doWork(i); } }, p @@ -35,8 +34,7 @@ template void pforWork(int N, Partitioner& p) { tbb::parallel_for( tbb::blocked_range(0, N, 1), [](const tbb::blocked_range& r) { - int ie = r.end(); - for (int i = r.begin(); i < ie; ++i) { + for (int i = r.begin(); i < r.end(); ++i) { doWork(i); } }, p diff --git a/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp b/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp index 64272005af..51216fabfd 100644 --- a/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp +++ b/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp @@ -41,8 +41,7 @@ static inline double executePfor(int num_trials, int N, tbb::parallel_for ( tbb::blocked_range{0, N, static_cast(gs)}, [tpi](const tbb::blocked_range& r) { - int e = r.end(); - for (int i = r.begin(); i < e; ++i) { + for (int i = r.begin(); i < r.end(); ++i) { spinWaitForAtLeast(tpi); } }, @@ -53,8 +52,6 @@ static inline double executePfor(int num_trials, int N, return (t1 - t0).seconds()/num_trials; } -#define CONSTRAIN_TO_ECORES 1 - int main() { tbb::auto_partitioner auto_p; tbb::simple_partitioner simple_p; @@ -67,16 +64,6 @@ int main() { const double twenty_us = 0.00002; double timing[4][19]; - #if CONSTRAIN_TO_ECORES - std::vector core_types = tbb::info::core_types(); - tbb::task_arena::constraints c; - c.set_core_type(core_types.front()); - c.set_max_concurrency(tbb::info::default_concurrency(c) - 2); - tbb::task_arena ecore_arena(c); - std::cout << "Using arena with " << ecore_arena.max_concurrency() << " slots\n"; - ecore_arena.execute([&]() { - #endif - for (double tpi = ten_ns; tpi < twenty_us; tpi *= 10) { std::cout << "Speedups for " << tpi << " seconds per iteration" << std::endl << "partitioner"; @@ -103,10 +90,6 @@ int main() { std::cout << std::endl; } - #if CONSTRAIN_TO_ECORES - }); - #endif - return 0; } From 466cbbaa6ae348e1242da7891fb4ed9f1ebc8d38 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 21 Nov 2024 18:14:05 -0600 Subject: [PATCH 26/34] Renamed files --- new_examples/performance_tuning/CMakeLists.txt | 4 ++-- ...or_spin_partitioners.cpp => parallel_for_partitioners.cpp} | 0 ...itioners_timed.cpp => parallel_for_partitioners_timed.cpp} | 0 new_examples/performance_tuning/task_scheduler_observer.cpp | 1 + 4 files changed, 3 insertions(+), 2 deletions(-) rename new_examples/performance_tuning/{parallel_for_spin_partitioners.cpp => parallel_for_partitioners.cpp} (100%) rename new_examples/performance_tuning/{parallel_for_spin_partitioners_timed.cpp => parallel_for_partitioners_timed.cpp} (100%) diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt index c87ef75921..e5f7c92da6 100644 --- a/new_examples/performance_tuning/CMakeLists.txt +++ b/new_examples/performance_tuning/CMakeLists.txt @@ -13,8 +13,8 @@ foreach(tpp blocked_ranges_trivial.cpp global_control_and_implicit_arena.cpp global_control_and_implicit_conflict.cpp parallel_for_addition_partitioners.cpp - parallel_for_spin_partitioners.cpp - parallel_for_spin_partitioners_timed.cpp + parallel_for_partitioners.cpp + parallel_for_partitioners_timed.cpp parallel_for_transpose_partitioners.cpp priorities_and_conflict.cpp task_scheduler_observer.cpp) diff --git a/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp b/new_examples/performance_tuning/parallel_for_partitioners.cpp similarity index 100% rename from new_examples/performance_tuning/parallel_for_spin_partitioners.cpp rename to new_examples/performance_tuning/parallel_for_partitioners.cpp diff --git a/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp b/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp similarity index 100% rename from new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp rename to new_examples/performance_tuning/parallel_for_partitioners_timed.cpp diff --git a/new_examples/performance_tuning/task_scheduler_observer.cpp b/new_examples/performance_tuning/task_scheduler_observer.cpp index 8fbb367146..b6f319eac5 100644 --- a/new_examples/performance_tuning/task_scheduler_observer.cpp +++ b/new_examples/performance_tuning/task_scheduler_observer.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include From b8cba8cf9a74e8d775d037b7a056fdf6674bd71c Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 21 Nov 2024 21:54:00 -0600 Subject: [PATCH 27/34] Added constraints for more reproducible results --- .../parallel_for_partitioners_timed.cpp | 80 +++++++----- .../parallel_for_transpose_partitioners.cpp | 117 +++++++++--------- 2 files changed, 105 insertions(+), 92 deletions(-) diff --git a/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp b/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp index 51216fabfd..7155b0caa9 100644 --- a/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp +++ b/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp @@ -53,42 +53,56 @@ static inline double executePfor(int num_trials, int N, } int main() { - tbb::auto_partitioner auto_p; - tbb::simple_partitioner simple_p; - tbb::static_partitioner static_p; - const std::string pname[4] = {"simple", "auto", "affinity", "static"}; - - const int N = 262144; - const int T = 20; - const double ten_ns = 0.00000001; - const double twenty_us = 0.00002; - double timing[4][19]; - - for (double tpi = ten_ns; tpi < twenty_us; tpi *= 10) { - std::cout << "Speedups for " << tpi << " seconds per iteration" << std::endl - << "partitioner"; - for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) - std::cout << ", " << gs; - std::cout << std::endl; - - double serial_time = executeFor(T, N, tpi); - - for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) { - tbb::affinity_partitioner affinity_p; - spinWaitForAtLeast(0.001); - timing[0][i] = executePfor(T, N, gs, simple_p, tpi); - timing[1][i] = executePfor(T, N, gs, auto_p, tpi); - timing[2][i] = executePfor(T, N, gs, affinity_p, tpi); - timing[3][i] = executePfor(T, N, gs, static_p, tpi); - } - for (int p = 0; p < 4; ++p) { - std::cout << pname[p]; + // use the most performance codes + // only a single NUMA node + // and only 1 thread per core + tbb::task_arena::constraints c; + c.set_numa_id(tbb::info::numa_nodes()[0]); + c.set_core_type(tbb::info::core_types().back()); + c.set_max_threads_per_core(1); + c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c))); + tbb::task_arena a(c); + + std::cout << "Using an arena with " << a.max_concurrency() << " slots\n"; + + a.execute([&]() { + tbb::auto_partitioner auto_p; + tbb::simple_partitioner simple_p; + tbb::static_partitioner static_p; + const std::string pname[4] = {"simple", "auto", "affinity", "static"}; + + const int N = 262144; + const int T = 20; + const double ten_ns = 0.00000001; + const double twenty_us = 0.00002; + double timing[4][19]; + + for (double tpi = ten_ns; tpi < twenty_us; tpi *= 10) { + std::cout << "Speedups for " << tpi << " seconds per iteration" << std::endl + << "partitioner"; for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) - std::cout << ", " << serial_time/timing[p][i]; + std::cout << ", " << gs; + std::cout << std::endl; + + double serial_time = executeFor(T, N, tpi); + + for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) { + tbb::affinity_partitioner affinity_p; + spinWaitForAtLeast(0.001); + timing[0][i] = executePfor(T, N, gs, simple_p, tpi); + timing[1][i] = executePfor(T, N, gs, auto_p, tpi); + timing[2][i] = executePfor(T, N, gs, affinity_p, tpi); + timing[3][i] = executePfor(T, N, gs, static_p, tpi); + } + for (int p = 0; p < 4; ++p) { + std::cout << pname[p]; + for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) + std::cout << ", " << serial_time/timing[p][i]; + std::cout << std::endl; + } std::cout << std::endl; } - std::cout << std::endl; - } + }); return 0; } diff --git a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp index 45f68f4391..734bbd752b 100644 --- a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp +++ b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp @@ -15,7 +15,7 @@ */ #include -#include + double serialTranspose(int N, double *a, double *b) { tbb::tick_count t0 = tbb::tick_count::now(); @@ -99,71 +99,70 @@ double pforTranspose2d(int N, double *a, double *b, int gs) { void setArray(int N, double *a); void checkTranspose(int N, double *a); -#define CONSTRAIN_TO_ECORES 1 +#include int main() { - int N = 2<<12; // 8192 - double *a = new double[N*N]; - double *b = new double[N*N]; - setArray(N, a); - setArray(N, b); - - #if CONSTRAIN_TO_ECORES - std::vector core_types = tbb::info::core_types(); + // use the most performance codes + // only a single NUMA node + // and only 1 thread per core tbb::task_arena::constraints c; - c.set_core_type(core_types.front()); - c.set_max_concurrency(tbb::info::default_concurrency(c) - 2); - tbb::task_arena ecore_arena(c); - std::cout << "Using arena with " << ecore_arena.max_concurrency() << " slots\n"; - ecore_arena.execute([&]() { - #endif - - serialTranspose(N, a, b); - double ts = serialTranspose(N, a, b); - checkTranspose(N, b); - std::cout << "Serial Time = " << ts << std::endl; - - std::cout << "Parallel Times:" << std::endl - << "grainsize, oblivious, 1d auto, 1d simple, 2d auto, 2d simple" << std::endl; - for (int gs = 1; gs <= N; gs *= 2) { - setArray(N, a); - setArray(N, b); - serialObliviousTranspose(N, a, b, gs); - double to = serialObliviousTranspose(N, a, b, gs); - checkTranspose(N, b); - - setArray(N, a); - setArray(N, b); - pforTranspose(N, a, b, gs); - double t1d_auto = pforTranspose(N, a, b, gs); - + c.set_numa_id(tbb::info::numa_nodes()[0]); + c.set_core_type(tbb::info::core_types().back()); + c.set_max_threads_per_core(1); + c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c))); + tbb::task_arena a(c); + + std::cout << "Using an arena with " << a.max_concurrency() << " slots\n"; + + a.execute([&]() { + int N = 2<<12; // 8192 + double *a = new double[N*N]; + double *b = new double[N*N]; setArray(N, a); setArray(N, b); - pforTranspose(N, a, b, gs); - double t1d_simple = pforTranspose(N, a, b, gs); - setArray(N, a); - setArray(N, b); - pforTranspose2d(N, a, b, gs); - double t2d_auto = pforTranspose2d(N, a, b, gs); - - setArray(N, a); - setArray(N, b); - pforTranspose2d(N, a, b, gs); - double t2d_simple = pforTranspose2d(N, a, b, gs); - - std::cout << gs - << ", " << to - << ", " << t1d_auto - << ", " << t1d_simple - << ", " << t2d_auto - << ", " << t2d_simple << std::endl; - } - - #if CONSTRAIN_TO_ECORES + serialTranspose(N, a, b); + double ts = serialTranspose(N, a, b); + checkTranspose(N, b); + std::cout << "Serial Time = " << ts << std::endl; + + std::cout << "Parallel Times:" << std::endl + << "grainsize, oblivious, 1d auto, 1d simple, 2d auto, 2d simple" << std::endl; + for (int gs = 1; gs <= N; gs *= 2) { + setArray(N, a); + setArray(N, b); + serialObliviousTranspose(N, a, b, gs); + double to = serialObliviousTranspose(N, a, b, gs); + checkTranspose(N, b); + + setArray(N, a); + setArray(N, b); + pforTranspose(N, a, b, gs); + double t1d_auto = pforTranspose(N, a, b, gs); + + setArray(N, a); + setArray(N, b); + pforTranspose(N, a, b, gs); + double t1d_simple = pforTranspose(N, a, b, gs); + + setArray(N, a); + setArray(N, b); + pforTranspose2d(N, a, b, gs); + double t2d_auto = pforTranspose2d(N, a, b, gs); + + setArray(N, a); + setArray(N, b); + pforTranspose2d(N, a, b, gs); + double t2d_simple = pforTranspose2d(N, a, b, gs); + + std::cout << gs + << ", " << to + << ", " << t1d_auto + << ", " << t1d_simple + << ", " << t2d_auto + << ", " << t2d_simple << std::endl; + } }); - #endif - return 0; } From ee924fd082ff248d090950d3c215cd38fa2b784f Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 21 Nov 2024 22:05:54 -0600 Subject: [PATCH 28/34] Removed unneeded temporaries --- .../parallel_for_transpose_partitioners.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp index 734bbd752b..323a43de3b 100644 --- a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp +++ b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp @@ -64,8 +64,7 @@ double pforTranspose(int N, double *a, double *b, int gs) { tbb::tick_count t0 = tbb::tick_count::now(); tbb::parallel_for( tbb::blocked_range(0, N, gs), [N, a, b](const tbb::blocked_range& r) { - int ie = r.end(); - for (int i = r.begin(); i < ie; ++i) { + for (int i = r.begin(); i < r.end(); ++i) { for (int j = 0; j < N; ++j) { b[j*N+i] = a[i*N+j]; } @@ -80,13 +79,11 @@ template double pforTranspose2d(int N, double *a, double *b, int gs) { tbb::tick_count t0 = tbb::tick_count::now(); tbb::parallel_for( tbb::blocked_range2d{ - 0, N, static_cast(gs), 0, - N, static_cast(gs)}, + 0, N, static_cast(gs), + 0, N, static_cast(gs)}, [N, a, b](const tbb::blocked_range2d& r) { - int ie = r.rows().end(); - int je = r.cols().end(); - for (int i = r.rows().begin(); i < ie; ++i) { - for (int j = r.cols().begin(); j < je; ++j) { + for (int i = r.rows().begin(); i < r.rows().end(); ++i) { + for (int j = r.cols().begin(); j < r.cols().end(); ++j) { b[j*N+i] = a[i*N+j]; } } From 51ad39e5b5f16ca58176d8cab8ded748214c9fa1 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Thu, 21 Nov 2024 23:43:45 -0600 Subject: [PATCH 29/34] Added arena to addition sample --- .../parallel_for_addition_partitioners.cpp | 101 +++++++++--------- 1 file changed, 51 insertions(+), 50 deletions(-) diff --git a/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp index 49333b9bc6..eb8001cc8b 100644 --- a/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp +++ b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp @@ -48,61 +48,62 @@ void resetA(int N, double *a); static void warmupTBB(); int main(int argc, char *argv[]) { - int M = 10000; - int N = 100000; - - std::cout << "P = " << tbb::info::default_concurrency() - << std::endl << "N = " << N - << std::endl << "M = " << M << std::endl; - - #define CONSTRAIN_TO_FEWER_CORES 0 - #if CONSTRAIN_TO_FEWER_CORES + // use the most performance codes + // only a single NUMA node + // and only 1 thread per core tbb::task_arena::constraints c; - c.set_max_concurrency(tbb::info::default_concurrency() - 2); - tbb::task_arena cores_arena(c); - std::cout << "Using arena with " << cores_arena.max_concurrency() << " slots\n"; - cores_arena.execute([&]() { - #endif - - double *v = new double[M]; - double *a = new double[N]; - - warmupTBB(); - resetV(M, v); - resetA(N, a); - tbb::tick_count t0 = tbb::tick_count::now(); - for (int i = 0; i < M; ++i) { - parForAdd(v[i], N, a, tbb::auto_partitioner{}); - } - double auto_time = (tbb::tick_count::now() - t0).seconds(); - - warmupTBB(); - resetA(N, a); - tbb::affinity_partitioner aff_p; - t0 = tbb::tick_count::now(); - for (int i = 0; i < M; ++i) { - parForAdd(v[i], N, a, aff_p); - } - double affinity_time = (tbb::tick_count::now() - t0).seconds(); - - warmupTBB(); - resetA(N, a); - t0 = tbb::tick_count::now(); - for (int i = 0; i < M; ++i) { - parForAdd(v[i], N, a, tbb::static_partitioner{}); - } - double static_time = (tbb::tick_count::now() - t0).seconds(); + c.set_numa_id(tbb::info::numa_nodes()[0]); + c.set_core_type(tbb::info::core_types().back()); + c.set_max_threads_per_core(1); + c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c))); + tbb::task_arena arena(c); + + std::cout << "Using an arena with " << arena.max_concurrency() << " slots\n"; + + arena.execute([&]() { + int M = 10000; + int N = 100000; + + std::cout << "P = " << tbb::info::default_concurrency() + << std::endl << "N = " << N + << std::endl << "M = " << M << std::endl; + + double *v = new double[M]; + double *a = new double[N]; + + warmupTBB(); + resetV(M, v); + resetA(N, a); + tbb::tick_count t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + parForAdd(v[i], N, a, tbb::auto_partitioner{}); + } + double auto_time = (tbb::tick_count::now() - t0).seconds(); + + warmupTBB(); + resetA(N, a); + tbb::affinity_partitioner aff_p; + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + parForAdd(v[i], N, a, aff_p); + } + double affinity_time = (tbb::tick_count::now() - t0).seconds(); - std::cout << "auto_partitioner = " << auto_time << std::endl - << "affinity_partitioner = " << affinity_time << std::endl - << "static_partitioner = " << static_time << std::endl; + warmupTBB(); + resetA(N, a); + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + parForAdd(v[i], N, a, tbb::static_partitioner{}); + } + double static_time = (tbb::tick_count::now() - t0).seconds(); - delete [] v; - delete [] a; + std::cout << "auto_partitioner = " << auto_time << std::endl + << "affinity_partitioner = " << affinity_time << std::endl + << "static_partitioner = " << static_time << std::endl; - #if CONSTRAIN_TO_FEWER_CORES + delete [] v; + delete [] a; }); - #endif return 0; } From 67de46a8bb8c59483fb14def122431c30e4be568 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Fri, 22 Nov 2024 00:18:35 -0600 Subject: [PATCH 30/34] Added imbalanced loop example --- .../performance_tuning/CMakeLists.txt | 1 + .../parallel_for_addition_partitioners.cpp | 6 +- .../partitioners_imbalanced_loops.cpp | 111 ++++++++++++++++++ 3 files changed, 114 insertions(+), 4 deletions(-) create mode 100644 new_examples/performance_tuning/partitioners_imbalanced_loops.cpp diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt index e5f7c92da6..02edeab578 100644 --- a/new_examples/performance_tuning/CMakeLists.txt +++ b/new_examples/performance_tuning/CMakeLists.txt @@ -16,6 +16,7 @@ foreach(tpp blocked_ranges_trivial.cpp parallel_for_partitioners.cpp parallel_for_partitioners_timed.cpp parallel_for_transpose_partitioners.cpp + partitioners_imbalanced_loops.cpp priorities_and_conflict.cpp task_scheduler_observer.cpp) string(REPLACE ".cpp" "" texe ${tpp}) diff --git a/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp index eb8001cc8b..30f995e269 100644 --- a/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp +++ b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp @@ -23,8 +23,7 @@ template void parForAdd(double v, int N, double *a, const Partitioner& p) { tbb::parallel_for( tbb::blocked_range(0, N, 1), [v, a](const tbb::blocked_range& r) { - int ie = r.end(); - for (int i = r.begin(); i < ie; ++i) { + for (int i = r.begin(); i < r.end(); ++i) { a[i] += v; } }, p @@ -35,8 +34,7 @@ template void parForAdd(double v, int N, double *a, Partitioner& p) { tbb::parallel_for( tbb::blocked_range(0, N, 1), [v, a](const tbb::blocked_range& r) { - int ie = r.end(); - for (int i = r.begin(); i < ie; ++i) { + for (int i = r.begin(); i < r.end(); ++i) { a[i] += v; } }, p diff --git a/new_examples/performance_tuning/partitioners_imbalanced_loops.cpp b/new_examples/performance_tuning/partitioners_imbalanced_loops.cpp new file mode 100644 index 0000000000..bcedeba6df --- /dev/null +++ b/new_examples/performance_tuning/partitioners_imbalanced_loops.cpp @@ -0,0 +1,111 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include + +void doWork(double sec); + +template +void buildingWork(int N, const Partitioner& p) { + tbb::parallel_for( tbb::blocked_range(0, N, 1), + [](const tbb::blocked_range& r) { + for (int i = r.begin(); i < r.end(); ++i) { + doWork(i); + } + }, p + ); +} + +template +void buildingWork(int N, Partitioner& p) { + tbb::parallel_for( tbb::blocked_range(0, N, 1), + [](const tbb::blocked_range& r) { + for (int i = r.begin(); i < r.end(); ++i) { + doWork(i); + } + }, p + ); +} + +static void warmupTBB() { + // This is a simple loop that should get workers started. + // oneTBB creates workers lazily on first use of the library + // so this hides the startup time when looking at trivial + // examples that do little real work. + tbb::parallel_for(0, tbb::info::default_concurrency(), + [=](int) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + } + ); +} + +void doWork(double usec) { + double sec = usec*1e-06; + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() <= sec); +} + +int main(int argc, char *argv[]) { + // use the most performance codes + // only a single NUMA node + // and only 1 thread per core + tbb::task_arena::constraints c; + c.set_numa_id(tbb::info::numa_nodes()[0]); + c.set_core_type(tbb::info::core_types().back()); + c.set_max_threads_per_core(1); + c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c))); + tbb::task_arena a(c); + + std::cout << "Using an arena with " << a.max_concurrency() << " slots\n"; + + a.execute([&]() { + int N = 1000; + int M = 10; + + std::cout << std::endl << "M = " << M + << std::endl << "N = " << N << std::endl; + + warmupTBB(); + tbb::tick_count t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + buildingWork(N, tbb::auto_partitioner{}); + } + double auto_time = (tbb::tick_count::now() - t0).seconds(); + + warmupTBB(); + tbb::affinity_partitioner aff_p; + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + buildingWork(N, aff_p); + } + double affinity_time = (tbb::tick_count::now() - t0).seconds(); + + warmupTBB(); + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + buildingWork(N, tbb::static_partitioner{}); + } + double static_time = (tbb::tick_count::now() - t0).seconds(); + + std::cout << "auto_partitioner = " << auto_time << " seconds" << std::endl + << "affinity_partitioner = " << affinity_time << " seconds" << std::endl + << "static_partitioner = " << static_time << " seconds" << std::endl; + }); + return 0; +} + From dae6cd69320e1183186ddd7b87e8808a08f35225 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Fri, 22 Nov 2024 01:02:25 -0600 Subject: [PATCH 31/34] Added deterministic reduce --- .../performance_tuning/CMakeLists.txt | 1 + ...llel_deterministic_reduce_partitioners.cpp | 136 ++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt index 02edeab578..81e902a417 100644 --- a/new_examples/performance_tuning/CMakeLists.txt +++ b/new_examples/performance_tuning/CMakeLists.txt @@ -13,6 +13,7 @@ foreach(tpp blocked_ranges_trivial.cpp global_control_and_implicit_arena.cpp global_control_and_implicit_conflict.cpp parallel_for_addition_partitioners.cpp + parallel_deterministic_reduce_partitioners.cpp parallel_for_partitioners.cpp parallel_for_partitioners_timed.cpp parallel_for_transpose_partitioners.cpp diff --git a/new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp b/new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp new file mode 100644 index 0000000000..bef0d67a40 --- /dev/null +++ b/new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp @@ -0,0 +1,136 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include +#include + +double serialPiExample(int num_intervals) { + double dx = 1.0 / num_intervals; + double sum = 0.0; + for (int i = 0; i < num_intervals; ++i) { + double x = (i+0.5)*dx; + double h = sqrt(1-x*x); + sum += h*dx; + } + return 4 * sum; +} + +template< typename Partitioner > +double reducePiExample(int num_intervals, int grainsize) { + double dx = 1.0 / num_intervals; + double sum = tbb::parallel_reduce( + /* range = */ tbb::blocked_range(0, num_intervals, grainsize), + /* idenity = */ 0.0, + /* func */ + [=](const tbb::blocked_range& r, double init) -> double { + for (int i = r.begin(); i != r.end(); ++i) { + double x = (i + 0.5)*dx; + double h = sqrt(1 - x*x); + init += h*dx; + } + return init; + }, + /* reduction */ + [](double x, double y) -> double { + return x + y; + }, + /* partitioner */ Partitioner() + ); + return 4 * sum; +} + +template< typename Partitioner > +double deterministicReducePiExample(int num_intervals, int grainsize) { + double dx = 1.0 / num_intervals; + double sum = tbb::parallel_deterministic_reduce( + /* range = */ tbb::blocked_range(0, num_intervals, grainsize), + /* identity = */ 0.0, + /* func */ + [=](const tbb::blocked_range& r, double init) -> double { + for (int i = r.begin(); i != r.end(); ++i) { + double x = (i + 0.5)*dx; + double h = sqrt(1 - x*x); + init += h*dx; + } + return init; + }, + /* reduction */ + [](double x, double y) -> double { + return x + y; + }, + /* partitioner */ Partitioner() + ); + return 4 * sum; +} + +static void warmupTBB() { + tbb::parallel_for(0, tbb::info::default_concurrency(), [](int) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + }); +} + +int main() { + // use the most performance codes + // only a single NUMA node + // and only 1 thread per core + tbb::task_arena::constraints c; + c.set_numa_id(tbb::info::numa_nodes()[0]); + c.set_core_type(tbb::info::core_types().back()); + c.set_max_threads_per_core(1); + c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c))); + tbb::task_arena a(c); + + std::cout << "Using an arena with " << a.max_concurrency() << " slots\n"; + + a.execute([&]() { + int num_intervals = 1<<26; + tbb::tick_count ts_0 = tbb::tick_count::now(); + double spi = serialPiExample(num_intervals); + tbb::tick_count ts_1 = tbb::tick_count::now(); + double serial_time = (ts_1 - ts_0).seconds(); + std::cout << "serial, " << spi << ", " << serial_time << std::endl; + warmupTBB(); + std::cout << "speedups relative to serial:" << std::endl; + std::cout << "gs, r-simple, d-simple, r-static, d-static, r-auto" << std::endl; + for (int gs = 1; gs <= num_intervals; gs *= 2) { + reducePiExample(num_intervals, gs); + tbb::tick_count t0 = tbb::tick_count::now(); + double v0 = reducePiExample(num_intervals, gs); + tbb::tick_count t1 = tbb::tick_count::now(); + double v1 = reducePiExample(num_intervals, gs); + tbb::tick_count t2 = tbb::tick_count::now(); + double v2 = reducePiExample(num_intervals, gs); + tbb::tick_count t3 = tbb::tick_count::now(); + double v3 = deterministicReducePiExample(num_intervals, gs); + tbb::tick_count t4 = tbb::tick_count::now(); + double v4 = deterministicReducePiExample(num_intervals, gs); + tbb::tick_count t5 = tbb::tick_count::now(); + std::cout << v0 << ", " << v1 << ", " << v2 + << ", " << v3 << ", " << v4 << "\n"; + std::cout << gs + << ", " << serial_time / (t2-t1).seconds() + << ", " << serial_time / (t4-t3).seconds() + << ", " << serial_time / (t3-t2).seconds() + << ", " << serial_time / (t5-t4).seconds() + << ", " << serial_time / (t1-t0).seconds() + << std::endl; + } + }); + return 0; +} + From 80b4ede9c134b2e8bf5ea51309fadd2660573ce3 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Fri, 22 Nov 2024 19:22:25 -0600 Subject: [PATCH 32/34] Added pipeline benchmark --- .../performance_tuning/CMakeLists.txt | 1 + .../parallel_pipeline_timed.cpp | 198 ++++++++++++++++++ 2 files changed, 199 insertions(+) create mode 100644 new_examples/performance_tuning/parallel_pipeline_timed.cpp diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt index 81e902a417..a48b218af5 100644 --- a/new_examples/performance_tuning/CMakeLists.txt +++ b/new_examples/performance_tuning/CMakeLists.txt @@ -17,6 +17,7 @@ foreach(tpp blocked_ranges_trivial.cpp parallel_for_partitioners.cpp parallel_for_partitioners_timed.cpp parallel_for_transpose_partitioners.cpp + parallel_pipeline_timed.cpp partitioners_imbalanced_loops.cpp priorities_and_conflict.cpp task_scheduler_observer.cpp) diff --git a/new_examples/performance_tuning/parallel_pipeline_timed.cpp b/new_examples/performance_tuning/parallel_pipeline_timed.cpp new file mode 100644 index 0000000000..ae9d5dadbd --- /dev/null +++ b/new_examples/performance_tuning/parallel_pipeline_timed.cpp @@ -0,0 +1,198 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include +#include +#include + +void doWork(double sec) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() <= sec); +} + +tbb::filter makeMiddleFilters(int num_filters, tbb::filter_mode m, double usec) { + auto f = tbb::make_filter(m, + [=](int i) -> int { + doWork(usec); + return i; + }); + if (num_filters > 1) + return f & makeMiddleFilters(num_filters-1, m, usec); + else + return f; +} + +tbb::filter buildFilterChain(std::atomic& counter, std::vector& data, + int num_filters, tbb::filter_mode m, double usec) { + counter = data.size() - 1; + + tbb::filter middle{}; + tbb::filter end{}; + + if (num_filters > 1) { + tbb::filter start = + tbb::make_filter(m, + [&counter, &data, usec](tbb::flow_control& fc) -> int { + int i = counter--; + if (i > 0) { + doWork(usec); + return data[i]; + } else { + fc.stop(); + return 0; + } + }); + + + tbb::filter end = + tbb::make_filter(m, + [=](int i) { + doWork(usec); + }); + + if (num_filters > 2) { + tbb::filter middle = makeMiddleFilters(num_filters-2, m, usec); + return start & middle & end; + } else { + return start & end; + } + } else { + return tbb::make_filter(m, + [&](tbb::flow_control& fc) { + int i = counter--; + if (i > 0) { + doWork(usec); + } else { + fc.stop(); + } + }); + } +} + +double runSerial(std::vector& data, int num_filters, double usec, int num_items) { + tbb::tick_count t0 = tbb::tick_count::now(); + for (int i = 0; i < num_items; ++i) { + for (int j = 0; j < num_filters; ++j) { + doWork(usec); + } + } + double total_time = (tbb::tick_count::now() - t0).seconds(); + return total_time; +} + +std::vector makeVector(int N); +static void warmupTBB(); + +std::vector modes = {tbb::filter_mode::serial_in_order, + tbb::filter_mode::serial_out_of_order, + tbb::filter_mode::parallel }; +std::vector mode_name = {"serial_in_order", + "serial_out_of_order", + "parallel"}; + +std::vector spin_times = {1e-7, 1e-6, 1e-5, 1e-4}; + +void runEightFiltersOneTokenBalanced(int N, std::vector& data) { + int num_filters = 8; + + std::cout << "Varying the mode and spin_time for 8 filters:\n"; + + // Balanced serial time + std::vector serial_time; + for (double spin_time : spin_times) { + serial_time.push_back(runSerial(data, num_filters, spin_time, N)); + std::cout << serial_time.back() << ","; + } + + std::cout << "\nmode"; + for (double spin_time : spin_times) + std::cout << ", " << spin_time; + std::cout << std::endl; + + for (int m = 0; m < modes.size(); ++m) { + std::atomic counter = 0; + std::cout << mode_name[m]; + for (int st = 0; st < spin_times.size(); ++st) { + double spin_time = spin_times[st]; + + auto chain = + buildFilterChain(counter, data, num_filters, modes[m], spin_time); + + tbb::tick_count t0 = tbb::tick_count::now(); + warmupTBB(); + tbb::parallel_pipeline(1, chain); + double t = (tbb::tick_count::now() - t0).seconds(); + std::cout << ", " << serial_time[st]/t; + } + std::cout << std::endl; + + + } +} + +void runEightTokensIncreasingFiltersBalanced() { +} + +void runEightFiltersIncreasingTokensBalanced() { +} + +void runEightFiltersImbalanced() { +} + +int main() { + // use the most performance codes + // only a single NUMA node + // and only 1 thread per core + tbb::task_arena::constraints c; + c.set_numa_id(tbb::info::numa_nodes()[0]); + c.set_core_type(tbb::info::core_types().front()); + c.set_max_threads_per_core(1); + c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c))); + tbb::task_arena a(c); + + std::cout << "Using an arena with " << a.max_concurrency() << " slots\n"; + + a.execute([&]() { + int N = 8000; + std::vector data = makeVector(N); + runEightFiltersOneTokenBalanced(N, data); + }); + + return 0; +} + +std::vector makeVector(int N) { + std::vector v; + v.reserve(N); + for (int i = 0; i < N; ++i) { + v.emplace_back(i); + } + return v; +} + +static void warmupTBB() { + // This is a simple loop that should get workers started. + // oneTBB creates workers lazily on first use of the library + // so this hides the startup time when looking at trivial + // examples that do little real work. + tbb::parallel_for(0, tbb::this_task_arena::max_concurrency(), + [=](int) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + } + ); +} From a418fa2c045aec68d5079cdfc674401cb75ed4b3 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Fri, 22 Nov 2024 20:17:29 -0600 Subject: [PATCH 33/34] Added other pipeline test cases --- .../parallel_pipeline_timed.cpp | 136 +++++++++++++++--- 1 file changed, 120 insertions(+), 16 deletions(-) diff --git a/new_examples/performance_tuning/parallel_pipeline_timed.cpp b/new_examples/performance_tuning/parallel_pipeline_timed.cpp index ae9d5dadbd..5e66274627 100644 --- a/new_examples/performance_tuning/parallel_pipeline_timed.cpp +++ b/new_examples/performance_tuning/parallel_pipeline_timed.cpp @@ -24,24 +24,29 @@ void doWork(double sec) { while ((tbb::tick_count::now() - t0).seconds() <= sec); } -tbb::filter makeMiddleFilters(int num_filters, tbb::filter_mode m, double usec) { +tbb::filter makeMiddleFilters(int num_filters, tbb::filter_mode m, double usec, + int imbalanced_filter = -1, double imbalance_factor = 1) { + double w = (imbalanced_filter == num_filters) ? usec*imbalance_factor : usec; + auto f = tbb::make_filter(m, [=](int i) -> int { - doWork(usec); + doWork(w); return i; }); if (num_filters > 1) - return f & makeMiddleFilters(num_filters-1, m, usec); + return f & makeMiddleFilters(num_filters-1, m, usec, + imbalanced_filter, imbalance_factor); else return f; } tbb::filter buildFilterChain(std::atomic& counter, std::vector& data, - int num_filters, tbb::filter_mode m, double usec) { + int num_filters, tbb::filter_mode m, double usec, + double imbalance_factor = 1) { counter = data.size() - 1; - tbb::filter middle{}; tbb::filter end{}; + int imbalanced_filter = (imbalance_factor == 1) ? -1 : num_filters - 3; if (num_filters > 1) { tbb::filter start = @@ -65,7 +70,7 @@ tbb::filter buildFilterChain(std::atomic& counter, std::vector 2) { - tbb::filter middle = makeMiddleFilters(num_filters-2, m, usec); + tbb::filter middle = makeMiddleFilters(num_filters-2, m, usec, imbalanced_filter, imbalance_factor); return start & middle & end; } else { return start & end; @@ -83,11 +88,16 @@ tbb::filter buildFilterChain(std::atomic& counter, std::vector& data, int num_filters, double usec, int num_items) { +double runSerial(std::vector& data, int num_filters, double usec, int num_items, + int imbalanced_filter = -1, double imbalance_factor = 1) { tbb::tick_count t0 = tbb::tick_count::now(); for (int i = 0; i < num_items; ++i) { for (int j = 0; j < num_filters; ++j) { - doWork(usec); + if (j == imbalanced_filter) { + doWork(usec*imbalance_factor); + } else { + doWork(usec); + } } } double total_time = (tbb::tick_count::now() - t0).seconds(); @@ -104,18 +114,18 @@ std::vector mode_name = {"serial_in_order", "serial_out_of_order", "parallel"}; -std::vector spin_times = {1e-7, 1e-6, 1e-5, 1e-4}; + void runEightFiltersOneTokenBalanced(int N, std::vector& data) { + std::vector spin_times = {1e-7, 1e-6, 1e-5, 1e-4}; int num_filters = 8; - std::cout << "Varying the mode and spin_time for 8 filters:\n"; + std::cout << "\nVarying the mode and spin_time for 8 filters:\n"; // Balanced serial time std::vector serial_time; for (double spin_time : spin_times) { serial_time.push_back(runSerial(data, num_filters, spin_time, N)); - std::cout << serial_time.back() << ","; } std::cout << "\nmode"; @@ -139,18 +149,109 @@ void runEightFiltersOneTokenBalanced(int N, std::vector& data) { std::cout << ", " << serial_time[st]/t; } std::cout << std::endl; + } +} +void runEightTokensIncreasingFilters(int N, std::vector& data) { + int max_threads = tbb::this_task_arena::max_concurrency(); + double spin_time = 0.0001; + std::cout << "\nVarying number of filters with 100us spin and 8 tokens:\n"; + + // Balanced serial time + std::vector serial_time; + for (int num_filters = 1; num_filters <= max_threads; ++num_filters) { + serial_time.push_back(runSerial(data, num_filters, spin_time, N)); } -} -void runEightTokensIncreasingFiltersBalanced() { + std::cout << "\nmode"; + for (int num_filters = 1; num_filters <= max_threads; ++num_filters) + std::cout << ", " << num_filters; + std::cout << std::endl; + + for (int m = 0; m < modes.size(); ++m) { + std::atomic counter = 0; + std::cout << mode_name[m]; + for (int num_filters = 1; num_filters <= max_threads; ++num_filters) { + auto chain = + buildFilterChain(counter, data, num_filters, modes[m], spin_time); + + tbb::tick_count t0 = tbb::tick_count::now(); + warmupTBB(); + tbb::parallel_pipeline(max_threads, chain); + double t = (tbb::tick_count::now() - t0).seconds(); + std::cout << ", " << serial_time[num_filters-1]/t; + } + std::cout << std::endl; + } } -void runEightFiltersIncreasingTokensBalanced() { +void runEightFiltersIncreasingTokens(int N, std::vector& data) { + int max_threads = tbb::this_task_arena::max_concurrency(); + int num_filters = 8; + double spin_time = 0.0001; + + std::cout << "\nVarying number of tokens with 8 filters and 100us spin:\n"; + + double serial_time = runSerial(data, num_filters, spin_time, N); + + std::cout << "\nmode"; + for (int num_tokens = 1; num_tokens <= 2*max_threads; ++num_tokens) + std::cout << ", " << num_tokens; + std::cout << std::endl; + + for (int m = 0; m < modes.size(); ++m) { + std::atomic counter = 0; + std::cout << mode_name[m]; + for (int num_tokens = 1; num_tokens <= 2*max_threads; ++num_tokens) { + auto chain = + buildFilterChain(counter, data, num_filters, modes[m], spin_time); + + tbb::tick_count t0 = tbb::tick_count::now(); + warmupTBB(); + tbb::parallel_pipeline(num_tokens, chain); + double t = (tbb::tick_count::now() - t0).seconds(); + std::cout << ", " << serial_time/t; + } + std::cout << std::endl; + } } -void runEightFiltersImbalanced() { +void runEightFiltersImbalanced(int N, std::vector& data) { + std::vector imbalance = {0.1, 0.5, 0.75, 1.5, 2, 10}; + int num_filters = 8; + int num_tokens = 8; + double spin_time = 0.0001; + + std::cout << "\nVarying imbalance of 1 of 8 filters:\n"; + + // Imbalanced serial time + std::vector serial_time; + for (double imb : imbalance) { + serial_time.push_back(runSerial(data, num_filters, spin_time, N, imb)); + } + + std::cout << "\nmode"; + for (double imb : imbalance) + std::cout << ", " << imb; + std::cout << std::endl; + + for (int m = 0; m < modes.size(); ++m) { + std::atomic counter = 0; + std::cout << mode_name[m]; + for (int imb = 0; imb < imbalance.size(); ++imb) { + double imbalance_factor = imbalance[imb]; + auto chain = + buildFilterChain(counter, data, num_filters, modes[m], spin_time, imbalance_factor); + + tbb::tick_count t0 = tbb::tick_count::now(); + warmupTBB(); + tbb::parallel_pipeline(num_tokens, chain); + double t = (tbb::tick_count::now() - t0).seconds(); + std::cout << ", " << serial_time[imb]/t; + } + std::cout << std::endl; + } } int main() { @@ -169,7 +270,10 @@ int main() { a.execute([&]() { int N = 8000; std::vector data = makeVector(N); - runEightFiltersOneTokenBalanced(N, data); + //runEightFiltersOneTokenBalanced(N, data); + //runEightTokensIncreasingFilters(N, data); + //runEightFiltersIncreasingTokens(N, data); + runEightFiltersImbalanced(N, data); }); return 0; From faf88cbdeefa570210710659acb446690128f9b0 Mon Sep 17 00:00:00 2001 From: Mike Voss Date: Wed, 4 Dec 2024 14:42:36 -0600 Subject: [PATCH 34/34] Added README for algorithms --- new_examples/algorithms/README.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 new_examples/algorithms/README.md diff --git a/new_examples/algorithms/README.md b/new_examples/algorithms/README.md new file mode 100644 index 0000000000..7cd9092e85 --- /dev/null +++ b/new_examples/algorithms/README.md @@ -0,0 +1,3 @@ +# Chapter 2: Algorithms + +This directory contains the examples for Chapter 2. \ No newline at end of file