From 16d87cb23e4102439ed1fbd3565ec147ed71fcc3 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 7 Nov 2024 18:34:34 -0600
Subject: [PATCH 01/34] Fixed copyrights

---
 new_examples/CMakeLists.txt                   |   2 +-
 .../performance_tuning/CMakeLists.txt         |  11 +-
 new_examples/performance_tuning/arena_trace.h | 167 +++++++++++++++
 .../blocked_ranges_trivial.cpp                |  43 +---
 .../performance_tuning/constraints.cpp        | 122 +++++++++++
 .../global_control_and_explicit_arena.cpp     |   1 -
 .../parallel_for_addition_partitioners.cpp    | 135 ++++++++++++
 .../parallel_for_spin_partitioners.cpp        | 101 +++++++++
 .../parallel_for_spin_partitioners_timed.cpp  | 117 +++++++++++
 .../parallel_for_transpose_partitioners.cpp   | 194 ++++++++++++++++++
 10 files changed, 853 insertions(+), 40 deletions(-)
 create mode 100644 new_examples/performance_tuning/arena_trace.h
 create mode 100644 new_examples/performance_tuning/constraints.cpp
 create mode 100644 new_examples/performance_tuning/parallel_for_addition_partitioners.cpp
 create mode 100644 new_examples/performance_tuning/parallel_for_spin_partitioners.cpp
 create mode 100644 new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp
 create mode 100644 new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
diff --git a/new_examples/CMakeLists.txt b/new_examples/CMakeLists.txt
index c2d8215fd5..6651073a6e 100644
--- a/new_examples/CMakeLists.txt
+++ b/new_examples/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required (VERSION 3.4)
 
 project(tbb_tutorials LANGUAGES CXX)
 
-set(CMAKE_CXX_COMPILER "icpx")
+set(CMAKE_CXX_COMPILER "icx-cl")
 set(CMAKE_LINKER "icpx")
 set(CMAKE_CXX_STANDARD 20)
 
diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt
index ee9b133a2a..ff880d251b 100644
--- a/new_examples/performance_tuning/CMakeLists.txt
+++ b/new_examples/performance_tuning/CMakeLists.txt
@@ -1,12 +1,17 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb -pthread")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb -pthread")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb")
+set(CMAKE_CXX_LINKER_FLAGS "-Qtbb")
 
 foreach(tpp global_control_and_implicit_arena.cpp 
             global_control_and_explicit_arena.cpp 
             global_control_and_implicit_conflict.cpp 
             global_control_and_explicit_conflict.cpp 
             priorities_and_conflict.cpp 
-            blocked_ranges_trivial.cpp)
+            blocked_ranges_trivial.cpp
+            constraints.cpp
+            parallel_for_spin_partitioners.cpp
+            parallel_for_spin_partitioners_timed.cpp
+            parallel_for_addition_partitioners.cpp
+            parallel_for_transpose_partitioners.cpp)
   string(REPLACE ".cpp" "" texe ${tpp})
   add_executable(${texe} ${tpp})
   target_include_directories(${texe} PUBLIC
diff --git a/new_examples/performance_tuning/arena_trace.h b/new_examples/performance_tuning/arena_trace.h
new file mode 100644
index 0000000000..2a680ed8e5
--- /dev/null
+++ b/new_examples/performance_tuning/arena_trace.h
@@ -0,0 +1,167 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#ifndef ARENA_TRACE_H
+#define ARENA_TRACE_H 1
+
+#include <tbb/combinable.h>
+#include <tbb/enumerable_thread_specific.h>
+#include <tbb/task_scheduler_observer.h>
+#include <algorithm>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <unordered_set>
+
+// Records when threads enter and leave the observed TBB task arenas and, when
+// destroyed, writes the collected events to a JSON file.
+class arena_tracer {
+public:
+  // name: path of the JSON file written by the destructor.
+  arena_tracer(const std::string& name) : m_t0(std::chrono::steady_clock::now()), m_name(name)  { }
+  ~arena_tracer() { dump_trace(); }
+
+  // Attaches a new, active observer to arena a; n labels the arena in the trace.
+  void add_arena(const std::string& n, tbb::task_arena& a) {
+    m_observers.local().emplace_back( std::make_shared<tracing_observer>(m_events, m_arena_id++, m_t0, n, a ));
+  }
+
+private:
+  using time_point_type = std::chrono::time_point<std::chrono::steady_clock>;
+  time_point_type m_t0;            // trace origin; event timestamps are offsets from it
+  const std::string m_name;        // output file path
+  std::atomic<int> m_arena_id = 0; // next arena id; written as "pid" in the trace
+
+  // One scheduler entry ('B') or exit ('E') of a thread.
+  struct trace_event {
+    time_point_type t;
+    char ph;
+    bool is_worker;
+    std::thread::id tid;
+    int slot;
+    int pid;
+    trace_event( time_point_type t_, char ph_,
+                 bool is_worker_, std::thread::id tid_,
+                 int slot_, int pid_ ) : t(t_), ph(ph_), is_worker(is_worker_), tid(tid_), slot(slot_), pid(pid_) {}
+  };
+
+  using ets_vector_type = std::vector<trace_event>;
+  using ets_value_type = ets_vector_type;
+  using ets_type = tbb::combinable<ets_value_type>;
+  ets_type m_events; // per-thread event lists, merged in dump_trace()
+
+  // Observer that appends an event to the calling thread's list on every entry/exit.
+  class tracing_observer : public oneapi::tbb::task_scheduler_observer {
+    ets_type& m_events;
+    int m_arena_id;
+    time_point_type m_t;
+    const std::string m_name;
+
+    // Appends one event with phase ph for the calling thread.
+    void record( char ph, bool worker ) {
+      m_events.local().emplace_back(
+        std::chrono::steady_clock::now(), ph, worker, std::this_thread::get_id(),
+        oneapi::tbb::this_task_arena::current_thread_index(), m_arena_id );
+    }
+
+  public:
+    // Binds the observer to arena a and starts observing immediately.
+    tracing_observer( ets_type& e, int arena_id, time_point_type t0,
+                      const std::string& fn, oneapi::tbb::task_arena &a )
+        : oneapi::tbb::task_scheduler_observer(a), m_events(e), m_arena_id(arena_id), m_t(t0), m_name(fn) {
+        observe(true); // activate the observer
+    }
+
+    ~tracing_observer() { }
+
+    int get_arena_id() { return m_arena_id; }
+    const std::string& get_arena_name() { return m_name; }
+
+    void on_scheduler_entry( bool worker ) override { record('B', worker); }
+    void on_scheduler_exit( bool worker ) override { record('E', worker); }
+  };
+
+  // Stops all observers, then writes arena metadata and the time-sorted events as JSON.
+  void dump_trace() {
+    // Deactivate the observers first so that no thread is still appending to
+    // m_events while the per-thread lists are merged below.
+    for (auto& v : m_observers.range())
+      for (auto& obs : v) obs->observe(false);
+
+    std::unordered_set<std::thread::id> was_seen;
+
+    std::ofstream out(m_name);
+    out << "[";
+
+    // One "process_name" metadata record per observed arena.
+    bool first = true;
+    for (auto& v : m_observers.range()) {
+      for (auto& obs : v) {
+        if (!first) out << ",";
+        first = false;
+        out << "\n{\"name\": \"process_name\", \"ph\" : \"M\""
+            << ", \"pid\" : " << obs->get_arena_id()
+            << ", \"args\" : { \"name\" : \"" << obs->get_arena_name() << "\" }"
+            << "}";
+      }
+    }
+
+    ets_vector_type r;
+    m_events.combine_each(
+      [&r](const ets_vector_type& v) {
+        r.insert(r.end(), v.begin(), v.end());
+      });
+
+    std::sort(r.begin(), r.end(), [](const trace_event& a, const trace_event& b) { return a.t < b.t; });
+
+    for (auto& e : r) {
+      const auto ts = std::chrono::duration_cast<std::chrono::microseconds>(e.t-m_t0).count();
+      out << ",\n{"
+          <<   "\"name\": \"" << e.tid << "\""
+          << ", \"cat\": \"arena\""
+          << ", \"ph\": \"" << e.ph << "\""
+          << ", \"ts\": " << ts
+          << ", \"pid\": " << e.pid
+          << ", \"tid\": " << e.slot
+          << "}";
+      if (e.ph == 'B') {
+        // A thread's first entry is written with ph "s"; its later entries with ph "t".
+        out << ",\n{"
+            << "\"name\": \"flow\""
+            << ", \"id\": " << e.tid
+            << ", \"cat\": \"arena\""
+            << ", \"ts\": " << ts
+            << ", \"pid\": " << e.pid
+            << ", \"tid\": " << e.slot;
+        if (was_seen.find(e.tid) != was_seen.end()) {
+          out << ", \"ph\": \"t\", \"bp\": \"e\" }";
+        } else {
+          was_seen.insert(e.tid);
+          out << ", \"ph\": \"s\", \"bp\": \"e\" }";
+        }
+      }
+    }
+
+    out << "\n]\n";
+  }
+
+  // Observers created by add_arena(). Declared last, so they are destroyed
+  // before m_events, which each of them references.
+  tbb::enumerable_thread_specific<std::vector<std::shared_ptr<tracing_observer>>> m_observers;
+};
+
+#endif
diff --git a/new_examples/performance_tuning/blocked_ranges_trivial.cpp b/new_examples/performance_tuning/blocked_ranges_trivial.cpp
index 8515b1ce57..145f795575 100644
--- a/new_examples/performance_tuning/blocked_ranges_trivial.cpp
+++ b/new_examples/performance_tuning/blocked_ranges_trivial.cpp
@@ -16,9 +16,7 @@
 
 #include <cstdio>
 #include <vector>
-
-#define TBB_PREVIEW_BLOCKED_RANGE_ND 1
-#include <tbb/tbb.h>
+#include "tbb/tbb.h"
 
 double f(double v);
 
@@ -64,26 +62,10 @@ void example3d(int Z, int P, int N, int M, double* a) {
   );
 }
 
-void exampleNd(int Z, int P, int N, int M, double* a) {
-  tbb::parallel_for(tbb::blocked_rangeNd<int,4>{{0,Z},{0,P},{0,N}, {0,M}}, 
-    [=](const tbb::blocked_rangeNd<int,4>& r_ijkl) {
-      const auto& r_i = r_ijkl.dim(0);
-      const auto& r_j = r_ijkl.dim(1);
-      const auto& r_k = r_ijkl.dim(2);
-      const auto& r_l = r_ijkl.dim(3);
-      for (int i = r_i.begin(); i < r_i.end(); ++i)
-        for (int j = r_j.begin(); j < r_j.end(); ++j)
-          for (int k = r_k.begin(); k < r_k.end(); ++k) 
-            for (int l = r_l.begin(); l < r_l.end(); ++l) 
-              a[i*P*N*M+j*N*M+k*M+l] = f(a[i*P*N*M+j*N*M+k*M+l]);
-    }
-  );
-}
-
 #include <iostream>
 
 static void warmupTBB();
-static bool resultsAreValid(int, const double*, const double*, const double*, const double*, const double*);
+static bool resultsAreValid(int, const double*, const double*, const double*, const double*);
 static void serialDouble(int, double*);
 
 int main() {
@@ -98,9 +80,9 @@ int main() {
   double* a1 = new double[Size];
   double* a2 = new double[Size];
   double* a3 = new double[Size];
-  double* aN = new double[Size];
+
   for (int i = 0; i < Size; ++i)
-    a0[i] = a1[i] = a2[i] = a3[i] = aN[i] = 1.0;
+    a0[i] = a1[i] = a2[i] = a3[i] = 1.0;
 
   // Perform serial double
   tbb::tick_count t0 = tbb::tick_count::now();
@@ -127,21 +109,14 @@ int main() {
   double tbb3d_time = (tbb::tick_count::now() - t0).seconds();
   std::printf("3d done\n");
   
-  t0 = tbb::tick_count::now();
-  exampleNd(Z, P, N, M, aN);
-  double tbbNd_time = (tbb::tick_count::now() - t0).seconds();
-  std::printf("Nd done\n");
-
-  if (resultsAreValid(Size, a0, a1, a2, a3, aN)) {
+  if (resultsAreValid(Size, a0, a1, a2, a3)) {
     std::cout << "serial_time == " << serial_time << " seconds\n"
               << "tbb1d_time == " << tbb1d_time << " seconds\n"
               << "speedup == " << serial_time/tbb1d_time << "\n"
               << "tbb2d_time == " << tbb2d_time << " seconds\n"
               << "speedup == " << serial_time/tbb2d_time << "\n"
               << "tbb3d_time == " << tbb3d_time << " seconds\n"
-              << "speedup == " << serial_time/tbb3d_time << "\n"
-              << "tbbNd_time == " << tbbNd_time << " seconds\n"
-              << "speedup == " << serial_time/tbbNd_time << "\n";
+              << "speedup == " << serial_time/tbb3d_time << "\n";
     return 0;
   } else {
     std::cout << "ERROR: invalid results!\n";
@@ -181,14 +156,12 @@ double f(double v) {
 
 static bool resultsAreValid(int N, 
                             const double* a0, const double* a1, 
-                            const double* a2, const double* a3,
-                            const double* aN) {
+                            const double* a2, const double* a3) {
   for (int i = 0; i < N; ++i) {
     if (a0[i] != 2.0 
         || a1[i] != 2.0
         || a2[i] != 2.0
-        || a3[i] != 2.0
-        || aN[i] != 2.0) {
+        || a3[i] != 2.0) {
       std::printf("%d: %f, %f, %f, %f\n", i, a0[i], a1[i], a2[i], a3[i]); 
       std::cerr << "Invalid results" << std::endl;
       return false;
diff --git a/new_examples/performance_tuning/constraints.cpp b/new_examples/performance_tuning/constraints.cpp
new file mode 100644
index 0000000000..42800c6205
--- /dev/null
+++ b/new_examples/performance_tuning/constraints.cpp
@@ -0,0 +1,122 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include "tbb/tbb.h"
+#include <vector>
+
+#define USE_ARENA_TRACE 1
+#if USE_ARENA_TRACE
+#include "arena_trace.h"
+#endif
+
+int N = 1000;       // iteration count of every parallel_for in this file
+double w = 0.01;    // argument passed to f() by every loop body
+double f(double v); // defined at the end of this file
+
+// Creates one arena per NUMA node, starts the loop in each, then waits for all of them.
+void constrain_for_numa_nodes() {
+#if USE_ARENA_TRACE
+    arena_tracer t{"numa_trace.json"};
+#endif
+    std::vector<tbb::numa_node_id> numa_nodes = tbb::info::numa_nodes();
+    std::vector<tbb::task_arena> arenas(numa_nodes.size());
+    std::vector<tbb::task_group> task_groups(numa_nodes.size());
+
+    for (std::size_t i = 0; i < numa_nodes.size(); i++) {  // constrain arena i to node i
+        arenas[i].initialize(tbb::task_arena::constraints(numa_nodes[i]), 0);
+        #if USE_ARENA_TRACE
+        t.add_arena(std::to_string(i), arenas[i]);
+        #endif
+    }
+    for (std::size_t i = 0; i < numa_nodes.size(); i++) {  // hand the loop to group i inside arena i
+        arenas[i].execute([&task_groups, i] {
+            task_groups[i].run([] {
+                tbb::parallel_for(0, N, [](int) { f(w); });
+            });
+        });
+    }
+    for (std::size_t i = 0; i < numa_nodes.size(); i++) {  // wait for group i from inside arena i
+        arenas[i].execute([&task_groups, i] {
+            task_groups[i].wait();
+        });
+    }
+}
+
+// Runs the loop in an arena restricted to the last core type reported by
+// tbb::info::core_types().
+void constrain_for_core_type() {
+    const auto available_types = tbb::info::core_types();
+    tbb::task_arena::constraints core_limit;
+    core_limit.set_core_type(available_types.back());
+    tbb::task_arena arena(core_limit);
+
+    #if USE_ARENA_TRACE
+      arena_tracer t{"core_trace.json"};
+      t.add_arena("pcores", arena);
+    #endif
+
+    auto spin_loop = [] { tbb::parallel_for(0, N, [](int) { f(w); }); };
+    arena.execute(spin_loop);
+}
+
+// Runs the loop in an arena limited to the last reported core type with at
+// most one thread per core.
+void constrain_for_no_hyperthreading() {
+    const auto available_types = tbb::info::core_types();
+    tbb::task_arena no_ht_arena(
+        tbb::task_arena::constraints{}
+            .set_core_type(available_types.back())
+            .set_max_threads_per_core(1));
+    #if USE_ARENA_TRACE
+      arena_tracer t{"no_ht_constraints_trace.json"};
+      t.add_arena("no_ht_arena", no_ht_arena);
+    #endif
+
+    auto spin_loop = [] { tbb::parallel_for(0, N, [](int) { f(w); }); };
+    no_ht_arena.execute(spin_loop);
+}
+
+// Asks TBB for the default concurrency under a one-thread-per-core constraint
+// and runs the loop in an arena created with just that many slots.
+void limit_concurrency_for_no_hyperthreading() {
+    const auto available_types = tbb::info::core_types();
+    tbb::task_arena::constraints one_per_core;
+    one_per_core.set_core_type(available_types.back()).set_max_threads_per_core(1);
+    const int slot_count = tbb::info::default_concurrency(one_per_core);
+    tbb::task_arena arena( slot_count );
+
+    #if USE_ARENA_TRACE
+      arena_tracer t{"no_ht_concurrency_trace.json"};
+      t.add_arena("no_ht_concurrency", arena);
+    #endif
+
+    auto spin_loop = [] { tbb::parallel_for(0, N, [](int) { f(w); }); };
+    arena.execute(spin_loop);
+}
+
+// Runs each constraint demo once, in the order listed.
+int main() {
+  void (*const demos[])() = { constrain_for_numa_nodes, constrain_for_core_type,
+    constrain_for_no_hyperthreading, limit_concurrency_for_no_hyperthreading };
+  for (auto demo : demos) demo();
+  return 0;
+}
+
+double f(double v) {  // busy-waits for about 0.01 seconds, then returns twice v
+  const auto spin_start = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - spin_start).seconds() < 0.01) { /* spin */ }
+  return 2*v;
+}
diff --git a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp
index 3bf5e90c14..93a8520b7e 100644
--- a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp
+++ b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp
@@ -65,7 +65,6 @@ void clearParticipation() {
 }
 
 void dumpParticipation(int p) {
-  int end = next_tid;
   int sum = tid_participation[0];
   std::cout << "[" << tid_participation[0];
   for (int i = 1; i < p; ++i) {
diff --git a/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp
new file mode 100644
index 0000000000..49333b9bc6
--- /dev/null
+++ b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp
@@ -0,0 +1,135 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+#include <tbb/tbb.h>
+
+// Adds v to every element of a[0..N) in parallel; this overload takes the
+// partitioner by const reference.
+template <typename Partitioner>
+void parForAdd(double v, int N, double *a, const Partitioner& p) {
+  using range_t = tbb::blocked_range<int>;
+  const auto add_to_chunk = [v, a](const range_t& chunk) {
+    double *const last = a + chunk.end();
+    for (double *elem = a + chunk.begin(); elem < last; ++elem) *elem += v;
+  };
+  tbb::parallel_for(range_t(0, N, 1), add_to_chunk, p);
+}
+
+// Adds v to every element of a[0..N) in parallel; this overload takes the
+// partitioner by non-const reference.
+template <typename Partitioner>
+void parForAdd(double v, int N, double *a, Partitioner& p) {
+  const tbb::blocked_range<int> whole(0, N, 1);
+  tbb::parallel_for(whole, [v, a](const tbb::blocked_range<int>& part) {
+    int idx = part.begin();
+    const int stop = part.end();
+    while (idx < stop) a[idx++] += v;
+  }, p);
+}
+
+void resetV(int N, double *v);
+void resetA(int N, double *a);
+static void warmupTBB();
+
+// Times M back-to-back parallel additions over an N-element array with the
+// auto, affinity and static partitioners and prints the three totals.
+int main(int argc, char *argv[]) {
+  int M = 10000;
+  int N = 100000;
+  std::cout << "P = " << tbb::info::default_concurrency()
+            << std::endl << "N = " << N
+            << std::endl << "M = " << M << std::endl;
+
+  #define CONSTRAIN_TO_FEWER_CORES 0
+  #if CONSTRAIN_TO_FEWER_CORES
+  tbb::task_arena::constraints c;
+  // Leave two hardware threads free, but never request fewer than one slot.
+  c.set_max_concurrency(std::max(1, tbb::info::default_concurrency() - 2));
+  tbb::task_arena cores_arena(c);
+  std::cout << "Using arena with " << cores_arena.max_concurrency() << " slots\n";
+  cores_arena.execute([&]() {
+  #endif
+
+  double *v = new double[M];
+  double *a = new double[N];
+
+  warmupTBB();
+  resetV(M, v);
+  resetA(N, a);
+  tbb::tick_count t0 = tbb::tick_count::now();
+  for (int i = 0; i < M; ++i) {
+    parForAdd(v[i], N, a, tbb::auto_partitioner{});
+  }
+  double auto_time = (tbb::tick_count::now() - t0).seconds();
+
+  warmupTBB();
+  resetA(N, a);
+  tbb::affinity_partitioner aff_p;  // the same object is passed to all M loops
+  t0 = tbb::tick_count::now();
+  for (int i = 0; i < M; ++i) {
+    parForAdd(v[i], N, a, aff_p);
+  }
+  double affinity_time = (tbb::tick_count::now() - t0).seconds();
+
+  warmupTBB();
+  resetA(N, a);
+  t0 = tbb::tick_count::now();
+  for (int i = 0; i < M; ++i) {
+    parForAdd(v[i], N, a, tbb::static_partitioner{});
+  }
+  double static_time = (tbb::tick_count::now() - t0).seconds();
+
+  std::cout << "auto_partitioner = " << auto_time << std::endl
+            << "affinity_partitioner = " << affinity_time << std::endl
+            << "static_partitioner = " << static_time << std::endl;
+
+  delete [] v;
+  delete [] a;
+  #if CONSTRAIN_TO_FEWER_CORES
+  });
+  #endif
+  return 0;
+}
+
+
+// Fills v[0..N) with 0, 1, ..., N-1 and then shuffles it using std::random_device.
+void resetV(int N, double *v) {
+  int next = 0;
+  while (next < N) { v[next] = next; ++next; }
+  std::shuffle(v, v + N, std::random_device{});
+}
+
+// Sets a[0..N) to zero; does nothing when N <= 0.
+void resetA(int N, double *a) {
+  // Walk backwards so the loop needs only one counter.
+  for (int left = N; left > 0; --left) a[left - 1] = 0;
+}
+
+// Runs a short spin loop with one iteration per default-concurrency slot, which
+// should get the worker threads started before anything is timed: oneTBB
+// creates workers lazily on first use of the library, and that start-up cost
+// would otherwise show up in trivial examples that do little real work.
+static void warmupTBB() {
+  const int iterations = tbb::info::default_concurrency();
+  const auto spin_10ms = [](int) {
+    const auto began = tbb::tick_count::now();
+    while ((tbb::tick_count::now() - began).seconds() < 0.01) { /* spin */ }
+  };
+  tbb::parallel_for(0, iterations, spin_10ms);
+}
\ No newline at end of file
diff --git a/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp b/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp
new file mode 100644
index 0000000000..5de939e84a
--- /dev/null
+++ b/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp
@@ -0,0 +1,101 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <iostream>
+#include <tbb/tbb.h>
+
+void doWork(double usec);  // busy-waits for usec microseconds; defined at the end of this file
+
+// Calls doWork(i) for every i in [0, N) in parallel; this overload takes the
+// partitioner by const reference.
+template <typename Partitioner>
+void pforWork(int N, const Partitioner& p) {
+  using range_t = tbb::blocked_range<int>;
+  const auto work_on_chunk = [](const range_t& chunk) {
+    int idx = chunk.begin();
+    for (const int stop = chunk.end(); idx < stop; ++idx) doWork(idx);
+  };
+  tbb::parallel_for(range_t(0, N, 1), work_on_chunk, p);
+}
+
+// Calls doWork(i) for every i in [0, N) in parallel; this overload takes the
+// partitioner by non-const reference.
+template <typename Partitioner>
+void pforWork(int N, Partitioner& p) {
+  const tbb::blocked_range<int> whole(0, N, 1);
+  tbb::parallel_for(whole, [](const tbb::blocked_range<int>& part) {
+    int idx = part.begin();
+    const int stop = part.end();
+    while (idx < stop) doWork(idx++);
+  }, p);
+}
+
+static void warmupTBB();
+
+// Runs the N-iteration spin loop M times with each of the auto, affinity and
+// static partitioners and prints the total time of every variant.
+int main(int argc, char *argv[]) {
+  const int N = 1000;
+  const int M = 10;
+  std::cout << "P = " << tbb::info::default_concurrency() << std::endl;
+  std::cout << "M = " << M << std::endl;
+  std::cout << "N = " << N << std::endl;
+
+  // Seconds elapsed since the given starting tick.
+  const auto seconds_since = [](const tbb::tick_count& start) {
+    return (tbb::tick_count::now() - start).seconds();
+  };
+
+  warmupTBB();
+  tbb::tick_count start = tbb::tick_count::now();
+  for (int rep = M; rep > 0; --rep) pforWork(N, tbb::auto_partitioner{});
+  const double auto_time = seconds_since(start);
+
+  warmupTBB();
+  tbb::affinity_partitioner aff_p;  // the same object is passed to all M loops
+  start = tbb::tick_count::now();
+  for (int rep = M; rep > 0; --rep) pforWork(N, aff_p);
+  const double affinity_time = seconds_since(start);
+
+  warmupTBB();
+  start = tbb::tick_count::now();
+  for (int rep = M; rep > 0; --rep) pforWork(N, tbb::static_partitioner{});
+  const double static_time = seconds_since(start);
+
+  std::cout << "auto_partitioner = " << auto_time << " seconds" << std::endl
+            << "affinity_partitioner = " << affinity_time << " seconds" << std::endl
+            << "static_partitioner = " << static_time << " seconds" << std::endl;
+  return 0;
+}
+
+// Runs a short spin loop with one iteration per default-concurrency slot, which
+// should get the worker threads started before anything is timed: oneTBB
+// creates workers lazily on first use of the library, and that start-up cost
+// would otherwise show up in trivial examples that do little real work.
+static void warmupTBB() {
+  const int iterations = tbb::info::default_concurrency();
+  const auto spin_10ms = [](int) {
+    const auto began = tbb::tick_count::now();
+    while ((tbb::tick_count::now() - began).seconds() < 0.01) { /* spin */ }
+  };
+  tbb::parallel_for(0, iterations, spin_10ms);
+}
+
+// Busy-waits until more than usec microseconds have elapsed.
+void doWork(double usec) {
+  const double wait_sec = usec*1e-06;
+  for (const auto began = tbb::tick_count::now(); (tbb::tick_count::now() - began).seconds() <= wait_sec; ) { }
+}
\ No newline at end of file
diff --git a/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp b/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp
new file mode 100644
index 0000000000..64272005af
--- /dev/null
+++ b/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp
@@ -0,0 +1,117 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <iostream>
+#include <string>
+#include <tbb/tbb.h>
+
+static inline void spinWaitForAtLeast(double sec=0.0);
+
+// Serial baseline: one untimed warm-up pass, then num_trials timed passes of N
+// spin-waits of tpi seconds each; returns the mean seconds per timed pass.
+static inline double executeFor(int num_trials, int N, double tpi) {
+  tbb::tick_count timed_from;
+  for (int trial = -1; trial < num_trials; ++trial) {  // trial -1 is the untimed warm-up
+    if (trial == 0) timed_from = tbb::tick_count::now();
+    for (int left = N; left > 0; --left) spinWaitForAtLeast(tpi);
+  }
+  const double elapsed = (tbb::tick_count::now() - timed_from).seconds();
+  return elapsed/num_trials;
+}
+
+// Parallel version of the timing loop: after one untimed warm-up pass, times
+// num_trials parallel_for passes over [0, N) with grainsize gs and partitioner
+// p, each iteration spin-waiting tpi seconds; returns the mean seconds per pass.
+template< typename P >
+static inline double executePfor(int num_trials, int N,
+                                 int gs, P& p, double tpi) {
+  using range_t = tbb::blocked_range<int>;
+  const auto spin_over = [tpi](const range_t& chunk) {
+    for (int left = chunk.end() - chunk.begin(); left > 0; --left)
+      spinWaitForAtLeast(tpi);
+  };
+
+  tbb::tick_count timed_from;
+  for (int trial = -1; trial < num_trials; ++trial) {  // trial -1 is the untimed warm-up
+    if (trial == 0) timed_from = tbb::tick_count::now();
+    tbb::parallel_for(range_t{0, N, static_cast<size_t>(gs)}, spin_over, p);
+  }
+  const double elapsed = (tbb::tick_count::now() - timed_from).seconds();
+  return elapsed/num_trials;
+}
+
+#define CONSTRAIN_TO_ECORES 1
+
+// Prints, for several per-iteration spin times, the speedup over the serial
+// loop of each partitioner at every power-of-two grainsize from 1 to N.
+int main() {
+  tbb::auto_partitioner auto_p;
+  tbb::simple_partitioner simple_p;
+  tbb::static_partitioner static_p;
+  const std::string pname[4] = {"simple", "auto", "affinity", "static"};
+
+  const int N = 262144;
+  const int T = 20;
+  const double ten_ns = 0.00000001;
+  const double twenty_us = 0.00002;
+  double timing[4][19];  // [partitioner][grainsize index]; N = 2^18 gives 19 grainsizes
+
+  #if CONSTRAIN_TO_ECORES
+  std::vector<tbb::core_type_id> core_types = tbb::info::core_types();
+  tbb::task_arena::constraints c;
+  c.set_core_type(core_types.front());
+  // Leave two threads free, but never request fewer than one arena slot.
+  const int ecore_slots = tbb::info::default_concurrency(c) - 2;
+  c.set_max_concurrency(ecore_slots > 0 ? ecore_slots : 1);
+  tbb::task_arena ecore_arena(c);
+  std::cout << "Using arena with " << ecore_arena.max_concurrency() << " slots\n";
+  ecore_arena.execute([&]() {
+  #endif
+
+  for (double tpi = ten_ns; tpi < twenty_us; tpi *= 10) {
+    std::cout << "Speedups for " << tpi << " seconds per iteration" << std::endl
+              << "partitioner";
+    for (int gs = 1; gs <= N; gs *= 2)
+      std::cout << ", " << gs;
+    std::cout << std::endl;
+    double serial_time = executeFor(T, N, tpi);
+    for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) {
+      tbb::affinity_partitioner affinity_p;  // a fresh one for every grainsize
+      spinWaitForAtLeast(0.001);
+      timing[0][i] = executePfor(T, N, gs, simple_p, tpi);
+      timing[1][i] = executePfor(T, N, gs, auto_p, tpi);
+      timing[2][i] = executePfor(T, N, gs, affinity_p, tpi);
+      timing[3][i] = executePfor(T, N, gs, static_p, tpi);
+    }
+    for (int p = 0; p < 4; ++p) {
+      std::cout << pname[p];
+      for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i)
+        std::cout << ", " << serial_time/timing[p][i];
+      std::cout << std::endl;
+    }
+    std::cout << std::endl;
+  }
+  #if CONSTRAIN_TO_ECORES
+  });
+  #endif
+  return 0;
+}
+
+// Busy-waits until at least sec seconds have elapsed; returns at once when sec is 0.
+static inline void spinWaitForAtLeast(double sec) {
+  if (sec != 0.0)
+    for (const auto began = tbb::tick_count::now(); (tbb::tick_count::now() - began).seconds() < sec; ) { }
+}
diff --git a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
new file mode 100644
index 0000000000..9400e077ef
--- /dev/null
+++ b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
@@ -0,0 +1,194 @@
+/*
+Copyright (C) 2019 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.
+
+SPDX-License-Identifier: MIT
+*/
+
+#include <tbb/tbb.h>
+#include <iostream>
+
+// Writes the transpose of the N x N row-major matrix a into b; returns elapsed seconds.
+double serialTranspose(int N, double *a, double *b) {
+  const tbb::tick_count start = tbb::tick_count::now();
+  for (int row = 0; row < N; ++row) {
+    const double *src = a + row*N;   // first element of the source row
+    for (int col = 0; col < N; ++col) {
+      b[col*N + row] = src[col];
+    }
+  }
+  return (tbb::tick_count::now() - start).seconds();
+}
+
+// Recursively transposes the tile [ib,ie) x [jb,je) of a into b, halving the longer
+// side until both sides are at most gs, then copying the tile element by element.
+void obliviousTranspose(int N, int ib, int ie, int jb, int je,
+                        double *a, double *b, int gs) {
+  const int rows = ie - ib;
+  const int cols = je - jb;
+  if (rows <= gs && cols <= gs) {   // base case: the tile is small enough to copy
+    for (int i = ib; i < ie; ++i)
+      for (int j = jb; j < je; ++j)
+        b[j*N+i] = a[i*N+j];
+    return;
+  }
+  if (rows > cols) {                // split the row range in half
+    const int mid = (ib + ie) / 2;
+    obliviousTranspose(N, ib, mid, jb, je, a, b, gs);
+    obliviousTranspose(N, mid, ie, jb, je, a, b, gs);
+  } else {                          // split the column range in half
+    const int mid = (jb + je) / 2;
+    obliviousTranspose(N, ib, ie, jb, mid, a, b, gs);
+    obliviousTranspose(N, ib, ie, mid, je, a, b, gs);
+  }
+}
+
+// Times one obliviousTranspose call over the whole matrix with tile bound gs; returns seconds.
+double serialObliviousTranspose(int N, double *a, double *b, int gs) {
+   const tbb::tick_count start = tbb::tick_count::now();
+   obliviousTranspose(N, /*ib=*/0, /*ie=*/N, /*jb=*/0, /*je=*/N, a, b, gs);
+   return (tbb::tick_count::now() - start).seconds();
+}
+
+// Transposes a into b with a 1D tbb::parallel_for over rows, using grainsize gs and a
+// default-constructed partitioner of type P; returns the elapsed time in seconds.
+template< typename P >
+double pforTranspose(int N, double *a, double *b, int gs) {
+   using RowRange = tbb::blocked_range<int>;
+   const tbb::tick_count start = tbb::tick_count::now();
+   tbb::parallel_for( RowRange(0, N, gs),
+     [N, a, b](const RowRange& rows) {
+       for (int i = rows.begin(), row_end = rows.end(); i < row_end; ++i) {
+         const double *src = a + i*N;
+         for (int j = 0; j < N; ++j)
+           b[j*N+i] = src[j];
+       }
+     }, P() );
+   return (tbb::tick_count::now() - start).seconds();
+}
+
+// Transposes a into b with a 2D tbb::parallel_for over tiles (grainsize gs in both
+// dimensions) and a default-constructed partitioner of type P; returns seconds.
+template<typename P>
+double pforTranspose2d(int N, double *a, double *b, int gs) {
+  using TileRange = tbb::blocked_range2d<int,int>;
+  const size_t grain = static_cast<size_t>(gs);
+  const tbb::tick_count start = tbb::tick_count::now();
+  tbb::parallel_for( TileRange{0, N, grain, 0, N, grain},
+    [N, a, b](const TileRange& tile) {
+      const int row_end = tile.rows().end();
+      const int col_end = tile.cols().end();
+      for (int i = tile.rows().begin(); i < row_end; ++i) {
+        for (int j = tile.cols().begin(); j < col_end; ++j)
+          b[j*N+i] = a[i*N+j];
+      }
+    }, P()
+  );
+  return (tbb::tick_count::now() - start).seconds();
+}
+
+void setArray(int N, double *a);
+void checkTranspose(int N, double *a);
+
+#define CONSTRAIN_TO_ECORES 1
+
+int main() {  // times serial, cache-oblivious and TBB transposes of an N x N matrix
+  int N = 2<<12; // 8192
+  std::vector<double> a_storage(N*N), b_storage(N*N); // RAII: released when main returns
+  double *a = a_storage.data(), *b = b_storage.data();
+  setArray(N, a);
+  setArray(N, b);
+
+  #if CONSTRAIN_TO_ECORES
+  std::vector<tbb::core_type_id> core_types = tbb::info::core_types();
+  tbb::task_arena::constraints c;
+  c.set_core_type(core_types.front());
+  c.set_max_concurrency(tbb::info::default_concurrency(c) - 2);
+  tbb::task_arena ecore_arena(c);
+  std::cout << "Using arena with " << ecore_arena.max_concurrency() << " slots\n";
+  ecore_arena.execute([&]() {
+  #endif
+  // every kernel below is run twice; only the time of the second run is reported
+  serialTranspose(N, a, b);
+  double ts = serialTranspose(N, a, b);
+  checkTranspose(N, b);
+  std::cout << "Serial Time = " << ts << std::endl;
+
+  std::cout << "Parallel Times:" << std::endl
+            << "grainsize, oblivious, 1d auto, 1d simple, 2d auto, 2d simple" << std::endl;
+  for (int gs = 1; gs <= N; gs *= 2) {  // one output row per power-of-two grainsize
+    setArray(N, a);
+    setArray(N, b);
+    serialObliviousTranspose(N, a, b, gs);
+    double to = serialObliviousTranspose(N, a, b, gs);
+    checkTranspose(N, b);
+
+    setArray(N, a);
+    setArray(N, b);
+    pforTranspose<tbb::auto_partitioner>(N, a, b, gs);
+    double t1d_auto = pforTranspose<tbb::auto_partitioner>(N, a, b, gs);
+
+    setArray(N, a);
+    setArray(N, b);
+    pforTranspose<tbb::simple_partitioner>(N, a, b, gs);
+    double t1d_simple = pforTranspose<tbb::simple_partitioner>(N, a, b, gs);
+
+    setArray(N, a);
+    setArray(N, b);
+    pforTranspose2d<tbb::auto_partitioner>(N, a, b, gs);
+    double t2d_auto = pforTranspose2d<tbb::auto_partitioner>(N, a, b, gs);
+
+    setArray(N, a);
+    setArray(N, b);
+    pforTranspose2d<tbb::simple_partitioner>(N, a, b, gs);
+    double t2d_simple = pforTranspose2d<tbb::simple_partitioner>(N, a, b, gs);
+
+    std::cout << gs 
+              << ", " << to 
+              << ", " << t1d_auto 
+              << ", " << t1d_simple 
+              << ", " << t2d_auto
+              << ", " << t2d_simple << std::endl;
+  }
+
+  #if CONSTRAIN_TO_ECORES
+  });
+  #endif
+
+  return 0;
+}
+
+// Fills the N x N row-major matrix a so that every element holds its row index.
+void setArray(int N, double *a) {
+  double *cell = a;
+  for (int row = 0; row < N; ++row)
+    for (int col = 0; col < N; ++col)
+      *cell++ = row;
+}
+
+// Checks a[i*N+j] == j for all i, j; prints "Transpose failed" once, at the first mismatch.
+void checkTranspose(int N, double *a) {
+   for (int k = 0; k < N*N; ++k) {   // k == i*N+j, hence j == k % N
+     if (a[k] != k % N) {
+       std::cout << "Transpose failed" << std::endl;
+       return;                       // one report is enough; do not flood stdout
+     }
+   }
+}

From 315e18839eaafbe8b87cadbaf56e1895c8003ec8 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 7 Nov 2024 18:35:20 -0600
Subject: [PATCH 02/34] Added transpose example

---
 .../parallel_for_transpose_partitioners.cpp   | 34 +++++++------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
index 9400e077ef..45f68f4391 100644
--- a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
+++ b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
@@ -1,25 +1,17 @@
 /*
-Copyright (C) 2019 Intel Corporation
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"),
-to deal in the Software without restriction, including without limitation
-the rights to use, copy, modify, merge, publish, distribute, sublicense,
-and/or sell copies of the Software, and to permit persons to whom
-the Software is furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
-OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-
-SPDX-License-Identifier: MIT
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
 */
 
 #include <tbb/tbb.h>

From 3bd28aacca90ecdc393e701bcee7f3cb02da0864 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 14 Nov 2024 17:37:30 -0600
Subject: [PATCH 03/34] Fixed comments, added example

---
 new_examples/graph/graph_loops.cpp           |   3 +-
 new_examples/graph/graph_stereoscopic_3d.cpp | 227 +++++++++++++++++++
 new_examples/graph/graph_two_nodes.cpp       |   2 +-
 new_examples/graph/graph_with_join.cpp       |   2 +-
 4 files changed, 230 insertions(+), 4 deletions(-)
 create mode 100644 new_examples/graph/graph_stereoscopic_3d.cpp

diff --git a/new_examples/graph/graph_loops.cpp b/new_examples/graph/graph_loops.cpp
index b1dea8cb85..847cc544e6 100644
--- a/new_examples/graph/graph_loops.cpp
+++ b/new_examples/graph/graph_loops.cpp
@@ -27,8 +27,7 @@ void tryPutLoop() {
     }
   };
   for (int count = 0; count < limit; ++count) {
-    int value = count;
-    my_node.try_put(value);
+    my_node.try_put(count);
   }
   g.wait_for_all();
 }
diff --git a/new_examples/graph/graph_stereoscopic_3d.cpp b/new_examples/graph/graph_stereoscopic_3d.cpp
new file mode 100644
index 0000000000..35bb571b93
--- /dev/null
+++ b/new_examples/graph/graph_stereoscopic_3d.cpp
@@ -0,0 +1,227 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#define NOMINMAX
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "../common/lodepng.h"
+#include <tbb/tbb.h>
+
+class Image {
+public:
+  // pixel layout used by the helpers in this file: numChannels bytes per pixel
+  static constexpr int numChannels = 4;
+  static constexpr int redOffset = 0, greenOffset = 1, blueOffset = 2;
+  uint64_t frameNumber = -1;          // -1 converts to the largest uint64_t value
+  unsigned int width = 0;
+  unsigned int height = 0;
+  std::shared_ptr<std::vector<unsigned char>> buffer;  // null until an image is decoded
+
+  Image() = default;
+  Image(uint64_t frame_number, const std::string& file_name);
+  Image(const Image& p);
+  virtual ~Image() = default;
+  void write() const;
+};
+
+int getNextFrameNumber();
+Image getLeftImage(uint64_t frameNumber);
+Image getRightImage(uint64_t frameNumber);
+void increasePNGChannel(Image& image, int channel_offset, int increase);
+void mergeImages(Image& right, const Image& left);
+
+void stereo3D() {  // builds the stereoscopic 3D flow graph and runs it to completion
+  using Image = Image;  // local alias that just names the file-level Image class
+  // step 1: create graph object
+  tbb::flow::graph g;
+
+  // step 2: create nodes
+  tbb::flow::input_node frame_no_node{g,  // source: emits the values of getNextFrameNumber()
+    []( tbb::flow_control &fc ) -> uint64_t {
+      uint64_t frame_number = getNextFrameNumber();
+      if (frame_number)
+        return frame_number;
+      else {
+        fc.stop();  // a frame number of 0 means there are no more frames
+        return frame_number;
+      }
+        
+    }
+  };
+  tbb::flow::function_node<uint64_t, Image> get_left_node{g,  // frame number -> left image
+    /* concurrency */ tbb::flow::serial,
+    [] (uint64_t frame_number) -> Image {
+      return getLeftImage(frame_number);
+    }
+  };
+  tbb::flow::function_node<uint64_t, Image> get_right_node{g,  // frame number -> right image
+    /* concurrency */ tbb::flow::serial,
+    [] (uint64_t frame_number) -> Image {
+      return getRightImage(frame_number);
+    }
+  };
+  tbb::flow::function_node<Image, Image> increase_left_node{g,  // raises the red channel by 10
+    /* concurrency */ tbb::flow::unlimited,
+    [] (Image left) -> Image {
+        increasePNGChannel(left, Image::redOffset, 10);
+        return left;
+    }
+  };
+  tbb::flow::function_node<Image, Image> increase_right_node{g,  // raises the blue channel by 10
+    /* concurrency */ tbb::flow::unlimited,
+    [] (Image right) -> Image {
+        increasePNGChannel(right, Image::blueOffset, 10);
+        return right;
+    }
+  };
+  // pairs a left and a right image that carry the same frameNumber (the matching tag)
+  tbb::flow::join_node<std::tuple<Image, Image>, tbb::flow::tag_matching >
+    join_images_node(g, [] (Image left) { return left.frameNumber; },
+                        [] (Image right) { return right.frameNumber; } );
+  tbb::flow::function_node<std::tuple<Image, Image>, Image> merge_images_node{g, 
+    /* concurrency */ tbb::flow::unlimited,
+    [] (std::tuple<Image, Image> t) -> Image {
+      auto& l = std::get<0>(t);  // left image  (join port 0)
+      auto& r = std::get<1>(t);  // right image (join port 1)
+      mergeImages(r, l);         // r is modified in place and becomes the result
+      return r;
+    }
+  };
+  tbb::flow::function_node<Image> write_node{g,  // sink: writes each merged image to disk
+    /* concurrency */ tbb::flow::unlimited,
+    [] (Image img) {
+      img.write();
+    }
+  };
+
+  // step 3: add edges
+  tbb::flow::make_edge(frame_no_node, get_left_node);   // each frame number feeds both loaders
+  tbb::flow::make_edge(frame_no_node, get_right_node);
+  tbb::flow::make_edge(get_left_node, increase_left_node);
+  tbb::flow::make_edge(get_right_node, increase_right_node);
+  tbb::flow::make_edge(increase_left_node, 
+                       tbb::flow::input_port<0>(join_images_node));
+  tbb::flow::make_edge(increase_right_node, 
+                       tbb::flow::input_port<1>(join_images_node));
+  tbb::flow::make_edge(join_images_node, merge_images_node);
+  tbb::flow::make_edge(merge_images_node, write_node);
+
+  // step 4: send messages in to the graph
+  frame_no_node.activate();  // the input node produces nothing until it is activated
+  // step 5: wait for graph to complete
+  g.wait_for_all();
+}
+
+// Decodes file_name into a new buffer and tags it with frame_number; on failure zeroes the size.
+Image::Image(uint64_t frame_number, const std::string& file_name) :
+  frameNumber{frame_number}, buffer{std::make_shared< std::vector<unsigned char> >()} {
+  if (lodepng::decode(*buffer, width, height, file_name) == 0) return;  // zero: no error
+  std::cerr << "Error: could not read PNG file!" << std::endl;
+  width = height = 0;
+}
+
+Image::Image(const Image& p)  // shallow copy: the pixel buffer is shared with p
+  : frameNumber{p.frameNumber}, width{p.width}, height{p.height},
+    buffer{p.buffer} {}
+
+// Encodes the image with lodepng::encode into the file "out<frameNumber>.png";
+// a failed encode is reported on std::cerr.
+void Image::write() const {
+  const std::string file_name = "out" + std::to_string(frameNumber) + ".png";
+  if (lodepng::encode(file_name, *buffer, width, height) == 0) return;
+  std::cerr << "Error: could not write PNG file!" << std::endl;
+}
+
+static int stereo3DFrameCounter = 0;  // number of the last frame returned by getNextFrameNumber
+static int stero3DNumImages = 0;      // how many frames to hand out; set by initStereo3D
+
+void initStereo3D(int num_images) {  // restarts frame numbering for a run of num_images frames
+  stero3DNumImages = num_images;
+  stereo3DFrameCounter = 0;
+}
+
+// Returns the next 1-based frame number, or 0 once stero3DNumImages frames were handed out.
+int getNextFrameNumber() {
+  if (stereo3DFrameCounter >= stero3DNumImages) {
+    return 0;
+  }
+  return ++stereo3DFrameCounter;
+}
+
+Image getLeftImage(uint64_t frameNumber) {  // the left image is always decoded from input1.png
+  return Image{frameNumber, "input1.png"};
+}
+
+Image getRightImage(uint64_t frameNumber) {  // the right image is always decoded from input2.png
+  return Image{frameNumber, "input2.png"};
+}
+
+// Adds increase to one color channel (selected by channel_offset) of every pixel,
+// saturating the result to the 0..255 range of an 8-bit channel in both directions.
+void increasePNGChannel(Image& image, int channel_offset, int increase) {
+  std::vector<unsigned char>& buffer = *image.buffer;
+  const int row_stride = Image::numChannels * image.width;
+  for (unsigned int y = 0; y < image.height; y++) {
+    const int row_start = row_stride * y;
+    for (unsigned int x = 0; x < image.width; x++) {
+      const int pixel_offset = row_start + Image::numChannels * x + channel_offset;
+      buffer[pixel_offset] = static_cast<uint8_t>(std::clamp(buffer[pixel_offset] + increase, 0, 255));
+    }
+  }
+}
+
+// Overwrites the red channel of every pixel in right with the red channel of the same
+// pixel in left. Images whose dimensions differ are reported and left unmerged.
+void mergeImages(Image& right, const Image& left) {
+  if (left.width != right.width || left.height != right.height) {
+    std::cerr << "Error: cannot merge images of different sizes!" << std::endl;
+    return;  // indexing left with right's dimensions would read out of bounds
+  }
+  const int row_stride = Image::numChannels * right.width;
+  for (unsigned int y = 0; y < right.height; y++) {
+    for (unsigned int x = 0; x < right.width; x++) {
+      const int red_index = row_stride * y + Image::numChannels * x + Image::redOffset;
+      (*right.buffer)[red_index] = (*left.buffer)[red_index];
+    }
+  }
+}
+
+static void warmupTBB() {
+  // one 0.01-second busy-wait iteration per slot reported by max_concurrency()
+  tbb::parallel_for(0, tbb::this_task_arena::max_concurrency(), [](int) {
+    for (const auto start = tbb::tick_count::now(); (tbb::tick_count::now() - start).seconds() < 0.01;) {}
+  });
+}
+
+// Sends num_images frame numbers through the stereo3D flow graph and prints the
+// wall-clock time of the graph run; the TBB warm-up is not part of the measurement.
+int main(int argc, char *argv[]) {
+  const int num_images = 3;
+  initStereo3D(num_images);
+
+  // warm up TBB before the clock is started
+  warmupTBB();
+
+  const tbb::tick_count start = tbb::tick_count::now();
+  stereo3D();
+  const double parallel_time = (tbb::tick_count::now() - start).seconds();
+
+  std::cout << "parallel_time == " << parallel_time << " seconds" << std::endl;
+  return 0;
+}
+
diff --git a/new_examples/graph/graph_two_nodes.cpp b/new_examples/graph/graph_two_nodes.cpp
index 1a3d647744..481667e714 100644
--- a/new_examples/graph/graph_two_nodes.cpp
+++ b/new_examples/graph/graph_two_nodes.cpp
@@ -40,7 +40,7 @@ void graphTwoNodes() {
   // step 3: add edges
   tbb::flow::make_edge(my_first_node, my_second_node);
 
-  // step 4: send messages
+  // step 4: send message that eagerly starts graph execution
   my_first_node.try_put(10);
 
   // step 5: wait for graph to complete
diff --git a/new_examples/graph/graph_with_join.cpp b/new_examples/graph/graph_with_join.cpp
index 88ad9639cb..2efc46212a 100644
--- a/new_examples/graph/graph_with_join.cpp
+++ b/new_examples/graph/graph_with_join.cpp
@@ -55,7 +55,7 @@ void graphJoin() {
   make_edge(my_other_node, tbb::flow::input_port<1>(my_join_node));
   make_edge(my_join_node, my_final_node);
 
-  // step 4: send messages
+  // step 4: send messages that eagerly start graph execution
   my_node.try_put(1);
   my_other_node.try_put(2);
   // step 5: wait for the graph to complete

From 884582ef972f8b1c1ce3b41f9ac7e6f67f25ab08 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 14 Nov 2024 21:34:41 -0600
Subject: [PATCH 04/34] Updated execute_while_building example

---
 .../graph/graph_execute_while_building.cpp    | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/new_examples/graph/graph_execute_while_building.cpp b/new_examples/graph/graph_execute_while_building.cpp
index 7a977221df..387bb9a9f9 100644
--- a/new_examples/graph/graph_execute_while_building.cpp
+++ b/new_examples/graph/graph_execute_while_building.cpp
@@ -16,6 +16,7 @@
 
 #include <iostream>
 #include <memory>
+#include <vector>
 #include <tbb/tbb.h>
 
 struct config_t { 
@@ -23,16 +24,17 @@ struct config_t {
   int predecessor; 
 };  
 
-config_t configuration[] = { { 0, 3 }, 
-                           { 1, 4 }, 
-                           { 2, 5 }, 
-                           { 3, -1 }, 
-                           { 4, -1 }, 
-                           { 5, 3 }, 
-                           { 6, 4 }, 
-                           { 7, 1 } };
+// each element defines a node and what other node it must wait for
+std::vector<config_t> configuration =   { { 0, 3 }, 
+                                          { 1, 4 }, 
+                                          { 2, 5 }, 
+                                          { 3, -1 }, 
+                                          { 4, -1 }, 
+                                          { 5, 3 }, 
+                                          { 6, 4 }, 
+                                          { 7, 1 } };
 
-int num_nodes = sizeof(configuration) / sizeof(config_t);
+const int num_nodes = configuration.size();
 
 int main() {
   tbb::flow::graph g;
@@ -57,14 +59,18 @@ int main() {
                           return m;
                         }
                        });
-    // connect the new node to its future
+    // connect the new node to its "future"
     tbb::flow::make_edge(*work_nodes[c.id], future_nodes[c.id]); 
 
     // start the node or link it to predecessor's promise
     if (c.predecessor != -1) {
+       // must connect to predecessor's "future"
+       // if the future is already written to this will start the node
+       // otherwise it will be started when the future is written
        std::printf("new %d with %d -> %d\n", c.id, c.predecessor, c.id);
        tbb::flow::make_edge(future_nodes[c.predecessor], *work_nodes[c.id]);
     } else {
+       // does not need to wait and can be started immediately
        std::printf("starting %d from main\n", c.id);
        work_nodes[c.id]->try_put(tbb::flow::continue_msg{});
     }

From 27d71ea301578b142b1de5983d7d39776103bc44 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 14 Nov 2024 22:32:20 -0600
Subject: [PATCH 05/34] Added noexcept to lightweight samples

---
 new_examples/graph/graph_small_nodes.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/new_examples/graph/graph_small_nodes.cpp b/new_examples/graph/graph_small_nodes.cpp
index 345ff196e6..6e2c9f5051 100644
--- a/new_examples/graph/graph_small_nodes.cpp
+++ b/new_examples/graph/graph_small_nodes.cpp
@@ -57,9 +57,9 @@ void small_nodes_lightweight() {
   tbb::flow::function_node< int, int > add( g, tbb::flow::unlimited, 
                      [](const int &v) { return v+1; } );
   tbb::flow::function_node< int, int, tbb::flow::lightweight >  multiply( g, tbb::flow::unlimited, 
-                          [](const int &v) { return v*2; } );
+                          [](const int &v) noexcept { return v*2; } );
   tbb::flow::function_node< int, int, tbb::flow::lightweight >  cube( g, tbb::flow::unlimited, 
-                      [](const int &v) { return v*v*v; } );
+                      [](const int &v) noexcept { return v*v*v; } );
 
   tbb::flow::make_edge(add, multiply);
   tbb::flow::make_edge(multiply, cube);
@@ -73,7 +73,7 @@ void small_nodes_combined_lightweight() {
   tbb::flow::graph g;
 
   tbb::flow::function_node< int, int, tbb::flow::lightweight > combined_node( g, tbb::flow::unlimited, 
-                     [](const int &v) { 
+                     [](const int &v) noexcept { 
                         auto v2 = (v+1)*2;
                         return v2*v2*v2;
                      });

From b8dda7f1b8698769e9ef3a43d45d6529a470d7ea Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Sat, 16 Nov 2024 13:31:10 -0600
Subject: [PATCH 06/34] Fixed minor typos

---
 new_examples/tasks/parallel_invoke_fib.cpp | 2 +-
 new_examples/tasks/resumable_tasks.cpp     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/new_examples/tasks/parallel_invoke_fib.cpp b/new_examples/tasks/parallel_invoke_fib.cpp
index a5e2f7c584..903729fc0f 100644
--- a/new_examples/tasks/parallel_invoke_fib.cpp
+++ b/new_examples/tasks/parallel_invoke_fib.cpp
@@ -73,7 +73,7 @@ int main(int argc, char** argv)
 
   std::cout << "SerialFib:   " << fib_s << " Time: " << t_s << "\n";
   std::cout << "ParallelFib: " << fib_p << " Time: " << t_p << " Speedup: " << t_s/t_p << "\n";
-  std::cout << "ParallelFibCutoff_30: " << fib_c << " Time: " << t_p << " Speedup: " << t_s/t_c << "\n";
+  std::cout << "ParallelFibCutoff_30: " << fib_c << " Time: " << t_c << " Speedup: " << t_s/t_c << "\n";
   return 0;
 }
 
diff --git a/new_examples/tasks/resumable_tasks.cpp b/new_examples/tasks/resumable_tasks.cpp
index 9cd3cc8982..c1bedaf921 100755
--- a/new_examples/tasks/resumable_tasks.cpp
+++ b/new_examples/tasks/resumable_tasks.cpp
@@ -32,7 +32,7 @@ int main() {
         tbb::task::suspend([=,&sycl_q](tbb::task::suspend_point tag) {
           auto sycl_event = sycl_q.fill(a, id, N);
           sycl_q.submit([=](sycl::handler& sycl_h) {
-            sycl_h.depends_on(sycl_event); // only run after e is done
+            sycl_h.depends_on(sycl_event); // run after sycl_event is done
             sycl_h.host_task([tag]() { 
               tbb::task::resume(tag);
             });

From e1132a1fb2885d446f3d1d8e60962a86362e8f2e Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Sat, 16 Nov 2024 20:07:40 -0600
Subject: [PATCH 07/34] Added atomics migration example

---
 new_examples/migration/migrate_atomics.cpp    | 79 +++++++++++++++++++
 .../migration/migrate_task_scheduler_init.cpp |  1 -
 2 files changed, 79 insertions(+), 1 deletion(-)
 create mode 100644 new_examples/migration/migrate_atomics.cpp

diff --git a/new_examples/migration/migrate_atomics.cpp b/new_examples/migration/migrate_atomics.cpp
new file mode 100644
index 0000000000..822858a98c
--- /dev/null
+++ b/new_examples/migration/migrate_atomics.cpp
@@ -0,0 +1,79 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <vector>
+#include <iostream>
+#include <algorithm>
+#include <random>
+
+#include <tbb/tick_count.h>
+#include <tbb/parallel_for.h>
+
+#if TBB_VERSION_MAJOR > 2020
+#include <atomic>
+#else
+#include <tbb/atomic.h>
+#endif
+
+int main(int argc, char** argv) {  // histogram of a random 8-bit image: serial vs. atomic parallel
+  long int n = 1000000000;
+  constexpr int num_bins = 256;
+
+  // Initialize random number generator
+  std::random_device seed;    // Random device seed
+  std::mt19937 mte{seed()};   // mersenne_twister_engine
+  std::uniform_int_distribution<> uniform{0,num_bins-1}; // both bounds are inclusive
+  // Initialize image: n pixels, each a valid bin index that fits in a uint8_t
+  std::vector<uint8_t> image; // empty vector
+  image.reserve(n);           // image vector preallocated
+  std::generate_n(std::back_inserter(image), n,
+                    [&] { return uniform(mte); }
+                 );
+  // Initialize histogram
+  std::vector<int> hist(num_bins);
+
+  // Serial execution
+  tbb::tick_count t0 = tbb::tick_count::now();
+  std::for_each(image.begin(), image.end(),
+                [&](uint8_t i){hist[i]++;});
+  tbb::tick_count t1 = tbb::tick_count::now();
+  double t_serial = (t1 - t0).seconds();
+
+  // Parallel execution
+  #if TBB_VERSION_MAJOR > 2020
+  std::vector<std::atomic<int>> hist_p(num_bins);
+  #else
+  std::vector<tbb::atomic<int>> hist_p(num_bins);
+  #endif
+
+  t0 = tbb::tick_count::now();
+  tbb::parallel_for(tbb::blocked_range<size_t>{0, image.size()},
+                    [&](const tbb::blocked_range<size_t>& r)
+                    {
+                      for (size_t i = r.begin(); i < r.end(); ++i)
+                        hist_p[image[i]]++;   // atomic increment of a shared bin
+                    });
+  t1 = tbb::tick_count::now();
+  double t_parallel = (t1 - t0).seconds();
+
+  std::cout << "Serial: "   << t_serial   << ", ";
+  std::cout << "Parallel: " << t_parallel << ", ";
+  std::cout << "Speed-up: " << t_serial/t_parallel << std::endl;
+
+  if (!std::equal(hist.begin(),hist.end(),hist_p.begin()))  // both histograms must agree
+      std::cerr << "Parallel computation failed!!" << std::endl;
+  return 0;
+}
diff --git a/new_examples/migration/migrate_task_scheduler_init.cpp b/new_examples/migration/migrate_task_scheduler_init.cpp
index 7021a38a31..4e1114c6fb 100644
--- a/new_examples/migration/migrate_task_scheduler_init.cpp
+++ b/new_examples/migration/migrate_task_scheduler_init.cpp
@@ -77,7 +77,6 @@ void clearParticipation() {
 }
 
 void dumpParticipation(int p) {
-  int end = next_tid;
   int sum = tid_participation[0];
   std::cout << "[" << tid_participation[0];
   for (int i = 1; i < p; ++i) {

From 055781ad76569c631986a2a08652a47016a02a6b Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Sat, 16 Nov 2024 20:31:56 -0600
Subject: [PATCH 08/34] Added warnings for old TBB usage

---
 new_examples/migration/migrate_atomics.cpp             | 1 +
 new_examples/migration/migrate_bypass_tasks.cpp        | 1 +
 new_examples/migration/migrate_parallel_do.cpp         | 1 +
 new_examples/migration/migrate_priorities.cpp          | 1 +
 new_examples/migration/migrate_recycling_tasks.cpp     | 5 +++--
 new_examples/migration/migrate_task_blocking.cpp       | 1 +
 new_examples/migration/migrate_task_scheduler_init.cpp | 1 +
 new_examples/migration/migrate_tasks_adding_work.cpp   | 1 +
 8 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/new_examples/migration/migrate_atomics.cpp b/new_examples/migration/migrate_atomics.cpp
index 822858a98c..95c0c551b5 100644
--- a/new_examples/migration/migrate_atomics.cpp
+++ b/new_examples/migration/migrate_atomics.cpp
@@ -56,6 +56,7 @@ int main(int argc, char** argv) {
   #if TBB_VERSION_MAJOR > 2020
   std::vector<std::atomic<int>> hist_p(num_bins);
   #else
+  #warning Using tbb::atomic instead of std::atomic
   std::vector<tbb::atomic<int>> hist_p(num_bins);
   #endif
 
diff --git a/new_examples/migration/migrate_bypass_tasks.cpp b/new_examples/migration/migrate_bypass_tasks.cpp
index 64ba8f6b81..8675252269 100644
--- a/new_examples/migration/migrate_bypass_tasks.cpp
+++ b/new_examples/migration/migrate_bypass_tasks.cpp
@@ -114,6 +114,7 @@ void parallelFwdSubTaskGroup(std::vector<double>& x,
 }
 
 #if TBB_VERSION_MAJOR <= 2020
+#warning Using tbb::task directly
 using RootTask = tbb::empty_task;
 
 class FwdSubTask : public tbb::task {
diff --git a/new_examples/migration/migrate_parallel_do.cpp b/new_examples/migration/migrate_parallel_do.cpp
index 8672f9108d..335d94890d 100644
--- a/new_examples/migration/migrate_parallel_do.cpp
+++ b/new_examples/migration/migrate_parallel_do.cpp
@@ -87,6 +87,7 @@ void parallelFwdSub(std::vector<double>& x,
     }
   );
 #else
+#warning Using tbb::parallel_do instead of tbb::parallel_for_each
   tbb::parallel_do( &top_left, &top_left+1, 
     [&](const BlockIndex& bi, tbb::parallel_do_feeder<BlockIndex>& f) {
       auto [r, c] = bi;
diff --git a/new_examples/migration/migrate_priorities.cpp b/new_examples/migration/migrate_priorities.cpp
index 3b6cc43e51..e65bb4f5cf 100644
--- a/new_examples/migration/migrate_priorities.cpp
+++ b/new_examples/migration/migrate_priorities.cpp
@@ -71,6 +71,7 @@ void runParallelForWithHighPriority() {
   std::printf("\n");
 }
 #else
+#warning Using tbb::task directly
 auto P = tbb::task_scheduler_init::default_num_threads();
 
 class MyTask : public tbb::task {
diff --git a/new_examples/migration/migrate_recycling_tasks.cpp b/new_examples/migration/migrate_recycling_tasks.cpp
index eae856a95b..f066c130a6 100644
--- a/new_examples/migration/migrate_recycling_tasks.cpp
+++ b/new_examples/migration/migrate_recycling_tasks.cpp
@@ -62,8 +62,8 @@ class FwdSubFunctor {
                 const std::vector<double>& a, 
                 std::vector<double>& b,
                 std::vector<std::atomic<char>>& ref_count) : 
-                my_tg(tg), my_N(N), my_num_blocks(num_blocks), 
-                my_index(new BlockIndex{bi}), 
+                my_tg(tg), my_index(new BlockIndex{bi}),
+                my_N(N), my_num_blocks(num_blocks),  
                 my_x(x), my_a(a), my_b(b), my_ref_count(ref_count) {}
 
   void operator()() const {
@@ -123,6 +123,7 @@ void parallelFwdSub(std::vector<double>& x,
   tg.wait();
 }
 #else
+#warning Using tbb::task directly with recycling 
 using RootTask = tbb::empty_task;
 
 class FwdSubTask : public tbb::task {
diff --git a/new_examples/migration/migrate_task_blocking.cpp b/new_examples/migration/migrate_task_blocking.cpp
index b09edbaca3..0852e38108 100644
--- a/new_examples/migration/migrate_task_blocking.cpp
+++ b/new_examples/migration/migrate_task_blocking.cpp
@@ -28,6 +28,7 @@ void taskBlocking() {
   g.wait();
 }
 #else
+#warning Using tbb::task directly
 const int P = tbb::task_scheduler_init::default_num_threads();
 
 class MyTask : public tbb::task {
diff --git a/new_examples/migration/migrate_task_scheduler_init.cpp b/new_examples/migration/migrate_task_scheduler_init.cpp
index 4e1114c6fb..3f2a54c80a 100644
--- a/new_examples/migration/migrate_task_scheduler_init.cpp
+++ b/new_examples/migration/migrate_task_scheduler_init.cpp
@@ -32,6 +32,7 @@ void setThreadsAndSlots() {
   });
 }
 #else
+#warning Using tbb::task_scheduler_init instead of tbb::global_control
 const int N = tbb::task_scheduler_init::default_num_threads();
 
 void setThreadsAndSlots() {
diff --git a/new_examples/migration/migrate_tasks_adding_work.cpp b/new_examples/migration/migrate_tasks_adding_work.cpp
index 01686557e6..442b7eaed7 100644
--- a/new_examples/migration/migrate_tasks_adding_work.cpp
+++ b/new_examples/migration/migrate_tasks_adding_work.cpp
@@ -102,6 +102,7 @@ void parallelFwdSubTaskGroup(std::vector<double>& x,
 }
 
 #if TBB_VERSION_MAJOR <= 2020
+#warning Using tbb::task directly
 using RootTask = tbb::empty_task;
 
 class FwdSubTask : public tbb::task {

From 848c1e005f6cc0720758b2211aadb84076049047 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Sun, 17 Nov 2024 22:26:56 -0600
Subject: [PATCH 09/34] Changes based on reviews

---
 .../migration/migrate_bypass_tasks.cpp        | 25 +++++++++++++++++++
 .../migration/migrate_task_scheduler_init.cpp |  6 ++---
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/new_examples/migration/migrate_bypass_tasks.cpp b/new_examples/migration/migrate_bypass_tasks.cpp
index 8675252269..fc35d6782b 100644
--- a/new_examples/migration/migrate_bypass_tasks.cpp
+++ b/new_examples/migration/migrate_bypass_tasks.cpp
@@ -54,6 +54,7 @@ void serialFwdSubTiled(std::vector<double>& x,
   }
 }
 
+#if TBB_VERSION_MAJOR > 2020
 tbb::task_handle fwdSubTGBody(tbb::task_group& tg,
                               int N, int num_blocks, 
                               const std::pair<size_t, size_t> bi, 
@@ -86,6 +87,30 @@ tbb::task_handle fwdSubTGBody(tbb::task_group& tg,
 
   return deferred_task;
 }
+#else
+void fwdSubTGBody(tbb::task_group& tg, // TBB_VERSION_MAJOR <= 2020 variant: returns nothing and starts successors with tg.run()
+                int N, int num_blocks, 
+                const std::pair<size_t, size_t> bi, // (row, column) index of the block handled by this call
+                std::vector<double>& x, 
+                const std::vector<double>& a, 
+                std::vector<double>& b,
+                std::vector<std::atomic<char>>& ref_count) { // one counter per block, indexed as row*num_blocks + column
+  auto [r, c] = bi;
+  computeBlock(N, r, c, x, a, b);
+  // right neighbor (r, c+1): only considered while c + 1 <= r; it is run once its counter is decremented to zero
+  if (c + 1 <= r && --ref_count[r*num_blocks + c + 1] == 0) {
+    tg.run([&, N, num_blocks, r, c]() { 
+      fwdSubTGBody(tg, N, num_blocks, BlockIndex(r, c+1), x, a, b, ref_count);
+    });
+  }
+  // neighbor below (r+1, c): only considered while r + 1 < num_blocks; it is run once its counter is decremented to zero
+  if (r + 1 < (size_t)num_blocks && --ref_count[(r+1)*num_blocks + c] == 0) {
+    tg.run([&, N, num_blocks, r, c]() { 
+      fwdSubTGBody(tg, N, num_blocks, BlockIndex(r+1, c), x, a, b, ref_count);
+    });
+  }
+}
+#endif
 
 void parallelFwdSubTaskGroup(std::vector<double>& x, 
                              const std::vector<double>& a, 
diff --git a/new_examples/migration/migrate_task_scheduler_init.cpp b/new_examples/migration/migrate_task_scheduler_init.cpp
index 3f2a54c80a..94db31548f 100644
--- a/new_examples/migration/migrate_task_scheduler_init.cpp
+++ b/new_examples/migration/migrate_task_scheduler_init.cpp
@@ -19,7 +19,7 @@
 void doWork(double seconds);
 
 #if TBB_VERSION_MAJOR > 2020
-const int N = 2*tbb::info::default_concurrency();
+const int N = tbb::info::default_concurrency();
 
 void setThreadsAndSlots() {
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, N);
@@ -84,11 +84,11 @@ void dumpParticipation(int p) {
     sum += tid_participation[i];
     std::cout << ", " << tid_participation[i];
   }
-  for (int i = p; i < 2*N; ++i) 
+  for (int i = p; i < N; ++i) 
     std::cout << ", -";
   std::cout << "]\n" 
             << "sum == " << sum  << "\n"
-            << "expected sum " << 2*10*N << "\n";
+            << "expected sum " << 10*N << "\n";
   clearParticipation();
 }
 

From e2e813ed5322bf8bd192daddc76d4f0bcd8dd88b Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Tue, 19 Nov 2024 21:04:18 -0600
Subject: [PATCH 10/34] Reordered example

---
 .../global_control_and_implicit_arena.cpp     | 46 +++++++++----------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/new_examples/performance_tuning/global_control_and_implicit_arena.cpp b/new_examples/performance_tuning/global_control_and_implicit_arena.cpp
index a1074dc147..e9a3dfc97a 100644
--- a/new_examples/performance_tuning/global_control_and_implicit_arena.cpp
+++ b/new_examples/performance_tuning/global_control_and_implicit_arena.cpp
@@ -14,11 +14,18 @@
     limitations under the License.
 */
 
-#include <iostream>
 #include <tbb/tbb.h>
 
 const int default_P = tbb::info::default_concurrency();
-void doWork(double seconds);
+
+void noteParticipation(); /* record info for participation vector */
+void dumpParticipation(int p); /* display participation vector */
+
+void doWork(double seconds) { // calls noteParticipation() for the calling thread, then busy-waits for `seconds`
+  noteParticipation();
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() < seconds); // empty body on purpose: spin until the interval has elapsed
+}
 
 void arenaGlobalControlImplicitArena(int p) {
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p);
@@ -27,11 +34,19 @@ void arenaGlobalControlImplicitArena(int p) {
                     [](int) { doWork(0.01); });
 }
 
+
+int main() {
+  arenaGlobalControlImplicitArena(default_P);
+  dumpParticipation(default_P);
+  arenaGlobalControlImplicitArena(default_P/2);
+  dumpParticipation(default_P/2);
+  arenaGlobalControlImplicitArena(2*default_P);
+  dumpParticipation(2*default_P);
+  return 0;
+}
+
 #include <atomic>
-#include <cstdio>
-#include <vector>
-#include <map>
-#include <set>
+#include <iostream>
 #include <vector>
 
 std::atomic<int> next_tid;
@@ -46,12 +61,6 @@ void noteParticipation() {
   ++tid_participation[t];
 }
 
-void doWork(double seconds) {
-  noteParticipation();
-  tbb::tick_count t0 = tbb::tick_count::now();
-  while ((tbb::tick_count::now() - t0).seconds() < seconds);
-}
-
 void clearParticipation() {
   next_tid = 0;
   my_tid.clear();
@@ -60,7 +69,6 @@ void clearParticipation() {
 }
 
 void dumpParticipation(int p) {
-  int end = next_tid;
   int sum = tid_participation[0];
   std::cout << "[" << tid_participation[0];
   for (int i = 1; i < std::min(p, default_P); ++i) {
@@ -71,17 +79,9 @@ void dumpParticipation(int p) {
     std::cout << ", -";
   std::cout << "]\n" 
             << "sum == " << sum  << "\n"
-            << "expected sum " << 10*default_P << "\n";
+            << "expected sum " << 10*default_P << "\n\n";
   clearParticipation();
 }
 
-int main() {
-  arenaGlobalControlImplicitArena(default_P);
-  dumpParticipation(default_P);
-  arenaGlobalControlImplicitArena(default_P/2);
-  dumpParticipation(default_P/2);
-  arenaGlobalControlImplicitArena(2*default_P);
-  dumpParticipation(2*default_P);
-  return 0;
-}
+
 

From 971851a207288797f77668cc3f60a1d2cf971da7 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Tue, 19 Nov 2024 22:05:56 -0600
Subject: [PATCH 11/34] Modified bounds

---
 .../global_control_and_implicit_conflict.cpp  | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
index bf3fa3b863..d05ac031d5 100644
--- a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
+++ b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
@@ -14,20 +14,29 @@
     limitations under the License.
 */
 
-#include <iostream>
 #include <thread>
 #include <tbb/tbb.h>
 
 const int default_P = tbb::info::default_concurrency();
-void doWork(int inc, double seconds);
+
 void waitUntil(int N);
+void noteParticipation(int offset);
+void dumpParticipation(int p);
+
+void doWork(int offset, double seconds) {
+  noteParticipation(offset);
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() < seconds);
+}
 
-void arenaGlobalControlImplicitArena(int p, int inc) {
+void arenaGlobalControlImplicitArena(int p, int offset) {
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p);
 
   tbb::parallel_for(0, 
                     10*default_P, 
-                    [=](int) { doWork(inc, 0.01); });
+                    [=](int) { 
+                      doWork(offset, 0.01); 
+                    });
 }
 
 void runTwoThreads(int p0, int p1) {
@@ -43,29 +52,26 @@ void runTwoThreads(int p0, int p1) {
   t1.join();
 }
 
+int main() {
+  runTwoThreads(default_P/2, default_P);
+  dumpParticipation(default_P);
+  return 0;
+}
+
 #include <atomic>
-#include <cstdio>
-#include <vector>
-#include <map>
-#include <set>
+#include <iostream>
 #include <vector>
 
 std::atomic<int> next_tid;
 tbb::enumerable_thread_specific<int> my_tid(-1);
-std::vector<std::atomic<int>> tid_participation(2*default_P);
+std::vector<std::atomic<int>> tid_participation(default_P);
 
-void noteParticipation(int inc) {
+void noteParticipation(int offset) {
   auto& t = my_tid.local();
   if (t == -1) {
     t = next_tid++;
   }
-  tid_participation[t] += inc;
-}
-
-void doWork(int inc, double seconds) {
-  noteParticipation(inc);
-  tbb::tick_count t0 = tbb::tick_count::now();
-  while ((tbb::tick_count::now() - t0).seconds() < seconds);
+  tid_participation[t] += offset;
 }
 
 void clearParticipation() {
@@ -76,10 +82,9 @@ void clearParticipation() {
 }
 
 void dumpParticipation(int p) {
-  int end = next_tid;
   int sum = tid_participation[0];
   std::cout << "[" << tid_participation[0];
-  for (int i = 1; i < 2*default_P + 1; ++i) {
+  for (int i = 1; i < default_P; ++i) {
     sum += tid_participation[i];
     std::cout << ", " << tid_participation[i];
   }
@@ -96,9 +101,4 @@ void waitUntil(int N) {
   while (count_up != N);
 }
 
-int main() {
-  runTwoThreads(default_P/2, default_P);
-  dumpParticipation(default_P);
-  return 0;
-}
 

From 58a111a384b4307497efabcde96b666f3b0475ae Mon Sep 17 00:00:00 2001
From: Michael Voss <MichaelJ.Voss@intel.com>
Date: Tue, 19 Nov 2024 22:17:07 -0600
Subject: [PATCH 12/34] Fixed vector size in example

---
 .../global_control_and_implicit_conflict.cpp           | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
index d05ac031d5..ba72c3b654 100644
--- a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
+++ b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
@@ -21,7 +21,7 @@ const int default_P = tbb::info::default_concurrency();
 
 void waitUntil(int N);
 void noteParticipation(int offset);
-void dumpParticipation(int p);
+void dumpParticipation();
 
 void doWork(int offset, double seconds) {
   noteParticipation(offset);
@@ -54,7 +54,7 @@ void runTwoThreads(int p0, int p1) {
 
 int main() {
   runTwoThreads(default_P/2, default_P);
-  dumpParticipation(default_P);
+  dumpParticipation();
   return 0;
 }
 
@@ -64,7 +64,7 @@ int main() {
 
 std::atomic<int> next_tid;
 tbb::enumerable_thread_specific<int> my_tid(-1);
-std::vector<std::atomic<int>> tid_participation(default_P);
+std::vector<std::atomic<int>> tid_participation(default_P+1);
 
 void noteParticipation(int offset) {
   auto& t = my_tid.local();
@@ -81,10 +81,10 @@ void clearParticipation() {
     p = 0;
 }
 
-void dumpParticipation(int p) {
+void dumpParticipation() {
   int sum = tid_participation[0];
   std::cout << "[" << tid_participation[0];
-  for (int i = 1; i < default_P; ++i) {
+  for (int i = 1; i < tid_participation.size(); ++i) {
     sum += tid_participation[i];
     std::cout << ", " << tid_participation[i];
   }

From 90e2c48b8a400a63a86512629ded7769434fb8ed Mon Sep 17 00:00:00 2001
From: Michael Voss <MichaelJ.Voss@intel.com>
Date: Tue, 19 Nov 2024 22:56:54 -0600
Subject: [PATCH 13/34] Change barriers to ensure exact overlap of lifetimes

---
 .../global_control_and_implicit_conflict.cpp             | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
index ba72c3b654..a51fe5de46 100644
--- a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
+++ b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
@@ -32,20 +32,24 @@ void doWork(int offset, double seconds) {
 void arenaGlobalControlImplicitArena(int p, int offset) {
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p);
 
+  // we use waitUntil to force overlap of the gc lifetimes
+  waitUntil(2);
+
   tbb::parallel_for(0, 
                     10*default_P, 
                     [=](int) { 
                       doWork(offset, 0.01); 
                     });
+
+  // we prevent either gc from being destroyed until both are done
+  waitUntil(2);
 }
 
 void runTwoThreads(int p0, int p1) {
   std::thread t0([=]() {
-    waitUntil(2);
     arenaGlobalControlImplicitArena(p0, 1);
   });
   std::thread t1([=]() {
-    waitUntil(2);
     arenaGlobalControlImplicitArena(p1, 10000);
   });
   t0.join();
@@ -99,6 +103,7 @@ std::atomic<int> count_up = 0;
 void waitUntil(int N) {
   ++count_up;
   while (count_up != N);
+  count_up = 0;
 }
 
 

From 4341288e8ae28d244c2eefa6d94059623a646c9e Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Wed, 20 Nov 2024 09:07:22 -0600
Subject: [PATCH 14/34] Fixed waitUntil

---
 .../global_control_and_implicit_conflict.cpp  | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
index a51fe5de46..1ffe79901b 100644
--- a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
+++ b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
@@ -19,7 +19,8 @@
 
 const int default_P = tbb::info::default_concurrency();
 
-void waitUntil(int N);
+using counter_t = std::atomic<int>;
+void waitUntil(int N, counter_t& c);
 void noteParticipation(int offset);
 void dumpParticipation();
 
@@ -29,11 +30,13 @@ void doWork(int offset, double seconds) {
   while ((tbb::tick_count::now() - t0).seconds() < seconds);
 }
 
+counter_t counter1 = 0, counter2 = 0;
+
 void arenaGlobalControlImplicitArena(int p, int offset) {
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p);
 
   // we use waitUntil to force overlap of the gc lifetimes
-  waitUntil(2);
+  waitUntil(2, counter1);
 
   tbb::parallel_for(0, 
                     10*default_P, 
@@ -42,7 +45,7 @@ void arenaGlobalControlImplicitArena(int p, int offset) {
                     });
 
   // we prevent either gc from being destroyed until both are done
-  waitUntil(2);
+  waitUntil(2, counter2);
 }
 
 void runTwoThreads(int p0, int p1) {
@@ -96,14 +99,12 @@ void dumpParticipation() {
             << "sum == " << sum << "\n"
             << "expected sum == " << 10*default_P + 10*default_P*10000 << "\n";
   clearParticipation();
+  counter1 = 0; counter2 = 0;
 }
 
-std::atomic<int> count_up = 0;
-
-void waitUntil(int N) {
-  ++count_up;
-  while (count_up != N);
-  count_up = 0;
+void waitUntil(int N, counter_t& c) { // spin barrier: registers the caller on c, then waits until c equals N
+  ++c;
+  while (c != N); // busy-wait; c is never decremented here, so it has to be reset to 0 before the counter is reused
 }
 
 

From e254dea007e75947b3acb2674fd188a77f3a51ce Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Wed, 20 Nov 2024 09:18:43 -0600
Subject: [PATCH 15/34] Added whitespace in output

---
 .../performance_tuning/global_control_and_explicit_arena.cpp    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp
index 93a8520b7e..6928dc6d70 100644
--- a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp
+++ b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp
@@ -75,7 +75,7 @@ void dumpParticipation(int p) {
     std::cout << ", -";
   std::cout << "]\n" 
             << "sum == " << sum  << "\n"
-            << "expected sum " << 10*default_P << "\n";
+            << "expected sum " << 10*default_P << "\n\n";
   clearParticipation();
 }
 

From 3e6d92ba661f3166048475595d625831450a8260 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Wed, 20 Nov 2024 09:40:19 -0600
Subject: [PATCH 16/34] Made the conflict examples uniform

---
 .../global_control_and_explicit_conflict.cpp  | 70 +++++++++----------
 .../global_control_and_implicit_conflict.cpp  | 10 +--
 2 files changed, 37 insertions(+), 43 deletions(-)

diff --git a/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp
index a46c9d1100..35f5b5022b 100644
--- a/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp
+++ b/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp
@@ -14,44 +14,55 @@
     limitations under the License.
 */
 
-#include <iostream>
 #include <thread>
 #include <tbb/tbb.h>
 
 const int default_P = tbb::info::default_concurrency();
-void doWork(int inc, double seconds);
-void waitUntil(int N);
+using counter_t = std::atomic<int>;
+void waitUntil(int N, counter_t& c);
+void noteParticipation(int offset);
+void dumpParticipation();
 
-void arenaGlobalControlExplicitArena(int p, int inc) {
+void doWork(int offset, double seconds) { // passes `offset` to noteParticipation(), then busy-waits for `seconds`
+  noteParticipation(offset);
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() < seconds); // empty body on purpose: spin until the interval has elapsed
+}
+
+counter_t counter1 = 0, counter2 = 0;
+
+void arenaGlobalControlExplicitArena(int p, int offset) {
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p);
 
+  // we use waitUntil to force overlap of the gc lifetimes
+  waitUntil(2, counter1);
   tbb::task_arena a{2*tbb::info::default_concurrency()};
 
   a.execute([=]() {
     tbb::parallel_for(0, 
                       10*tbb::info::default_concurrency(), 
-                      [=](int) { doWork(inc, 0.01); });
+                      [=](int) { doWork(offset, 0.01); });
   });
+
+  // we prevent either gc from being destroyed until both are done
+  waitUntil(2, counter2);
 }
 
 void runTwoThreads(int p0, int p1) {
-  std::thread t0([=]() {
-    waitUntil(2);
-    arenaGlobalControlExplicitArena(p0, 1);
-  });
-  std::thread t1([=]() {
-    waitUntil(2);
-    arenaGlobalControlExplicitArena(p1, 10000);
-  });
+  std::thread t0([=]() { arenaGlobalControlExplicitArena(p0, 1); });
+  std::thread t1([=]() { arenaGlobalControlExplicitArena(p1, 10000); });
   t0.join();
   t1.join();
 }
 
+int main() {
+  runTwoThreads(default_P/2, 2*default_P);
+  dumpParticipation();
+  return 0;
+}
+
 #include <atomic>
-#include <cstdio>
-#include <vector>
-#include <map>
-#include <set>
+#include <iostream>
 #include <vector>
 
 std::atomic<int> next_tid;
@@ -66,12 +77,6 @@ void noteParticipation(int inc) {
   tid_participation[t] += inc;
 }
 
-void doWork(int inc, double seconds) {
-  noteParticipation(inc);
-  tbb::tick_count t0 = tbb::tick_count::now();
-  while ((tbb::tick_count::now() - t0).seconds() < seconds);
-}
-
 void clearParticipation() {
   next_tid = 0;
   my_tid.clear();
@@ -79,11 +84,10 @@ void clearParticipation() {
     p = 0;
 }
 
-void dumpParticipation(int p) {
-  int end = next_tid;
+void dumpParticipation() {
   int sum = tid_participation[0];
   std::cout << "[" << tid_participation[0];
-  for (int i = 1; i < 2*default_P; ++i) {
+  for (int i = 1; i < tid_participation.size(); ++i) {
     sum += tid_participation[i];
     std::cout << ", " << tid_participation[i];
   }
@@ -91,18 +95,12 @@ void dumpParticipation(int p) {
             << "sum == " << sum << "\n"
             << "expected sum == " << 10*default_P + 10*default_P*10000 << "\n";
   clearParticipation();
+  counter1 = 0; counter2 = 0;
 }
 
-std::atomic<int> count_up = 0;
-
-void waitUntil(int N) {
-  ++count_up;
-  while (count_up != N);
+void waitUntil(int N, counter_t& c) { // spin barrier: registers the caller on c, then waits until c equals N
+  ++c;
+  while (c != N); // busy-wait; c is never decremented here, so it has to be reset to 0 before the counter is reused
 }
 
-int main() {
-  runTwoThreads(default_P/2, 2*default_P);
-  dumpParticipation(2*default_P);
-  return 0;
-}
 
diff --git a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
index 1ffe79901b..8474c57de0 100644
--- a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
+++ b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
@@ -14,6 +14,7 @@
     limitations under the License.
 */
 
+#include <atomic>
 #include <thread>
 #include <tbb/tbb.h>
 
@@ -49,12 +50,8 @@ void arenaGlobalControlImplicitArena(int p, int offset) {
 }
 
 void runTwoThreads(int p0, int p1) {
-  std::thread t0([=]() {
-    arenaGlobalControlImplicitArena(p0, 1);
-  });
-  std::thread t1([=]() {
-    arenaGlobalControlImplicitArena(p1, 10000);
-  });
+  std::thread t0([=]() { arenaGlobalControlImplicitArena(p0, 1); });
+  std::thread t1([=]() { arenaGlobalControlImplicitArena(p1, 10000); });
   t0.join();
   t1.join();
 }
@@ -65,7 +62,6 @@ int main() {
   return 0;
 }
 
-#include <atomic>
 #include <iostream>
 #include <vector>
 

From 8d0aff817127a51fcb0fdc7feb5a96e050ea707b Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Wed, 20 Nov 2024 09:52:56 -0600
Subject: [PATCH 17/34] Minor refactoring

---
 .../global_control_and_explicit_arena.cpp                   | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp
index 6928dc6d70..65443ab64e 100644
--- a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp
+++ b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp
@@ -14,7 +14,6 @@
     limitations under the License.
 */
 
-#include <iostream>
 #include <tbb/tbb.h>
 
 const int default_P = tbb::info::default_concurrency();
@@ -33,10 +32,7 @@ void arenaGlobalControlExplicitArena(int p) {
 }
 
 #include <atomic>
-#include <cstdio>
-#include <vector>
-#include <map>
-#include <set>
+#include <iostream>
 #include <vector>
 
 std::atomic<int> next_tid;

From 84eaf9bf8eb9962d30b279f6843269d9faf3e966 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Wed, 20 Nov 2024 10:35:19 -0600
Subject: [PATCH 18/34] Cleaned up priority example output

---
 .../priorities_and_conflict.cpp               | 56 ++++++++++---------
 1 file changed, 29 insertions(+), 27 deletions(-)

diff --git a/new_examples/performance_tuning/priorities_and_conflict.cpp b/new_examples/performance_tuning/priorities_and_conflict.cpp
index b8ae8117aa..c52f2b38e3 100644
--- a/new_examples/performance_tuning/priorities_and_conflict.cpp
+++ b/new_examples/performance_tuning/priorities_and_conflict.cpp
@@ -14,18 +14,24 @@
     limitations under the License.
 */
 
-#include <iostream>
+
 #include <thread>
 #include <tbb/tbb.h>
 
 void printArrival(tbb::task_arena::priority priority);
-void waitUntil(int N);
+
+using counter_t = std::atomic<int>;
+counter_t counter = 0;
+void waitUntil(int N, counter_t& c) { // spin barrier: registers the caller on c, then waits until c equals N
+  ++c;
+  while (c != N); // busy-wait; c is never decremented here, so it has to be reset to 0 before the counter is reused
+}
 
 void explicitArenaWithPriority(tbb::task_arena::priority priority) {
   tbb::task_arena a{tbb::info::default_concurrency(), 1, priority};
   a.execute([=]() {
     tbb::parallel_for(0, 
-                      10*tbb::info::default_concurrency(), 
+                      2*tbb::info::default_concurrency(), 
                       [=](int) { printArrival(priority); });
   });
 }
@@ -33,11 +39,11 @@ void explicitArenaWithPriority(tbb::task_arena::priority priority) {
 void runTwoThreads(tbb::task_arena::priority priority0, 
                    tbb::task_arena::priority priority1) {
   std::thread t0([=]() {
-    waitUntil(2);
+    waitUntil(2, counter);
     explicitArenaWithPriority(priority0);
   });
   std::thread t1([=]() {
-    waitUntil(2);
+    waitUntil(2, counter);
     explicitArenaWithPriority(priority1);
   });
   t0.join();
@@ -46,16 +52,31 @@ void runTwoThreads(tbb::task_arena::priority priority0,
 
 #include <cstdio>
 
+int main() {
+  counter = 0;
+  std::printf("\n\n\n\nrunTwoThreads with low (.) and high (|)\n");
+  runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::high);
+
+  counter = 0;
+  std::printf("\n\n\n\nrunTwoThreads with low (.) and normal (:)\n");
+  runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::normal);
+
+  counter = 0;
+  std::printf("\n\n\n\nrunTwoThreads with normal (:) and high (|)\n");
+  runTwoThreads(tbb::task_arena::priority::normal, tbb::task_arena::priority::high);
+  return 0;
+}
+
 void printArrival(tbb::task_arena::priority priority) {
   switch (priority) {
     case tbb::task_arena::priority::low:
-      std::printf(" low ");
+      std::printf(".");
       break;
     case tbb::task_arena::priority::normal:
-      std::printf(" normal ");
+      std::printf(":");
       break;
     case tbb::task_arena::priority::high:
-      std::printf(" high ");
+      std::printf("|");
       break;
     default:
       break;
@@ -65,24 +86,5 @@ void printArrival(tbb::task_arena::priority priority) {
   while ((tbb::tick_count::now() - t0).seconds() < 0.01);
 }
 
-std::atomic<int> count_up = 0;
-void waitUntil(int N) {
-  ++count_up;
-  while (count_up != N);
-}
-
-int main() {
-  count_up = 0;
-  std::printf("\n\n\n\nrunTwoThreads(low, high)");
-  runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::high);
-
-  count_up = 0;
-  std::printf("\n\n\n\nrunTwoThreads(low, normal)");
-  runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::normal);
 
-  count_up = 0;
-  std::printf("\n\n\n\nrunTwoThreads(normal, high)");
-  runTwoThreads(tbb::task_arena::priority::normal, tbb::task_arena::priority::high);
-  return 0;
-}
 

From 4c3d84f4ceadaebae7bd353a99bd3f7c10aabd93 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Wed, 20 Nov 2024 19:24:07 -0600
Subject: [PATCH 19/34] Fixed output for priorities example

---
 .../performance_tuning/priorities_and_conflict.cpp         | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/new_examples/performance_tuning/priorities_and_conflict.cpp b/new_examples/performance_tuning/priorities_and_conflict.cpp
index c52f2b38e3..e7a1771d37 100644
--- a/new_examples/performance_tuning/priorities_and_conflict.cpp
+++ b/new_examples/performance_tuning/priorities_and_conflict.cpp
@@ -54,16 +54,17 @@ void runTwoThreads(tbb::task_arena::priority priority0,
 
 int main() {
   counter = 0;
-  std::printf("\n\n\n\nrunTwoThreads with low (.) and high (|)\n");
+  std::printf("\n\nrunTwoThreads with low (.) and high (|)\n");
   runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::high);
 
   counter = 0;
-  std::printf("\n\n\n\nrunTwoThreads with low (.) and normal (:)\n");
+  std::printf("\n\nrunTwoThreads with low (.) and normal (:)\n");
   runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::normal);
 
   counter = 0;
-  std::printf("\n\n\n\nrunTwoThreads with normal (:) and high (|)\n");
+  std::printf("\n\nrunTwoThreads with normal (:) and high (|)\n");
   runTwoThreads(tbb::task_arena::priority::normal, tbb::task_arena::priority::high);
+  std::printf("\n");
   return 0;
 }
 

From 75f28906bc3b25e9593ec2339b2d57b5f9535f31 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Wed, 20 Nov 2024 21:38:03 -0600
Subject: [PATCH 20/34] Removed tracing and added output to constraints example

---
 .../performance_tuning/constraints.cpp        | 43 +++++++------------
 1 file changed, 15 insertions(+), 28 deletions(-)

diff --git a/new_examples/performance_tuning/constraints.cpp b/new_examples/performance_tuning/constraints.cpp
index 42800c6205..02ed66e897 100644
--- a/new_examples/performance_tuning/constraints.cpp
+++ b/new_examples/performance_tuning/constraints.cpp
@@ -14,23 +14,15 @@
     limitations under the License.
 */
 
-#include "tbb/tbb.h"
 #include <vector>
+#include <tbb/tbb.h>
 
-#define USE_ARENA_TRACE 1
-#if USE_ARENA_TRACE
-#include "arena_trace.h"
-#endif
 
 int N = 1000;
 double w = 0.01;
 double f(double v);
 
 void constrain_for_numa_nodes() {
-#if USE_ARENA_TRACE
-    arena_tracer t{"numa_trace.json"};
-#endif
-
     std::vector<tbb::numa_node_id> numa_nodes = tbb::info::numa_nodes();
     std::vector<tbb::task_arena> arenas(numa_nodes.size());
     std::vector<tbb::task_group> task_groups(numa_nodes.size());
@@ -41,14 +33,18 @@ void constrain_for_numa_nodes() {
         t.add_arena(std::to_string(i), arenas[i]);
         #endif
     }
-    for (int i = 0; i < numa_nodes.size(); i++) {
-        arenas[i].execute([&task_groups, i] {
+    for (int i = 1; i < numa_nodes.size(); i++) {
+        arenas[i].enqueue([&task_groups, i] {
             task_groups[i].run([] {
                 tbb::parallel_for(0, N, [](int j) { f(w); });
             });
         });
     }
-    for (int i = 0; i < numa_nodes.size(); i++) {
+    arenas[0].execute([] {
+        tbb::parallel_for(0, N, [](int j) { f(w); });
+    });
+
+    for (int i = 1; i < numa_nodes.size(); i++) {
         arenas[i].execute([&task_groups, i] {
             task_groups[i].wait();
         });
@@ -56,17 +52,11 @@ void constrain_for_numa_nodes() {
 }
 
 void constrain_for_core_type() {
-
     std::vector<tbb::core_type_id> core_types = tbb::info::core_types();
     tbb::task_arena arena(
       tbb::task_arena::constraints{}.set_core_type(core_types.back())
     );
 
-    #if USE_ARENA_TRACE
-      arena_tracer t{"core_trace.json"};
-      t.add_arena("pcores", arena);
-    #endif
-
     arena.execute([] {
         tbb::parallel_for(0, N, [](int) { f(w); });
     });
@@ -79,11 +69,6 @@ void constrain_for_no_hyperthreading() {
     c.set_max_threads_per_core(1);
     tbb::task_arena no_ht_arena(c);
 
-    #if USE_ARENA_TRACE
-      arena_tracer t{"no_ht_constraints_trace.json"};
-      t.add_arena("no_ht_arena", no_ht_arena);
-    #endif
-
     no_ht_arena.execute( [] {
         tbb::parallel_for(0, N, [](int) { f(w); });
     });
@@ -97,21 +82,23 @@ void limit_concurrency_for_no_hyperthreading() {
     int no_ht_concurrency = tbb::info::default_concurrency(c);
     tbb::task_arena arena( no_ht_concurrency );
 
-    #if USE_ARENA_TRACE
-      arena_tracer t{"no_ht_concurrency_trace.json"};
-      t.add_arena("no_ht_concurrency", arena);
-    #endif
-
     arena.execute( [] {
         tbb::parallel_for(0, N, [](int) { f(w); });
     });
 }
 
+#include <iostream>
+
 int main() {
+  std::cout << "Running numa node constraint example\n";
   constrain_for_numa_nodes();
+  std::cout << "Running core type constraint example\n";
   constrain_for_core_type();
+  std::cout << "Running one thread per core constraint example\n";
   constrain_for_no_hyperthreading();
+    std::cout << "Running limited concurrency example\n";
   limit_concurrency_for_no_hyperthreading();
+  std::cout << "done\n";
   return 0;
 }
 

From e810f96f6b1fec108d0ab9c736fda53ece4bb545 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 21 Nov 2024 09:07:49 -0600
Subject: [PATCH 21/34] Use defer for NUMA example

---
 .../performance_tuning/constraints.cpp        | 49 +++++++++----------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/new_examples/performance_tuning/constraints.cpp b/new_examples/performance_tuning/constraints.cpp
index 02ed66e897..6df33f9366 100644
--- a/new_examples/performance_tuning/constraints.cpp
+++ b/new_examples/performance_tuning/constraints.cpp
@@ -23,32 +23,31 @@ double w = 0.01;
 double f(double v);
 
 void constrain_for_numa_nodes() {
-    std::vector<tbb::numa_node_id> numa_nodes = tbb::info::numa_nodes();
-    std::vector<tbb::task_arena> arenas(numa_nodes.size());
-    std::vector<tbb::task_group> task_groups(numa_nodes.size());
-
-    for (int i = 0; i < numa_nodes.size(); i++) {
-        arenas[i].initialize(tbb::task_arena::constraints(numa_nodes[i]), 0);
-        #if USE_ARENA_TRACE
-        t.add_arena(std::to_string(i), arenas[i]);
-        #endif
-    }
-    for (int i = 1; i < numa_nodes.size(); i++) {
-        arenas[i].enqueue([&task_groups, i] {
-            task_groups[i].run([] {
-                tbb::parallel_for(0, N, [](int j) { f(w); });
-            });
-        });
-    }
-    arenas[0].execute([] {
-        tbb::parallel_for(0, N, [](int j) { f(w); });
-    });
+  std::vector<tbb::numa_node_id> numa_nodes = tbb::info::numa_nodes();
+  std::vector<tbb::task_arena> arenas(numa_nodes.size());
+  std::vector<tbb::task_group> task_groups(numa_nodes.size());
+
+  // initialize each arena, each constrained to a different NUMA node
+  for (int i = 0; i < numa_nodes.size(); i++)
+    arenas[i].initialize(tbb::task_arena::constraints(numa_nodes[i]), 0);
+
+  // enqueue work to all but the first arena, using the task_group to track the work;
+  // because defer is used, the task_group reference count is incremented immediately
+  for (int i = 1; i < numa_nodes.size(); i++)
+    arenas[i].enqueue(
+      task_groups[i].defer([] { 
+        tbb::parallel_for(0, N, [](int j) { f(w); }); 
+      })
+    );
+
+  // directly execute the work to completion in the remaining arena
+  arenas[0].execute([] {
+    tbb::parallel_for(0, N, [](int j) { f(w); });
+  });
 
-    for (int i = 1; i < numa_nodes.size(); i++) {
-        arenas[i].execute([&task_groups, i] {
-            task_groups[i].wait();
-        });
-    }
+  // join the other arenas to wait on their task_groups
+  for (int i = 1; i < numa_nodes.size(); i++)
+    arenas[i].execute([&task_groups, i] { task_groups[i].wait(); });
 }
 
 void constrain_for_core_type() {

From ab39fd31f052b0866cd3d6cd610a911fac6024ff Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 21 Nov 2024 10:36:44 -0600
Subject: [PATCH 22/34] Added observer example

---
 .../task_scheduler_observer.cpp               | 113 ++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 new_examples/performance_tuning/task_scheduler_observer.cpp

diff --git a/new_examples/performance_tuning/task_scheduler_observer.cpp b/new_examples/performance_tuning/task_scheduler_observer.cpp
new file mode 100644
index 0000000000..8fbb367146
--- /dev/null
+++ b/new_examples/performance_tuning/task_scheduler_observer.cpp
@@ -0,0 +1,113 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <iostream>
+#include <thread>
+#include <tbb/tbb.h>
+
+// these are placeholders for where we would put OS-specific types and calls
+using affinity_mask_t = std::string;
+// Stand-in for pinning: prints "<thread id> -> (<tid>, <mask>)" for the calling thread.
+void set_thread_affinity( int tid, const affinity_mask_t& mask ) {
+  std::ostringstream line;
+  line << std::this_thread::get_id() << " -> (" << tid << ", " << mask << ")\n";
+  const std::string text = line.str();  // build the full line, then write it with one insertion
+  std::cout << text;
+}
+// Stand-in for undoing the pinning: prints "<thread id> -> (restored)".
+void restore_thread_affinity() {
+  std::ostringstream line;
+  line << std::this_thread::get_id() << " -> (restored)\n";
+  std::cout << line.str();
+}
+
+// observer bound to a single task_arena: its callbacks apply m_mask on entry and restore on exit
+class PinningObserver : public tbb::task_scheduler_observer {
+public:
+    // HW affinity mask to be used for threads in an arena
+    affinity_mask_t m_mask;
+    PinningObserver( oneapi::tbb::task_arena &a, const affinity_mask_t& mask )
+        : tbb::task_scheduler_observer(a), m_mask(mask) { // tie the observer to arena a and keep a copy of the mask
+        observe(true); // activate the observer
+    }
+    void on_scheduler_entry( bool worker ) override { // 'worker' is not used by this example
+        set_thread_affinity(
+            tbb::this_task_arena::current_thread_index(), m_mask);
+    }
+    void on_scheduler_exit( bool worker ) override { // 'worker' is not used by this example
+        restore_thread_affinity();
+    }
+};
+
+int N = 1000;
+double w = 0.01;
+double f(double v);
+
+using counter_t = std::atomic<int>;
+counter_t counter = 0;
+void waitUntil(int N, counter_t& c) {  // spin barrier: returns once at least N callers have arrived
+  ++c;  // register this caller
+  while (c < N) {}  // '<' rather than '!=' so a counter that overshoots N cannot spin forever
+}
+
+void observeTwoArenas() {  // runs the same loop in two observed arenas: first back to back, then concurrently
+  int P = tbb::info::default_concurrency();
+
+  // two arenas, each sized to half of the default concurrency (integer division)
+  tbb::task_arena a0(P/2);
+  tbb::task_arena a1(P/2);
+
+  PinningObserver obs0(a0, "mask_zero");  // observer for a0; its messages carry the label "mask_zero"
+  PinningObserver obs1(a1, "mask_one");  // observer for a1; its messages carry the label "mask_one"
+
+  // Execute consecutive loops: the a1 loop is started only after a0.execute has returned
+  std::cout << "Execute a0 loop\n";
+  a0.execute([] {
+    tbb::parallel_for(0, N, [](int j) { f(w); });
+  });
+  std::cout << "Execute a1 loop\n";
+  a1.execute([] {
+    tbb::parallel_for(0, N, [](int j) { f(w); });
+  });
+
+  // Execute concurrent loops: each std::thread submits the loop to one of the two arenas
+  std::cout << "Execute a0 and a1 concurrently\n";
+  std::thread t0([&]() { 
+    waitUntil(2, counter);  // wait until both std::threads have reached this point
+    a0.execute([] {
+      tbb::parallel_for(0, N, [](int j) { f(w); });
+    });
+  });
+  std::thread t1([&]() { 
+    waitUntil(2, counter);  // wait until both std::threads have reached this point
+    a1.execute([] {
+      tbb::parallel_for(0, N, [](int j) { f(w); });
+    });
+  });
+  t0.join();
+  t1.join();
+}
+
+int main() {  // entry point: runs the two-arena observer demonstration once
+  observeTwoArenas();
+  return 0;
+}
+
+double f(double v) {  // spins until at least 0.01 s have elapsed, then returns twice its argument
+  const tbb::tick_count start = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - start).seconds() < 0.01) {}
+  return v * 2;
+}

From e6e5bf1ee09ffde01d12d678731feba9a4edfcd0 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 21 Nov 2024 10:48:15 -0600
Subject: [PATCH 23/34] Updated CMakeLists

---
 new_examples/CMakeLists.txt                   | 13 ++++++++--
 new_examples/algorithms/CMakeLists.txt        |  3 ---
 new_examples/cancellation/CMakeLists.txt      |  3 ---
 new_examples/exception/CMakeLists.txt         |  3 ---
 new_examples/graph/CMakeLists.txt             |  5 ++--
 new_examples/intro/CMakeLists.txt             |  3 ---
 new_examples/migration/CMakeLists.txt         |  6 ++---
 .../performance_tuning/CMakeLists.txt         | 26 ++++++++++++-------
 new_examples/tasks/CMakeLists.txt             |  3 ---
 9 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/new_examples/CMakeLists.txt b/new_examples/CMakeLists.txt
index 6651073a6e..77b8cdbe93 100644
--- a/new_examples/CMakeLists.txt
+++ b/new_examples/CMakeLists.txt
@@ -2,8 +2,17 @@ cmake_minimum_required (VERSION 3.4)
 
 project(tbb_tutorials LANGUAGES CXX)
 
-set(CMAKE_CXX_COMPILER "icx-cl")
-set(CMAKE_LINKER "icpx")
+if (WIN32)
+  set(CMAKE_CXX_COMPILER "icx-cl")
+  set(CMAKE_LINKER "icpx")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb")
+  set(CMAKE_CXX_LINKER_FLAGS "-Qtbb")
+else()
+  set(CMAKE_CXX_COMPILER "icpx")
+  set(CMAKE_LINKER "icpx")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
+  set(CMAKE_CXX_LINKER_FLAGS "-tbb -lpthread") 
+endif()
 set(CMAKE_CXX_STANDARD 20)
 
 include (CTest)
diff --git a/new_examples/algorithms/CMakeLists.txt b/new_examples/algorithms/CMakeLists.txt
index f4f0aa9eca..519baa1ae7 100644
--- a/new_examples/algorithms/CMakeLists.txt
+++ b/new_examples/algorithms/CMakeLists.txt
@@ -1,6 +1,3 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb")
-
 foreach(tpp parallel_invoke_recursive_quicksort.cpp parallel_invoke_two_quicksorts.cpp 
             parallel_for_trivial.cpp parallel_for_unoptimized_mxm.cpp
             parallel_reduce_max.cpp parallel_reduce_pi.cpp
diff --git a/new_examples/cancellation/CMakeLists.txt b/new_examples/cancellation/CMakeLists.txt
index 0f26004913..adebb4e100 100644
--- a/new_examples/cancellation/CMakeLists.txt
+++ b/new_examples/cancellation/CMakeLists.txt
@@ -1,6 +1,3 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb")
-
 foreach(tpp cancel_group_execution1.cpp
             cancel_group_execution2.cpp
             cancel_group_execution3.cpp
diff --git a/new_examples/exception/CMakeLists.txt b/new_examples/exception/CMakeLists.txt
index 08105e42c9..444f2c077a 100644
--- a/new_examples/exception/CMakeLists.txt
+++ b/new_examples/exception/CMakeLists.txt
@@ -1,6 +1,3 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb")
-
 foreach(tpp exception_catch1.cpp
             exception_catch2.cpp
             exception_catch3.cpp)
diff --git a/new_examples/graph/CMakeLists.txt b/new_examples/graph/CMakeLists.txt
index 1c971bd436..31ce92e2c7 100644
--- a/new_examples/graph/CMakeLists.txt
+++ b/new_examples/graph/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb")
+set(CMAKE_CXX_LINKER_FLAGS "-Qtbb")
 
 foreach(tpp graph_composite_node.cpp
             graph_execute_while_building.cpp
@@ -10,6 +10,7 @@ foreach(tpp graph_composite_node.cpp
             graph_node_priorities.cpp
             graph_reestablish_order.cpp
             graph_small_nodes.cpp
+            graph_stereoscopic_3d.cpp
             graph_two_nodes.cpp 
             graph_two_nodes_deduced.cpp
             graph_with_join.cpp)
diff --git a/new_examples/intro/CMakeLists.txt b/new_examples/intro/CMakeLists.txt
index 1efda6b038..9e44001630 100644
--- a/new_examples/intro/CMakeLists.txt
+++ b/new_examples/intro/CMakeLists.txt
@@ -1,6 +1,3 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb")
-
 foreach(tpp intro_pi.cpp
             intro_pi_timing.cpp
             intro_flowgraph.cpp
diff --git a/new_examples/migration/CMakeLists.txt b/new_examples/migration/CMakeLists.txt
index d52df94a67..f489f910cb 100644
--- a/new_examples/migration/CMakeLists.txt
+++ b/new_examples/migration/CMakeLists.txt
@@ -1,7 +1,5 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb -pthread")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb -pthread")
-
-foreach(tpp migrate_task_scheduler_init.cpp
+foreach(tpp migrate_atomics.cpp
+            migrate_task_scheduler_init.cpp
             migrate_parallel_do.cpp
             migrate_priorities.cpp
             migrate_task_blocking.cpp
diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt
index ff880d251b..c87ef75921 100644
--- a/new_examples/performance_tuning/CMakeLists.txt
+++ b/new_examples/performance_tuning/CMakeLists.txt
@@ -1,17 +1,23 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb")
-set(CMAKE_CXX_LINKER_FLAGS "-Qtbb")
+if (WIN32)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb")
+  set(CMAKE_CXX_LINKER_FLAGS "-Qtbb")
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
+  set(CMAKE_CXX_LINKER_FLAGS "-tbb -lpthread") 
+endif()
 
-foreach(tpp global_control_and_implicit_arena.cpp 
-            global_control_and_explicit_arena.cpp 
-            global_control_and_implicit_conflict.cpp 
-            global_control_and_explicit_conflict.cpp 
-            priorities_and_conflict.cpp 
-            blocked_ranges_trivial.cpp
+foreach(tpp blocked_ranges_trivial.cpp
             constraints.cpp
+            global_control_and_explicit_arena.cpp
+            global_control_and_explicit_conflict.cpp
+            global_control_and_implicit_arena.cpp
+            global_control_and_implicit_conflict.cpp
+            parallel_for_addition_partitioners.cpp
             parallel_for_spin_partitioners.cpp
             parallel_for_spin_partitioners_timed.cpp
-            parallel_for_addition_partitioners.cpp
-            parallel_for_transpose_partitioners.cpp)
+            parallel_for_transpose_partitioners.cpp
+            priorities_and_conflict.cpp
+            task_scheduler_observer.cpp)
   string(REPLACE ".cpp" "" texe ${tpp})
   add_executable(${texe} ${tpp})
   target_include_directories(${texe} PUBLIC
diff --git a/new_examples/tasks/CMakeLists.txt b/new_examples/tasks/CMakeLists.txt
index 79d468791d..3a1af18ea0 100644
--- a/new_examples/tasks/CMakeLists.txt
+++ b/new_examples/tasks/CMakeLists.txt
@@ -1,6 +1,3 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb -pthread")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb -pthread")
-
 foreach(tpp parallel_invoke_fib.cpp
             task_group_fib.cpp
             task_group_poor_scaling.cpp

From 20979fabe21d3a87b0558ff39d468545a1384fb6 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 21 Nov 2024 10:52:24 -0600
Subject: [PATCH 24/34] Fixed graph CMakeLists.txt

---
 new_examples/graph/CMakeLists.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/new_examples/graph/CMakeLists.txt b/new_examples/graph/CMakeLists.txt
index 31ce92e2c7..5f2114c42e 100644
--- a/new_examples/graph/CMakeLists.txt
+++ b/new_examples/graph/CMakeLists.txt
@@ -1,6 +1,3 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb")
-set(CMAKE_CXX_LINKER_FLAGS "-Qtbb")
-
 foreach(tpp graph_composite_node.cpp
             graph_execute_while_building.cpp
             graph_fwd_substitution.cpp

From 56d7b5ff8c8950a7e89bd23bb0d8ef314aa204ae Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 21 Nov 2024 17:44:27 -0600
Subject: [PATCH 25/34] Updated partitioner examples

---
 .../parallel_for_spin_partitioners.cpp        |  6 ++----
 .../parallel_for_spin_partitioners_timed.cpp  | 19 +------------------
 2 files changed, 3 insertions(+), 22 deletions(-)

diff --git a/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp b/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp
index 5de939e84a..b2d57de755 100644
--- a/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp
+++ b/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp
@@ -23,8 +23,7 @@ template <typename Partitioner>
 void pforWork(int N, const Partitioner& p) {
   tbb::parallel_for( tbb::blocked_range<int>(0, N, 1), 
     [](const tbb::blocked_range<int>& r) {
-      int ie = r.end();
-      for (int i = r.begin(); i < ie; ++i) {
+      for (int i = r.begin(); i < r.end(); ++i) {
         doWork(i);
       }
     }, p
@@ -35,8 +34,7 @@ template <typename Partitioner>
 void pforWork(int N, Partitioner& p) {
   tbb::parallel_for( tbb::blocked_range<int>(0, N, 1), 
     [](const tbb::blocked_range<int>& r) {
-      int ie = r.end();
-      for (int i = r.begin(); i < ie; ++i) {
+      for (int i = r.begin(); i < r.end(); ++i) {
         doWork(i);
       }
     }, p
diff --git a/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp b/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp
index 64272005af..51216fabfd 100644
--- a/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp
+++ b/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp
@@ -41,8 +41,7 @@ static inline double executePfor(int num_trials, int N,
     tbb::parallel_for (
       tbb::blocked_range<int>{0, N, static_cast<size_t>(gs)},
       [tpi](const tbb::blocked_range<int>& r) {
-        int e = r.end();
-        for (int i = r.begin(); i < e; ++i) {
+        for (int i = r.begin(); i < r.end(); ++i) {
           spinWaitForAtLeast(tpi);
         } 
       }, 
@@ -53,8 +52,6 @@ static inline double executePfor(int num_trials, int N,
   return (t1 - t0).seconds()/num_trials;
 }
 
-#define CONSTRAIN_TO_ECORES 1
-
 int main() {
   tbb::auto_partitioner auto_p;
   tbb::simple_partitioner simple_p;
@@ -67,16 +64,6 @@ int main() {
   const double twenty_us = 0.00002;
   double timing[4][19];
 
-  #if CONSTRAIN_TO_ECORES
-  std::vector<tbb::core_type_id> core_types = tbb::info::core_types();
-  tbb::task_arena::constraints c;
-  c.set_core_type(core_types.front());
-  c.set_max_concurrency(tbb::info::default_concurrency(c) - 2);
-  tbb::task_arena ecore_arena(c);
-  std::cout << "Using arena with " << ecore_arena.max_concurrency() << " slots\n";
-  ecore_arena.execute([&]() {
-  #endif
-
   for (double tpi = ten_ns; tpi < twenty_us; tpi *= 10) { 
     std::cout << "Speedups for " << tpi << " seconds per iteration" << std::endl
               << "partitioner";
@@ -103,10 +90,6 @@ int main() {
     std::cout << std::endl;
   }
 
-  #if CONSTRAIN_TO_ECORES
-  });
-  #endif
-
   return 0;
 }
 

From 466cbbaa6ae348e1242da7891fb4ed9f1ebc8d38 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 21 Nov 2024 18:14:05 -0600
Subject: [PATCH 26/34] Renamed files

---
 new_examples/performance_tuning/CMakeLists.txt                | 4 ++--
 ...or_spin_partitioners.cpp => parallel_for_partitioners.cpp} | 0
 ...itioners_timed.cpp => parallel_for_partitioners_timed.cpp} | 0
 new_examples/performance_tuning/task_scheduler_observer.cpp   | 1 +
 4 files changed, 3 insertions(+), 2 deletions(-)
 rename new_examples/performance_tuning/{parallel_for_spin_partitioners.cpp => parallel_for_partitioners.cpp} (100%)
 rename new_examples/performance_tuning/{parallel_for_spin_partitioners_timed.cpp => parallel_for_partitioners_timed.cpp} (100%)

diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt
index c87ef75921..e5f7c92da6 100644
--- a/new_examples/performance_tuning/CMakeLists.txt
+++ b/new_examples/performance_tuning/CMakeLists.txt
@@ -13,8 +13,8 @@ foreach(tpp blocked_ranges_trivial.cpp
             global_control_and_implicit_arena.cpp
             global_control_and_implicit_conflict.cpp
             parallel_for_addition_partitioners.cpp
-            parallel_for_spin_partitioners.cpp
-            parallel_for_spin_partitioners_timed.cpp
+            parallel_for_partitioners.cpp
+            parallel_for_partitioners_timed.cpp
             parallel_for_transpose_partitioners.cpp
             priorities_and_conflict.cpp
             task_scheduler_observer.cpp)
diff --git a/new_examples/performance_tuning/parallel_for_spin_partitioners.cpp b/new_examples/performance_tuning/parallel_for_partitioners.cpp
similarity index 100%
rename from new_examples/performance_tuning/parallel_for_spin_partitioners.cpp
rename to new_examples/performance_tuning/parallel_for_partitioners.cpp
diff --git a/new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp b/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp
similarity index 100%
rename from new_examples/performance_tuning/parallel_for_spin_partitioners_timed.cpp
rename to new_examples/performance_tuning/parallel_for_partitioners_timed.cpp
diff --git a/new_examples/performance_tuning/task_scheduler_observer.cpp b/new_examples/performance_tuning/task_scheduler_observer.cpp
index 8fbb367146..b6f319eac5 100644
--- a/new_examples/performance_tuning/task_scheduler_observer.cpp
+++ b/new_examples/performance_tuning/task_scheduler_observer.cpp
@@ -15,6 +15,7 @@
 */
 
 #include <iostream>
+#include <sstream>
 #include <thread>
 #include <tbb/tbb.h>
 

From b8cba8cf9a74e8d775d037b7a056fdf6674bd71c Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 21 Nov 2024 21:54:00 -0600
Subject: [PATCH 27/34] Added constraints for more reproducible results

---
 .../parallel_for_partitioners_timed.cpp       |  80 +++++++-----
 .../parallel_for_transpose_partitioners.cpp   | 117 +++++++++---------
 2 files changed, 105 insertions(+), 92 deletions(-)

diff --git a/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp b/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp
index 51216fabfd..7155b0caa9 100644
--- a/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp
+++ b/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp
@@ -53,42 +53,56 @@ static inline double executePfor(int num_trials, int N,
 }
 
 int main() {
-  tbb::auto_partitioner auto_p;
-  tbb::simple_partitioner simple_p;
-  tbb::static_partitioner static_p;
-  const std::string pname[4] = {"simple", "auto", "affinity", "static"};
-
-  const int N = 262144;
-  const int T = 20;
-  const double ten_ns = 0.00000001;
-  const double twenty_us = 0.00002;
-  double timing[4][19];
-
-  for (double tpi = ten_ns; tpi < twenty_us; tpi *= 10) { 
-    std::cout << "Speedups for " << tpi << " seconds per iteration" << std::endl
-              << "partitioner";
-    for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) 
-      std::cout << ", " << gs;
-    std::cout << std::endl;
-
-    double serial_time = executeFor(T, N, tpi);
-
-    for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) {
-      tbb::affinity_partitioner affinity_p;
-      spinWaitForAtLeast(0.001);
-      timing[0][i] = executePfor(T, N, gs, simple_p, tpi);
-      timing[1][i] = executePfor(T, N, gs, auto_p, tpi);
-      timing[2][i] = executePfor(T, N, gs, affinity_p, tpi);
-      timing[3][i] = executePfor(T, N, gs, static_p, tpi);
-    }
-    for (int p = 0; p < 4; ++p) {
-      std::cout << pname[p];  
+  // use the most performant cores
+  // only a single NUMA node
+  // and only 1 thread per core
+  tbb::task_arena::constraints c;
+  c.set_numa_id(tbb::info::numa_nodes()[0]);
+  c.set_core_type(tbb::info::core_types().back());
+  c.set_max_threads_per_core(1);
+  c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c)));
+  tbb::task_arena a(c);
+
+  std::cout << "Using an arena with " << a.max_concurrency() << " slots\n";
+
+  a.execute([&]() {
+    tbb::auto_partitioner auto_p;
+    tbb::simple_partitioner simple_p;
+    tbb::static_partitioner static_p;
+    const std::string pname[4] = {"simple", "auto", "affinity", "static"};
+
+    const int N = 262144;
+    const int T = 20;
+    const double ten_ns = 0.00000001;
+    const double twenty_us = 0.00002;
+    double timing[4][19];
+
+    for (double tpi = ten_ns; tpi < twenty_us; tpi *= 10) { 
+      std::cout << "Speedups for " << tpi << " seconds per iteration" << std::endl
+                << "partitioner";
       for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) 
-        std::cout << ", " << serial_time/timing[p][i];
+        std::cout << ", " << gs;
+      std::cout << std::endl;
+
+      double serial_time = executeFor(T, N, tpi);
+
+      for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) {
+        tbb::affinity_partitioner affinity_p;
+        spinWaitForAtLeast(0.001);
+        timing[0][i] = executePfor(T, N, gs, simple_p, tpi);
+        timing[1][i] = executePfor(T, N, gs, auto_p, tpi);
+        timing[2][i] = executePfor(T, N, gs, affinity_p, tpi);
+        timing[3][i] = executePfor(T, N, gs, static_p, tpi);
+      }
+      for (int p = 0; p < 4; ++p) {
+        std::cout << pname[p];  
+        for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) 
+          std::cout << ", " << serial_time/timing[p][i];
+        std::cout << std::endl;
+      }
       std::cout << std::endl;
     }
-    std::cout << std::endl;
-  }
+  });
 
   return 0;
 }
diff --git a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
index 45f68f4391..734bbd752b 100644
--- a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
+++ b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
@@ -15,7 +15,7 @@
 */
 
 #include <tbb/tbb.h>
-#include <iostream>
+
 
 double serialTranspose(int N, double *a, double *b) {
   tbb::tick_count t0 = tbb::tick_count::now();
@@ -99,71 +99,70 @@ double pforTranspose2d(int N, double *a, double *b, int gs) {
 void setArray(int N, double *a);
 void checkTranspose(int N, double *a);
 
-#define CONSTRAIN_TO_ECORES 1
+#include <iostream>
 
 int main() {
-  int N = 2<<12; // 8192
-  double *a = new double[N*N];
-  double *b = new double[N*N];
-  setArray(N, a);
-  setArray(N, b);
-
-  #if CONSTRAIN_TO_ECORES
-  std::vector<tbb::core_type_id> core_types = tbb::info::core_types();
+  // use the most performant cores
+  // only a single NUMA node
+  // and only 1 thread per core
   tbb::task_arena::constraints c;
-  c.set_core_type(core_types.front());
-  c.set_max_concurrency(tbb::info::default_concurrency(c) - 2);
-  tbb::task_arena ecore_arena(c);
-  std::cout << "Using arena with " << ecore_arena.max_concurrency() << " slots\n";
-  ecore_arena.execute([&]() {
-  #endif
-
-  serialTranspose(N, a, b);
-  double ts = serialTranspose(N, a, b);
-  checkTranspose(N, b);
-  std::cout << "Serial Time = " << ts << std::endl;
-
-  std::cout << "Parallel Times:" << std::endl
-            << "grainsize, oblivious, 1d auto, 1d simple, 2d auto, 2d simple" << std::endl;
-  for (int gs = 1; gs <= N; gs *= 2) {
-    setArray(N, a);
-    setArray(N, b);
-    serialObliviousTranspose(N, a, b, gs);
-    double to = serialObliviousTranspose(N, a, b, gs);
-    checkTranspose(N, b);
-
-    setArray(N, a);
-    setArray(N, b);
-    pforTranspose<tbb::auto_partitioner>(N, a, b, gs);
-    double t1d_auto = pforTranspose<tbb::auto_partitioner>(N, a, b, gs);
-
+  c.set_numa_id(tbb::info::numa_nodes()[0]);
+  c.set_core_type(tbb::info::core_types().back());
+  c.set_max_threads_per_core(1);
+  c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c)));
+  tbb::task_arena a(c);
+
+  std::cout << "Using an arena with " << a.max_concurrency() << " slots\n";
+
+  a.execute([&]() {
+    int N = 2<<12; // 8192
+    double *a = new double[N*N];
+    double *b = new double[N*N];
     setArray(N, a);
     setArray(N, b);
-    pforTranspose<tbb::simple_partitioner>(N, a, b, gs);
-    double t1d_simple = pforTranspose<tbb::simple_partitioner>(N, a, b, gs);
 
-    setArray(N, a);
-    setArray(N, b);
-    pforTranspose2d<tbb::auto_partitioner>(N, a, b, gs);
-    double t2d_auto = pforTranspose2d<tbb::auto_partitioner>(N, a, b, gs);
-
-    setArray(N, a);
-    setArray(N, b);
-    pforTranspose2d<tbb::simple_partitioner>(N, a, b, gs);
-    double t2d_simple = pforTranspose2d<tbb::simple_partitioner>(N, a, b, gs);
-
-    std::cout << gs 
-              << ", " << to 
-              << ", " << t1d_auto 
-              << ", " << t1d_simple 
-              << ", " << t2d_auto
-              << ", " << t2d_simple << std::endl;
-  }
-
-  #if CONSTRAIN_TO_ECORES
+    serialTranspose(N, a, b);
+    double ts = serialTranspose(N, a, b);
+    checkTranspose(N, b);
+    std::cout << "Serial Time = " << ts << std::endl;
+
+    std::cout << "Parallel Times:" << std::endl
+              << "grainsize, oblivious, 1d auto, 1d simple, 2d auto, 2d simple" << std::endl;
+    for (int gs = 1; gs <= N; gs *= 2) {
+      setArray(N, a);
+      setArray(N, b);
+      serialObliviousTranspose(N, a, b, gs);
+      double to = serialObliviousTranspose(N, a, b, gs);
+      checkTranspose(N, b);
+
+      setArray(N, a);
+      setArray(N, b);
+      pforTranspose<tbb::auto_partitioner>(N, a, b, gs);
+      double t1d_auto = pforTranspose<tbb::auto_partitioner>(N, a, b, gs);
+
+      setArray(N, a);
+      setArray(N, b);
+      pforTranspose<tbb::simple_partitioner>(N, a, b, gs);
+      double t1d_simple = pforTranspose<tbb::simple_partitioner>(N, a, b, gs);
+
+      setArray(N, a);
+      setArray(N, b);
+      pforTranspose2d<tbb::auto_partitioner>(N, a, b, gs);
+      double t2d_auto = pforTranspose2d<tbb::auto_partitioner>(N, a, b, gs);
+
+      setArray(N, a);
+      setArray(N, b);
+      pforTranspose2d<tbb::simple_partitioner>(N, a, b, gs);
+      double t2d_simple = pforTranspose2d<tbb::simple_partitioner>(N, a, b, gs);
+
+      std::cout << gs 
+                << ", " << to 
+                << ", " << t1d_auto 
+                << ", " << t1d_simple 
+                << ", " << t2d_auto
+                << ", " << t2d_simple << std::endl;
+    }
   });
-  #endif
-
   return 0;
 }
 

From ee924fd082ff248d090950d3c215cd38fa2b784f Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 21 Nov 2024 22:05:54 -0600
Subject: [PATCH 28/34] Removed unneeded temporaries

---
 .../parallel_for_transpose_partitioners.cpp         | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
index 734bbd752b..323a43de3b 100644
--- a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
+++ b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
@@ -64,8 +64,7 @@ double pforTranspose(int N, double *a, double *b, int gs) {
    tbb::tick_count t0 = tbb::tick_count::now();
    tbb::parallel_for( tbb::blocked_range<int>(0, N, gs),
      [N, a, b](const tbb::blocked_range<int>& r) {
-       int ie = r.end();
-       for (int i = r.begin(); i < ie; ++i) {
+       for (int i = r.begin(); i < r.end(); ++i) {
          for (int j = 0; j < N; ++j) {
            b[j*N+i] = a[i*N+j];
          }
@@ -80,13 +79,11 @@ template<typename P>
 double pforTranspose2d(int N, double *a, double *b, int gs) {
   tbb::tick_count t0 = tbb::tick_count::now();
   tbb::parallel_for( tbb::blocked_range2d<int,int>{
-        0, N, static_cast<size_t>(gs), 0, 
-        N, static_cast<size_t>(gs)},
+                        0, N, static_cast<size_t>(gs), 
+                        0, N, static_cast<size_t>(gs)},
     [N, a, b](const tbb::blocked_range2d<int,int>& r) {
-      int ie = r.rows().end();
-      int je = r.cols().end();
-      for (int i = r.rows().begin(); i < ie; ++i) {
-        for (int j = r.cols().begin(); j < je; ++j) {
+      for (int i = r.rows().begin(); i < r.rows().end(); ++i) {
+        for (int j = r.cols().begin(); j < r.cols().end(); ++j) {
           b[j*N+i] = a[i*N+j];
         }
       }

From 51ad39e5b5f16ca58176d8cab8ded748214c9fa1 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Thu, 21 Nov 2024 23:43:45 -0600
Subject: [PATCH 29/34] Added arena to addition sample

---
 .../parallel_for_addition_partitioners.cpp    | 101 +++++++++---------
 1 file changed, 51 insertions(+), 50 deletions(-)

diff --git a/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp
index 49333b9bc6..eb8001cc8b 100644
--- a/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp
+++ b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp
@@ -48,61 +48,62 @@ void resetA(int N, double *a);
 static void warmupTBB();
 
 int main(int argc, char *argv[]) {
-  int M = 10000;
-  int N = 100000;
-
-  std::cout << "P = " << tbb::info::default_concurrency()
-            << std::endl << "N = " << N 
-            << std::endl << "M = " << M << std::endl;
-
-  #define CONSTRAIN_TO_FEWER_CORES 0
-  #if CONSTRAIN_TO_FEWER_CORES
+  // use the most performant cores
+  // only a single NUMA node
+  // and only 1 thread per core
   tbb::task_arena::constraints c;
-  c.set_max_concurrency(tbb::info::default_concurrency() - 2);
-  tbb::task_arena cores_arena(c);
-  std::cout << "Using arena with " << cores_arena.max_concurrency() << " slots\n";
-  cores_arena.execute([&]() {
-  #endif
-
-   double *v = new double[M];
-   double *a = new double[N]; 
-
-   warmupTBB();
-   resetV(M, v);
-   resetA(N, a);
-   tbb::tick_count t0 = tbb::tick_count::now();
-   for (int i = 0; i < M; ++i) {
-     parForAdd(v[i], N, a, tbb::auto_partitioner{});
-   }
-   double auto_time = (tbb::tick_count::now() - t0).seconds();
-
-   warmupTBB();
-   resetA(N, a);
-   tbb::affinity_partitioner aff_p;
-   t0 = tbb::tick_count::now();
-   for (int i = 0; i < M; ++i) {
-     parForAdd(v[i], N, a, aff_p); 
-   }
-   double affinity_time = (tbb::tick_count::now() - t0).seconds();
-
-   warmupTBB();
-   resetA(N, a);
-   t0 = tbb::tick_count::now();
-   for (int i = 0; i < M; ++i) {
-     parForAdd(v[i], N, a, tbb::static_partitioner{});
-  }
-  double static_time = (tbb::tick_count::now() - t0).seconds();
+  c.set_numa_id(tbb::info::numa_nodes()[0]);
+  c.set_core_type(tbb::info::core_types().back());
+  c.set_max_threads_per_core(1);
+  c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c)));
+  tbb::task_arena arena(c);
+
+  std::cout << "Using an arena with " << arena.max_concurrency() << " slots\n";
+
+  arena.execute([&]() {
+    int M = 10000;
+    int N = 100000;
+
+    std::cout << "P = " << tbb::info::default_concurrency()
+              << std::endl << "N = " << N 
+              << std::endl << "M = " << M << std::endl;
+
+    double *v = new double[M];
+    double *a = new double[N]; 
+
+    warmupTBB();
+    resetV(M, v);
+    resetA(N, a);
+    tbb::tick_count t0 = tbb::tick_count::now();
+    for (int i = 0; i < M; ++i) {
+      parForAdd(v[i], N, a, tbb::auto_partitioner{});
+    }
+    double auto_time = (tbb::tick_count::now() - t0).seconds();
+
+    warmupTBB();
+    resetA(N, a);
+    tbb::affinity_partitioner aff_p;
+    t0 = tbb::tick_count::now();
+    for (int i = 0; i < M; ++i) {
+      parForAdd(v[i], N, a, aff_p); 
+    }
+    double affinity_time = (tbb::tick_count::now() - t0).seconds();
 
-  std::cout << "auto_partitioner = " << auto_time << std::endl
-            << "affinity_partitioner = " << affinity_time << std::endl
-            << "static_partitioner = " << static_time << std::endl;
+    warmupTBB();
+    resetA(N, a);
+    t0 = tbb::tick_count::now();
+    for (int i = 0; i < M; ++i) {
+      parForAdd(v[i], N, a, tbb::static_partitioner{});
+    }
+    double static_time = (tbb::tick_count::now() - t0).seconds();
 
-  delete [] v;
-  delete [] a;
+    std::cout << "auto_partitioner = " << auto_time << std::endl
+              << "affinity_partitioner = " << affinity_time << std::endl
+              << "static_partitioner = " << static_time << std::endl;
 
-    #if CONSTRAIN_TO_FEWER_CORES
+    delete [] v;
+    delete [] a;
   });
-  #endif
 
   return 0;
 }

From 67de46a8bb8c59483fb14def122431c30e4be568 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Fri, 22 Nov 2024 00:18:35 -0600
Subject: [PATCH 30/34] Added imbalanced loop example

---
 .../performance_tuning/CMakeLists.txt         |   1 +
 .../parallel_for_addition_partitioners.cpp    |   6 +-
 .../partitioners_imbalanced_loops.cpp         | 111 ++++++++++++++++++
 3 files changed, 114 insertions(+), 4 deletions(-)
 create mode 100644 new_examples/performance_tuning/partitioners_imbalanced_loops.cpp

diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt
index e5f7c92da6..02edeab578 100644
--- a/new_examples/performance_tuning/CMakeLists.txt
+++ b/new_examples/performance_tuning/CMakeLists.txt
@@ -16,6 +16,7 @@ foreach(tpp blocked_ranges_trivial.cpp
             parallel_for_partitioners.cpp
             parallel_for_partitioners_timed.cpp
             parallel_for_transpose_partitioners.cpp
+            partitioners_imbalanced_loops.cpp
             priorities_and_conflict.cpp
             task_scheduler_observer.cpp)
   string(REPLACE ".cpp" "" texe ${tpp})
diff --git a/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp
index eb8001cc8b..30f995e269 100644
--- a/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp
+++ b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp
@@ -23,8 +23,7 @@ template <typename Partitioner>
 void parForAdd(double v, int N, double *a, const Partitioner& p) {
   tbb::parallel_for( tbb::blocked_range<int>(0, N, 1), 
     [v, a](const tbb::blocked_range<int>& r) {
-      int ie = r.end();
-      for (int i = r.begin(); i < ie; ++i) {
+      for (int i = r.begin(); i < r.end(); ++i) {
         a[i] += v;
       }
     }, p
@@ -35,8 +34,7 @@ template <typename Partitioner>
 void parForAdd(double v, int N, double *a, Partitioner& p) {
   tbb::parallel_for( tbb::blocked_range<int>(0, N, 1), 
     [v, a](const tbb::blocked_range<int>& r) {
-      int ie = r.end();
-      for (int i = r.begin(); i < ie; ++i) {
+      for (int i = r.begin(); i < r.end(); ++i) {
         a[i] += v;
       }
     }, p
diff --git a/new_examples/performance_tuning/partitioners_imbalanced_loops.cpp b/new_examples/performance_tuning/partitioners_imbalanced_loops.cpp
new file mode 100644
index 0000000000..bcedeba6df
--- /dev/null
+++ b/new_examples/performance_tuning/partitioners_imbalanced_loops.cpp
@@ -0,0 +1,111 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <iostream>
+#include <tbb/tbb.h>
+
+void doWork(double sec);
+
+template <typename Partitioner>
+void buildingWork(int N, const Partitioner& p) {
+  tbb::parallel_for( tbb::blocked_range<int>(0, N, 1), 
+    [](const tbb::blocked_range<int>& r) {
+      for (int i = r.begin(); i < r.end(); ++i) {
+        doWork(i);
+      }
+    }, p
+  );
+}
+
+template <typename Partitioner>
+void buildingWork(int N, Partitioner& p) {
+  tbb::parallel_for( tbb::blocked_range<int>(0, N, 1), 
+    [](const tbb::blocked_range<int>& r) {
+      for (int i = r.begin(); i <  r.end(); ++i) {
+        doWork(i);
+      }
+    }, p
+  );
+}
+
+static void warmupTBB() {
+  // This is a simple loop that should get workers started.
+  // oneTBB creates workers lazily on first use of the library
+  // so this hides the startup time when looking at trivial
+  // examples that do little real work. 
+  tbb::parallel_for(0, tbb::info::default_concurrency(), 
+    [=](int) {
+      tbb::tick_count t0 = tbb::tick_count::now();
+      while ((tbb::tick_count::now() - t0).seconds() < 0.01);
+    }
+  );
+}
+
+void doWork(double usec) {
+  double sec = usec*1e-06;
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() <= sec);
+}
+
+int main(int argc, char *argv[]) {
+  // use the most performant cores
+  // only a single NUMA node
+  // and only 1 thread per core
+  tbb::task_arena::constraints c;
+  c.set_numa_id(tbb::info::numa_nodes()[0]);
+  c.set_core_type(tbb::info::core_types().back());
+  c.set_max_threads_per_core(1);
+  c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c)));
+  tbb::task_arena a(c);
+
+  std::cout << "Using an arena with " << a.max_concurrency() << " slots\n";
+
+  a.execute([&]() {
+    int N = 1000;
+    int M = 10;
+
+    std::cout << std::endl << "M = " << M
+              << std::endl << "N = " << N << std::endl;
+
+    warmupTBB();
+    tbb::tick_count t0 = tbb::tick_count::now();
+    for (int i = 0; i < M; ++i) {
+      buildingWork(N, tbb::auto_partitioner{});
+    }
+    double auto_time = (tbb::tick_count::now() - t0).seconds();
+
+    warmupTBB();
+    tbb::affinity_partitioner aff_p;
+    t0 = tbb::tick_count::now();
+    for (int i = 0; i < M; ++i) {
+      buildingWork(N, aff_p); 
+    }
+    double affinity_time = (tbb::tick_count::now() - t0).seconds();
+
+    warmupTBB();
+    t0 = tbb::tick_count::now();
+    for (int i = 0; i < M; ++i) {
+      buildingWork(N, tbb::static_partitioner{});
+    }
+    double static_time = (tbb::tick_count::now() - t0).seconds();
+
+    std::cout << "auto_partitioner = " << auto_time << " seconds" << std::endl
+              << "affinity_partitioner = " << affinity_time << " seconds" << std::endl
+              << "static_partitioner = " << static_time << " seconds" << std::endl;
+  });
+  return 0;
+}
+

From dae6cd69320e1183186ddd7b87e8808a08f35225 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Fri, 22 Nov 2024 01:02:25 -0600
Subject: [PATCH 31/34] Added deterministic reduce

---
 .../performance_tuning/CMakeLists.txt         |   1 +
 ...llel_deterministic_reduce_partitioners.cpp | 136 ++++++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp

diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt
index 02edeab578..81e902a417 100644
--- a/new_examples/performance_tuning/CMakeLists.txt
+++ b/new_examples/performance_tuning/CMakeLists.txt
@@ -13,6 +13,7 @@ foreach(tpp blocked_ranges_trivial.cpp
             global_control_and_implicit_arena.cpp
             global_control_and_implicit_conflict.cpp
             parallel_for_addition_partitioners.cpp
+            parallel_deterministic_reduce_partitioners.cpp
             parallel_for_partitioners.cpp
             parallel_for_partitioners_timed.cpp
             parallel_for_transpose_partitioners.cpp
diff --git a/new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp b/new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp
new file mode 100644
index 0000000000..bef0d67a40
--- /dev/null
+++ b/new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp
@@ -0,0 +1,136 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <iostream>
+#include <math.h>
+#include <tbb/tbb.h>
+
+double serialPiExample(int num_intervals) {
+  double dx = 1.0 / num_intervals;
+  double sum = 0.0;
+  for (int i = 0; i < num_intervals; ++i) {
+    double x = (i+0.5)*dx;
+    double h = sqrt(1-x*x);
+    sum += h*dx;
+  }
+  return 4 * sum;
+}
+
+template< typename Partitioner >
+double reducePiExample(int num_intervals, int grainsize) {
+  double dx = 1.0 / num_intervals;
+  double sum = tbb::parallel_reduce(
+    /* range = */ tbb::blocked_range<int>(0, num_intervals, grainsize), 
+    /* identity = */ 0.0,
+    /* func */ 
+    [=](const tbb::blocked_range<int>& r, double init) -> double {
+      for (int i = r.begin(); i != r.end(); ++i) {
+        double x = (i + 0.5)*dx;
+        double h = sqrt(1 - x*x);
+        init += h*dx;
+      }
+      return init;
+    },
+    /* reduction */
+    [](double x, double y) -> double {
+        return x + y;
+    }, 
+    /* partitioner */ Partitioner()
+  );
+  return 4 * sum;
+}
+
+template< typename Partitioner >
+double deterministicReducePiExample(int num_intervals, int grainsize) {
+  double dx = 1.0 / num_intervals;
+  double sum = tbb::parallel_deterministic_reduce(
+    /* range = */ tbb::blocked_range<int>(0, num_intervals, grainsize), 
+    /* identity = */ 0.0,
+    /* func */ 
+    [=](const tbb::blocked_range<int>& r, double init) -> double {
+      for (int i = r.begin(); i != r.end(); ++i) {
+        double x = (i + 0.5)*dx;
+        double h = sqrt(1 - x*x);
+        init += h*dx;
+      }
+      return init;
+    },
+    /* reduction */
+    [](double x, double y) -> double {
+      return x + y;
+    },
+    /* partitioner */ Partitioner()
+  );
+  return 4 * sum;
+}
+
+static void warmupTBB() {
+  tbb::parallel_for(0, tbb::info::default_concurrency(), [](int) {
+    tbb::tick_count t0 = tbb::tick_count::now();
+    while ((tbb::tick_count::now() - t0).seconds() < 0.01);
+  });
+}
+
+int main() {
+  // use the most performant cores
+  // only a single NUMA node
+  // and only 1 thread per core
+  tbb::task_arena::constraints c;
+  c.set_numa_id(tbb::info::numa_nodes()[0]);
+  c.set_core_type(tbb::info::core_types().back());
+  c.set_max_threads_per_core(1);
+  c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c)));
+  tbb::task_arena a(c);
+
+  std::cout << "Using an arena with " << a.max_concurrency() << " slots\n";
+
+  a.execute([&]() {
+    int num_intervals = 1<<26;
+    tbb::tick_count ts_0 = tbb::tick_count::now();
+    double spi = serialPiExample(num_intervals);
+    tbb::tick_count ts_1 = tbb::tick_count::now();
+    double serial_time = (ts_1 - ts_0).seconds();
+    std::cout << "serial, " << spi << ", " << serial_time << std::endl;
+    warmupTBB();
+    std::cout << "speedups relative to serial:" << std::endl;
+    std::cout << "gs, r-simple, d-simple, r-static, d-static, r-auto" << std::endl;
+    for (int gs = 1; gs <= num_intervals; gs *= 2) {
+        reducePiExample<tbb::auto_partitioner>(num_intervals, gs);
+        tbb::tick_count t0 = tbb::tick_count::now();
+        double v0 = reducePiExample<tbb::auto_partitioner>(num_intervals, gs);
+        tbb::tick_count t1 = tbb::tick_count::now();
+        double v1 = reducePiExample<tbb::simple_partitioner>(num_intervals, gs);
+        tbb::tick_count t2 = tbb::tick_count::now();
+        double v2 = reducePiExample<tbb::static_partitioner>(num_intervals, gs);
+        tbb::tick_count t3 = tbb::tick_count::now();
+        double v3 = deterministicReducePiExample<tbb::simple_partitioner>(num_intervals, gs);
+        tbb::tick_count t4 = tbb::tick_count::now();
+        double v4 = deterministicReducePiExample<tbb::static_partitioner>(num_intervals, gs);
+        tbb::tick_count t5 = tbb::tick_count::now();
+        std::cout << v0 << ", " << v1 << ", " << v2 
+                  << ", " << v3 << ", " << v4 << "\n";
+        std::cout << gs 
+                << ", " << serial_time / (t2-t1).seconds()
+                << ", " << serial_time / (t4-t3).seconds()
+                << ", " << serial_time / (t3-t2).seconds()
+                << ", " << serial_time / (t5-t4).seconds() 
+                << ", " << serial_time / (t1-t0).seconds()
+                << std::endl;
+    }
+  });
+  return 0;
+}
+

From 80b4ede9c134b2e8bf5ea51309fadd2660573ce3 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Fri, 22 Nov 2024 19:22:25 -0600
Subject: [PATCH 32/34] Added pipeline benchmark

---
 .../performance_tuning/CMakeLists.txt         |   1 +
 .../parallel_pipeline_timed.cpp               | 198 ++++++++++++++++++
 2 files changed, 199 insertions(+)
 create mode 100644 new_examples/performance_tuning/parallel_pipeline_timed.cpp

diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt
index 81e902a417..a48b218af5 100644
--- a/new_examples/performance_tuning/CMakeLists.txt
+++ b/new_examples/performance_tuning/CMakeLists.txt
@@ -17,6 +17,7 @@ foreach(tpp blocked_ranges_trivial.cpp
             parallel_for_partitioners.cpp
             parallel_for_partitioners_timed.cpp
             parallel_for_transpose_partitioners.cpp
+            parallel_pipeline_timed.cpp
             partitioners_imbalanced_loops.cpp
             priorities_and_conflict.cpp
             task_scheduler_observer.cpp)
diff --git a/new_examples/performance_tuning/parallel_pipeline_timed.cpp b/new_examples/performance_tuning/parallel_pipeline_timed.cpp
new file mode 100644
index 0000000000..ae9d5dadbd
--- /dev/null
+++ b/new_examples/performance_tuning/parallel_pipeline_timed.cpp
@@ -0,0 +1,198 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <iostream>
+#include <memory>
+#include <vector>
+#include <tbb/tbb.h>
+
+void doWork(double sec) {
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() <= sec);
+}
+
+tbb::filter<int,int> makeMiddleFilters(int num_filters, tbb::filter_mode m, double usec) {
+  auto f = tbb::make_filter<int, int>(m,
+                                      [=](int i) -> int {
+                                        doWork(usec);
+                                        return i;
+                                      }); 
+  if (num_filters > 1)
+    return f & makeMiddleFilters(num_filters-1, m, usec);
+  else
+    return f;
+}
+
+tbb::filter<void,void> buildFilterChain(std::atomic<int>& counter, std::vector<int>& data, 
+                                        int num_filters, tbb::filter_mode m, double usec) {
+  counter = data.size() - 1;
+
+  tbb::filter<int, int> middle{};
+  tbb::filter<int,void> end{};
+
+  if (num_filters > 1) {
+    tbb::filter<void,int> start = 
+      tbb::make_filter<void, int>(m,
+                                  [&counter, &data, usec](tbb::flow_control& fc) -> int {
+                                    int i = counter--;
+                                    if (i >= 0) {  // indices size-1..0: emit every item, matching the serial baseline
+                                      doWork(usec);
+                                      return data[i];
+                                    } else {
+                                      fc.stop();
+                                      return 0;
+                                    }
+                                  });
+
+
+      tbb::filter<int, void> end = 
+        tbb::make_filter<int, void>(m,
+                                    [=](int i) {
+                                      doWork(usec);
+                                    });
+
+      if (num_filters > 2) {
+        tbb::filter<int, int> middle = makeMiddleFilters(num_filters-2, m, usec);
+        return start & middle & end;
+      } else {
+        return start & end;
+      }
+  } else {
+    return tbb::make_filter<void, void>(m,
+                                        [&counter, usec](tbb::flow_control& fc) {  // usec by value: the filter runs after this function returns
+                                          int i = counter--;
+                                          if (i >= 0) {  // same item count as the multi-filter chain
+                                            doWork(usec);
+                                          } else {
+                                            fc.stop();
+                                          }
+                                        });
+  }
+}
+
+double runSerial(std::vector<int>& data, int num_filters, double usec, int num_items) {
+  tbb::tick_count t0 = tbb::tick_count::now();
+  for (int i = 0; i < num_items; ++i) {
+    for (int j = 0; j < num_filters; ++j) {
+      doWork(usec);
+    }
+  }
+  double total_time = (tbb::tick_count::now() - t0).seconds();
+  return total_time;   
+}
+
+std::vector<int> makeVector(int N);
+static void warmupTBB();
+
+std::vector<tbb::filter_mode> modes = {tbb::filter_mode::serial_in_order, 
+                                        tbb::filter_mode::serial_out_of_order, 
+                                        tbb::filter_mode::parallel };
+std::vector<std::string> mode_name = {"serial_in_order", 
+                                      "serial_out_of_order", 
+                                      "parallel"};
+
+std::vector<double> spin_times = {1e-7, 1e-6, 1e-5, 1e-4};
+
+void runEightFiltersOneTokenBalanced(int N, std::vector<int>& data) {
+  int num_filters = 8;
+
+  std::cout << "Varying the mode and spin_time for 8 filters:\n";
+
+  // Balanced serial time
+  std::vector<double> serial_time;
+  for (double spin_time : spin_times) {
+    serial_time.push_back(runSerial(data, num_filters, spin_time, N));
+    std::cout << serial_time.back() << ",";
+  }
+
+  std::cout << "\nmode";
+  for (double spin_time : spin_times) 
+    std::cout << ", " << spin_time;
+  std::cout << std::endl;
+
+  for (int m = 0; m < modes.size(); ++m) {
+    std::atomic<int> counter = 0;
+    std::cout << mode_name[m];
+    for (int st = 0; st < spin_times.size(); ++st) {
+      double spin_time = spin_times[st];
+
+      auto chain =
+        buildFilterChain(counter, data, num_filters, modes[m], spin_time);
+
+      warmupTBB();  // warm up before starting the clock so the 10 ms spin is not timed
+      tbb::tick_count t0 = tbb::tick_count::now();
+      tbb::parallel_pipeline(1, chain);
+      double t = (tbb::tick_count::now() - t0).seconds();
+      std::cout << ", " << serial_time[st]/t;
+    }
+    std::cout << std::endl; 
+
+
+  }
+}
+
+void runEightTokensIncreasingFiltersBalanced() {
+}
+
+void runEightFiltersIncreasingTokensBalanced() {
+}
+
+void runEightFiltersImbalanced() {
+}
+
+int main() {
+  // use the most performant cores
+  // only a single NUMA node
+  // and only 1 thread per core
+  tbb::task_arena::constraints c;
+  c.set_numa_id(tbb::info::numa_nodes()[0]);
+  c.set_core_type(tbb::info::core_types().back());  // core_types() is ordered least to most performant
+  c.set_max_threads_per_core(1);
+  c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c)));
+  tbb::task_arena a(c);
+
+  std::cout << "Using an arena with " << a.max_concurrency() << " slots\n";
+
+  a.execute([&]() {
+    int N = 8000;
+    std::vector<int> data = makeVector(N);
+    runEightFiltersOneTokenBalanced(N, data);
+  });
+
+  return 0;
+}
+
+std::vector<int> makeVector(int N) {
+   std::vector<int> v;
+   v.reserve(N);
+   for (int i = 0; i < N; ++i) {
+     v.emplace_back(i);
+   }
+   return v;
+}
+
+static void warmupTBB() {
+  // This is a simple loop that should get workers started.
+  // oneTBB creates workers lazily on first use of the library
+  // so this hides the startup time when looking at trivial
+  // examples that do little real work. 
+  tbb::parallel_for(0, tbb::this_task_arena::max_concurrency(), 
+    [=](int) {
+      tbb::tick_count t0 = tbb::tick_count::now();
+      while ((tbb::tick_count::now() - t0).seconds() < 0.01);
+    }
+  );
+}

From a418fa2c045aec68d5079cdfc674401cb75ed4b3 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Fri, 22 Nov 2024 20:17:29 -0600
Subject: [PATCH 33/34] Added other pipeline test cases

---
 .../parallel_pipeline_timed.cpp               | 136 +++++++++++++++---
 1 file changed, 120 insertions(+), 16 deletions(-)

diff --git a/new_examples/performance_tuning/parallel_pipeline_timed.cpp b/new_examples/performance_tuning/parallel_pipeline_timed.cpp
index ae9d5dadbd..5e66274627 100644
--- a/new_examples/performance_tuning/parallel_pipeline_timed.cpp
+++ b/new_examples/performance_tuning/parallel_pipeline_timed.cpp
@@ -24,24 +24,29 @@ void doWork(double sec) {
   while ((tbb::tick_count::now() - t0).seconds() <= sec);
 }
 
-tbb::filter<int,int> makeMiddleFilters(int num_filters, tbb::filter_mode m, double usec) {
+tbb::filter<int,int> makeMiddleFilters(int num_filters, tbb::filter_mode m, double usec,
+                                       int imbalanced_filter = -1, double imbalance_factor = 1) {
+  double w = (imbalanced_filter == num_filters) ? usec*imbalance_factor : usec;
+
   auto f = tbb::make_filter<int, int>(m,
                                       [=](int i) -> int {
-                                        doWork(usec);
+                                        doWork(w);
                                         return i;
                                       }); 
   if (num_filters > 1)
-    return f & makeMiddleFilters(num_filters-1, m, usec);
+    return f & makeMiddleFilters(num_filters-1, m, usec, 
+                                 imbalanced_filter, imbalance_factor);
   else
     return f;
 }
 
 tbb::filter<void,void> buildFilterChain(std::atomic<int>& counter, std::vector<int>& data, 
-                                        int num_filters, tbb::filter_mode m, double usec) {
+                                        int num_filters, tbb::filter_mode m, double usec,
+                                        double imbalance_factor = 1) {
   counter = data.size() - 1;
-
   tbb::filter<int, int> middle{};
   tbb::filter<int,void> end{};
+  int imbalanced_filter = (imbalance_factor == 1) ? -1 : num_filters - 3;
 
   if (num_filters > 1) {
     tbb::filter<void,int> start = 
@@ -65,7 +70,7 @@ tbb::filter<void,void> buildFilterChain(std::atomic<int>& counter, std::vector<i
                                     });
 
       if (num_filters > 2) {
-        tbb::filter<int, int> middle = makeMiddleFilters(num_filters-2, m, usec);
+        tbb::filter<int, int> middle = makeMiddleFilters(num_filters-2, m, usec, imbalanced_filter, imbalance_factor);
         return start & middle & end;
       } else {
         return start & end;
@@ -83,11 +88,16 @@ tbb::filter<void,void> buildFilterChain(std::atomic<int>& counter, std::vector<i
   }
 }
 
-double runSerial(std::vector<int>& data, int num_filters, double usec, int num_items) {
+double runSerial(std::vector<int>& data, int num_filters, double usec, int num_items,
+                 int imbalanced_filter = -1, double imbalance_factor = 1) {
   tbb::tick_count t0 = tbb::tick_count::now();
   for (int i = 0; i < num_items; ++i) {
     for (int j = 0; j < num_filters; ++j) {
-      doWork(usec);
+      if (j == imbalanced_filter) {
+        doWork(usec*imbalance_factor);
+      } else {
+        doWork(usec);
+      }
     }
   }
   double total_time = (tbb::tick_count::now() - t0).seconds();
@@ -104,18 +114,18 @@ std::vector<std::string> mode_name = {"serial_in_order",
                                       "serial_out_of_order", 
                                       "parallel"};
 
-std::vector<double> spin_times = {1e-7, 1e-6, 1e-5, 1e-4};
+
 
 void runEightFiltersOneTokenBalanced(int N, std::vector<int>& data) {
+  std::vector<double> spin_times = {1e-7, 1e-6, 1e-5, 1e-4};
   int num_filters = 8;
 
-  std::cout << "Varying the mode and spin_time for 8 filters:\n";
+  std::cout << "\nVarying the mode and spin_time for 8 filters:\n";
 
   // Balanced serial time
   std::vector<double> serial_time;
   for (double spin_time : spin_times) {
     serial_time.push_back(runSerial(data, num_filters, spin_time, N));
-    std::cout << serial_time.back() << ",";
   }
 
   std::cout << "\nmode";
@@ -139,18 +149,109 @@ void runEightFiltersOneTokenBalanced(int N, std::vector<int>& data) {
       std::cout << ", " << serial_time[st]/t;
     }
     std::cout << std::endl; 
+  }
+}
 
+void runEightTokensIncreasingFilters(int N, std::vector<int>& data) {  // prints speedup vs. serial for 1..max_threads filters
+  int max_threads = tbb::this_task_arena::max_concurrency();
+  double spin_time = 0.0001;
 
+  std::cout << "\nVarying number of filters with 100us spin and 8 tokens:\n";
+
+  // Balanced serial time
+  std::vector<double> serial_time;
+  for (int num_filters = 1; num_filters <= max_threads; ++num_filters) {
+    serial_time.push_back(runSerial(data, num_filters, spin_time, N));
   }
-}
 
-void runEightTokensIncreasingFiltersBalanced() {
+  std::cout << "\nmode";
+  for (int num_filters = 1; num_filters <= max_threads; ++num_filters)
+    std::cout << ", " << num_filters;
+  std::cout << std::endl;
+
+  for (int m = 0; m < modes.size(); ++m) {
+    std::atomic<int> counter = 0;
+    std::cout << mode_name[m];
+    for (int num_filters = 1; num_filters <= max_threads; ++num_filters) {
+      auto chain =
+        buildFilterChain(counter, data, num_filters, modes[m], spin_time);
+
+      warmupTBB();  // warm up before starting the clock so the 10 ms spin is not timed
+      tbb::tick_count t0 = tbb::tick_count::now();
+      tbb::parallel_pipeline(max_threads, chain);
+      double t = (tbb::tick_count::now() - t0).seconds();
+      std::cout << ", " << serial_time[num_filters-1]/t;
+    }
+    std::cout << std::endl; 
+  }
 }
 
-void runEightFiltersIncreasingTokensBalanced() {
+void runEightFiltersIncreasingTokens(int N, std::vector<int>& data) {  // prints speedup vs. serial for 1..2*max_threads tokens
+  int max_threads = tbb::this_task_arena::max_concurrency();
+  int num_filters = 8;
+  double spin_time = 0.0001;
+
+  std::cout << "\nVarying number of tokens with 8 filters and 100us spin:\n";
+
+  double serial_time = runSerial(data, num_filters, spin_time, N);
+
+  std::cout << "\nmode";
+  for (int num_tokens = 1; num_tokens <= 2*max_threads; ++num_tokens)
+    std::cout << ", " << num_tokens;
+  std::cout << std::endl;
+
+  for (int m = 0; m < modes.size(); ++m) {
+    std::atomic<int> counter = 0;
+    std::cout << mode_name[m];
+    for (int num_tokens = 1; num_tokens <= 2*max_threads; ++num_tokens) {
+      auto chain =
+        buildFilterChain(counter, data, num_filters, modes[m], spin_time);
+
+      warmupTBB();  // warm up before starting the clock so the 10 ms spin is not timed
+      tbb::tick_count t0 = tbb::tick_count::now();
+      tbb::parallel_pipeline(num_tokens, chain);
+      double t = (tbb::tick_count::now() - t0).seconds();
+      std::cout << ", " << serial_time/t;
+    }
+    std::cout << std::endl; 
+  }
 }
 
-void runEightFiltersImbalanced() {
+void runEightFiltersImbalanced(int N, std::vector<int>& data) {  // prints speedup vs. serial as one stage's cost is scaled
+  std::vector<double> imbalance = {0.1, 0.5, 0.75, 1.5, 2, 10};
+  int num_filters = 8;
+  int num_tokens = 8;
+  double spin_time = 0.0001;
+
+  std::cout << "\nVarying imbalance of 1 of 8 filters:\n";
+
+  // Imbalanced serial time
+  std::vector<double> serial_time;
+  for (double imb : imbalance) {
+    serial_time.push_back(runSerial(data, num_filters, spin_time, N, num_filters - 3, imb));  // stage index, then factor
+  }
+
+  std::cout << "\nmode";
+  for (double imb : imbalance)
+    std::cout << ", " << imb;
+  std::cout << std::endl;
+
+  for (int m = 0; m < modes.size(); ++m) {
+    std::atomic<int> counter = 0;
+    std::cout << mode_name[m];
+    for (int imb = 0; imb < imbalance.size(); ++imb) {
+      double imbalance_factor = imbalance[imb];
+      auto chain =
+        buildFilterChain(counter, data, num_filters, modes[m], spin_time, imbalance_factor);
+
+      warmupTBB();  // warm up before starting the clock so the 10 ms spin is not timed
+      tbb::tick_count t0 = tbb::tick_count::now();
+      tbb::parallel_pipeline(num_tokens, chain);
+      double t = (tbb::tick_count::now() - t0).seconds();
+      std::cout << ", " << serial_time[imb]/t;
+    }
+    std::cout << std::endl; 
+  }
 }
 
 int main() {
@@ -169,7 +270,10 @@ int main() {
   a.execute([&]() {
     int N = 8000;
     std::vector<int> data = makeVector(N);
-    runEightFiltersOneTokenBalanced(N, data);
+    //runEightFiltersOneTokenBalanced(N, data);
+    //runEightTokensIncreasingFilters(N, data);
+    //runEightFiltersIncreasingTokens(N, data);
+    runEightFiltersImbalanced(N, data);
   });
 
   return 0;

From faf88cbdeefa570210710659acb446690128f9b0 Mon Sep 17 00:00:00 2001
From: Mike Voss <michaelj.voss@intel.com>
Date: Wed, 4 Dec 2024 14:42:36 -0600
Subject: [PATCH 34/34] Added README for algorithms

---
 new_examples/algorithms/README.md | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 new_examples/algorithms/README.md

diff --git a/new_examples/algorithms/README.md b/new_examples/algorithms/README.md
new file mode 100644
index 0000000000..7cd9092e85
--- /dev/null
+++ b/new_examples/algorithms/README.md
@@ -0,0 +1,3 @@
+# Chapter 2: Algorithms
+
+This directory contains the examples for Chapter 2.
\ No newline at end of file