diff --git a/new_examples/CMakeLists.txt b/new_examples/CMakeLists.txt
index 33dbebb7c4..cf5875dd35 100644
--- a/new_examples/CMakeLists.txt
+++ b/new_examples/CMakeLists.txt
@@ -2,8 +2,17 @@ cmake_minimum_required (VERSION 3.4)
 
 project(tbb_tutorials LANGUAGES CXX)
 
-set(CMAKE_CXX_COMPILER "icpx")
-set(CMAKE_LINKER "icpx")
+if (WIN32)
+  set(CMAKE_CXX_COMPILER "icx-cl")
+  set(CMAKE_LINKER "icpx")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb")
+  set(CMAKE_CXX_LINKER_FLAGS "-Qtbb")
+else()
+  set(CMAKE_CXX_COMPILER "icpx")
+  set(CMAKE_LINKER "icpx")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
+  set(CMAKE_CXX_LINKER_FLAGS "-tbb -lpthread") 
+endif()
 set(CMAKE_CXX_STANDARD 20)
 
 include (CTest)
diff --git a/new_examples/algorithms/CMakeLists.txt b/new_examples/algorithms/CMakeLists.txt
index 5c5e670e30..2821d69f0b 100644
--- a/new_examples/algorithms/CMakeLists.txt
+++ b/new_examples/algorithms/CMakeLists.txt
@@ -1,6 +1,3 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb")
-
 foreach(tpp parallel_invoke_recursive_quicksort.cpp parallel_invoke_two_quicksorts.cpp 
             parallel_for_trivial.cpp parallel_for_unoptimized_mxm.cpp
             parallel_reduce_max.cpp parallel_reduce_pi.cpp
diff --git a/new_examples/algorithms/README.md b/new_examples/algorithms/README.md
new file mode 100644
index 0000000000..7cd9092e85
--- /dev/null
+++ b/new_examples/algorithms/README.md
@@ -0,0 +1,3 @@
+# Chapter 2: Algorithms
+
+This directory contains the examples for Chapter 2.
\ No newline at end of file
diff --git a/new_examples/cancellation/CMakeLists.txt b/new_examples/cancellation/CMakeLists.txt
index 0f26004913..adebb4e100 100644
--- a/new_examples/cancellation/CMakeLists.txt
+++ b/new_examples/cancellation/CMakeLists.txt
@@ -1,6 +1,3 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb")
-
 foreach(tpp cancel_group_execution1.cpp
             cancel_group_execution2.cpp
             cancel_group_execution3.cpp
diff --git a/new_examples/exception/CMakeLists.txt b/new_examples/exception/CMakeLists.txt
index 27d14a0939..444f2c077a 100644
--- a/new_examples/exception/CMakeLists.txt
+++ b/new_examples/exception/CMakeLists.txt
@@ -1,9 +1,6 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb")
-
-foreach(tpp cpp_exceptions.cpp
-            exception_catch1.cpp
-            exception_catch2.cpp)
+foreach(tpp exception_catch1.cpp
+            exception_catch2.cpp
+            exception_catch3.cpp)
   string(REPLACE ".cpp" "" texe ${tpp})
   add_executable(${texe} ${tpp})
   target_include_directories(${texe} PUBLIC
diff --git a/new_examples/graph/CMakeLists.txt b/new_examples/graph/CMakeLists.txt
index 1c971bd436..5f2114c42e 100644
--- a/new_examples/graph/CMakeLists.txt
+++ b/new_examples/graph/CMakeLists.txt
@@ -1,6 +1,3 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb")
-
 foreach(tpp graph_composite_node.cpp
             graph_execute_while_building.cpp
             graph_fwd_substitution.cpp
@@ -10,6 +7,7 @@ foreach(tpp graph_composite_node.cpp
             graph_node_priorities.cpp
             graph_reestablish_order.cpp
             graph_small_nodes.cpp
+            graph_stereoscopic_3d.cpp
             graph_two_nodes.cpp 
             graph_two_nodes_deduced.cpp
             graph_with_join.cpp)
diff --git a/new_examples/graph/graph_execute_while_building.cpp b/new_examples/graph/graph_execute_while_building.cpp
index 7a977221df..387bb9a9f9 100644
--- a/new_examples/graph/graph_execute_while_building.cpp
+++ b/new_examples/graph/graph_execute_while_building.cpp
@@ -16,6 +16,7 @@
 
 #include <iostream>
 #include <memory>
+#include <vector>
 #include <tbb/tbb.h>
 
 struct config_t { 
@@ -23,16 +24,17 @@ struct config_t {
   int predecessor; 
 };  
 
-config_t configuration[] = { { 0, 3 }, 
-                           { 1, 4 }, 
-                           { 2, 5 }, 
-                           { 3, -1 }, 
-                           { 4, -1 }, 
-                           { 5, 3 }, 
-                           { 6, 4 }, 
-                           { 7, 1 } };
+// each element defines a node and what other node it must wait for
+std::vector<config_t> configuration =   { { 0, 3 }, 
+                                          { 1, 4 }, 
+                                          { 2, 5 }, 
+                                          { 3, -1 }, 
+                                          { 4, -1 }, 
+                                          { 5, 3 }, 
+                                          { 6, 4 }, 
+                                          { 7, 1 } };
 
-int num_nodes = sizeof(configuration) / sizeof(config_t);
+const int num_nodes = configuration.size();
 
 int main() {
   tbb::flow::graph g;
@@ -57,14 +59,18 @@ int main() {
                           return m;
                         }
                        });
-    // connect the new node to its future
+    // connect the new node to its "future"
     tbb::flow::make_edge(*work_nodes[c.id], future_nodes[c.id]); 
 
     // start the node or link it to predecessor's promise
     if (c.predecessor != -1) {
+       // must connect to predecessor's "future"
+       // if the future is already written to this will start the node
+       // otherwise it will be started when the future is written
        std::printf("new %d with %d -> %d\n", c.id, c.predecessor, c.id);
        tbb::flow::make_edge(future_nodes[c.predecessor], *work_nodes[c.id]);
     } else {
+       // does not need to wait and can be started immediately
        std::printf("starting %d from main\n", c.id);
        work_nodes[c.id]->try_put(tbb::flow::continue_msg{});
     }
diff --git a/new_examples/graph/graph_loops.cpp b/new_examples/graph/graph_loops.cpp
index b1dea8cb85..847cc544e6 100644
--- a/new_examples/graph/graph_loops.cpp
+++ b/new_examples/graph/graph_loops.cpp
@@ -27,8 +27,7 @@ void tryPutLoop() {
     }
   };
   for (int count = 0; count < limit; ++count) {
-    int value = count;
-    my_node.try_put(value);
+    my_node.try_put(count);
   }
   g.wait_for_all();
 }
diff --git a/new_examples/graph/graph_small_nodes.cpp b/new_examples/graph/graph_small_nodes.cpp
index 345ff196e6..6e2c9f5051 100644
--- a/new_examples/graph/graph_small_nodes.cpp
+++ b/new_examples/graph/graph_small_nodes.cpp
@@ -57,9 +57,9 @@ void small_nodes_lightweight() {
   tbb::flow::function_node< int, int > add( g, tbb::flow::unlimited, 
                      [](const int &v) { return v+1; } );
   tbb::flow::function_node< int, int, tbb::flow::lightweight >  multiply( g, tbb::flow::unlimited, 
-                          [](const int &v) { return v*2; } );
+                          [](const int &v) noexcept { return v*2; } );
   tbb::flow::function_node< int, int, tbb::flow::lightweight >  cube( g, tbb::flow::unlimited, 
-                      [](const int &v) { return v*v*v; } );
+                      [](const int &v) noexcept { return v*v*v; } );
 
   tbb::flow::make_edge(add, multiply);
   tbb::flow::make_edge(multiply, cube);
@@ -73,7 +73,7 @@ void small_nodes_combined_lightweight() {
   tbb::flow::graph g;
 
   tbb::flow::function_node< int, int, tbb::flow::lightweight > combined_node( g, tbb::flow::unlimited, 
-                     [](const int &v) { 
+                     [](const int &v) noexcept { 
                         auto v2 = (v+1)*2;
                         return v2*v2*v2;
                      });
diff --git a/new_examples/graph/graph_stereoscopic_3d.cpp b/new_examples/graph/graph_stereoscopic_3d.cpp
index 05042a1829..35bb571b93 100644
--- a/new_examples/graph/graph_stereoscopic_3d.cpp
+++ b/new_examples/graph/graph_stereoscopic_3d.cpp
@@ -21,10 +21,10 @@
 #include <string>
 #include <utility>
 #include <vector>
-#include "../algorithms/lodepng.h"
+#include "../common/lodepng.h"
 #include <tbb/tbb.h>
 
-class PNGImage {
+class Image {
 public:
   uint64_t frameNumber = -1;
   unsigned int width = 0, height = 0;
@@ -34,26 +34,26 @@ class PNGImage {
   static const int greenOffset = 1; 
   static const int blueOffset = 2; 
 
-  PNGImage() {} 
-  PNGImage(uint64_t frame_number, const std::string& file_name);
-  PNGImage(const PNGImage& p);
-  virtual ~PNGImage() {}
+  Image() {} 
+  Image(uint64_t frame_number, const std::string& file_name);
+  Image(const Image& p);
+  virtual ~Image() {}
   void write() const;
 };
 
 int getNextFrameNumber();
-PNGImage getLeftImage(uint64_t frameNumber);
-PNGImage getRightImage(uint64_t frameNumber);
-void increasePNGChannel(PNGImage& image, int channel_offset, int increase);
-void mergePNGImages(PNGImage& right, const PNGImage& left);
+Image getLeftImage(uint64_t frameNumber);
+Image getRightImage(uint64_t frameNumber);
+void increasePNGChannel(Image& image, int channel_offset, int increase);
+void mergeImages(Image& right, const Image& left);
 
 void stereo3D() {
-  using Image = PNGImage;
+  using Image = Image;
   // step 1: create graph object
   tbb::flow::graph g;
 
   // step 2: create nodes
-  tbb::flow::input_node<uint64_t> frame_no_node{g,
+  tbb::flow::input_node frame_no_node{g,
     []( tbb::flow_control &fc ) -> uint64_t {
       uint64_t frame_number = getNextFrameNumber();
       if (frame_number)
@@ -99,7 +99,7 @@ void stereo3D() {
     [] (std::tuple<Image, Image> t) -> Image {
       auto& l = std::get<0>(t);
       auto& r = std::get<1>(t);
-      mergePNGImages(r, l);
+      mergeImages(r, l);
       return r;
     }
   };
@@ -128,7 +128,7 @@ void stereo3D() {
   g.wait_for_all();
 }
 
-PNGImage::PNGImage(uint64_t frame_number, const std::string& file_name) :
+Image::Image(uint64_t frame_number, const std::string& file_name) :
   frameNumber{frame_number}, buffer{std::make_shared< std::vector<unsigned char> >()} {
   if (lodepng::decode(*buffer, width, height, file_name)) {
      std::cerr << "Error: could not read PNG file!" << std::endl;
@@ -136,11 +136,11 @@ PNGImage::PNGImage(uint64_t frame_number, const std::string& file_name) :
   }
 };
 
-PNGImage::PNGImage(const PNGImage& p) : frameNumber{p.frameNumber}, 
+Image::Image(const Image& p) : frameNumber{p.frameNumber}, 
                                         width{p.width}, height{p.height},
                                         buffer{p.buffer} {}
 
-void PNGImage::write() const {
+void Image::write() const {
   std::string file_name = std::string("out") + std::to_string(frameNumber) + ".png";
   if (lodepng::encode(file_name, *buffer, width, height)) {
     std::cerr << "Error: could not write PNG file!" << std::endl;
@@ -163,30 +163,30 @@ int getNextFrameNumber() {
   }
 }
 
-PNGImage getLeftImage(uint64_t frameNumber) {
-  return PNGImage(frameNumber, "input1.png");
+Image getLeftImage(uint64_t frameNumber) {
+  return Image(frameNumber, "input1.png");
 }
 
-PNGImage getRightImage(uint64_t frameNumber) {
-  return PNGImage(frameNumber, "input2.png");
+Image getRightImage(uint64_t frameNumber) {
+  return Image(frameNumber, "input2.png");
 }
 
-void increasePNGChannel(PNGImage& image, int channel_offset, int increase) {
-  const int height_base = PNGImage::numChannels * image.width;
+void increasePNGChannel(Image& image, int channel_offset, int increase) {
+  const int height_base = Image::numChannels * image.width;
   std::vector<unsigned char>& buffer = *image.buffer;
 
   // Increase selected color channel by a predefined value
   for (unsigned int y = 0; y < image.height; y++) {
     const int height_offset = height_base * y;
     for (unsigned int x = 0; x < image.width; x++) {
-      int pixel_offset = height_offset + PNGImage::numChannels * x + channel_offset;
+      int pixel_offset = height_offset + Image::numChannels * x + channel_offset;
       buffer[pixel_offset] = static_cast<uint8_t>(std::min(buffer[pixel_offset] + increase, 255));
     }
   }
 }
 
-void mergePNGImages(PNGImage& right, const PNGImage& left) {
-  const int channels_per_pixel = PNGImage::numChannels;
+void mergeImages(Image& right, const Image& left) {
+  const int channels_per_pixel = Image::numChannels;
   const int height_base = channels_per_pixel * right.width;
   std::vector<unsigned char>& left_buffer = *left.buffer;
   std::vector<unsigned char>& right_buffer = *right.buffer;
@@ -195,7 +195,7 @@ void mergePNGImages(PNGImage& right, const PNGImage& left) {
     const int height_offset = height_base * y;
     for (unsigned int x = 0; x < right.width; x++) {
       const int pixel_offset = height_offset + channels_per_pixel * x;
-      const int red_index = pixel_offset + PNGImage::redOffset;
+      const int red_index = pixel_offset + Image::redOffset;
       right_buffer[red_index] = left_buffer[red_index];
     }
   }
diff --git a/new_examples/graph/graph_two_nodes.cpp b/new_examples/graph/graph_two_nodes.cpp
index 1a3d647744..481667e714 100644
--- a/new_examples/graph/graph_two_nodes.cpp
+++ b/new_examples/graph/graph_two_nodes.cpp
@@ -40,7 +40,7 @@ void graphTwoNodes() {
   // step 3: add edges
   tbb::flow::make_edge(my_first_node, my_second_node);
 
-  // step 4: send messages
+  // step 4: send message that eagerly starts graph execution
   my_first_node.try_put(10);
 
   // step 5: wait for graph to complete
diff --git a/new_examples/graph/graph_with_join.cpp b/new_examples/graph/graph_with_join.cpp
index 88ad9639cb..2efc46212a 100644
--- a/new_examples/graph/graph_with_join.cpp
+++ b/new_examples/graph/graph_with_join.cpp
@@ -55,7 +55,7 @@ void graphJoin() {
   make_edge(my_other_node, tbb::flow::input_port<1>(my_join_node));
   make_edge(my_join_node, my_final_node);
 
-  // step 4: send messages
+  // step 4: send messages that eagerly start graph execution
   my_node.try_put(1);
   my_other_node.try_put(2);
   // step 5: wait for the graph to complete
diff --git a/new_examples/intro/CMakeLists.txt b/new_examples/intro/CMakeLists.txt
index 1efda6b038..9e44001630 100644
--- a/new_examples/intro/CMakeLists.txt
+++ b/new_examples/intro/CMakeLists.txt
@@ -1,6 +1,3 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb")
-
 foreach(tpp intro_pi.cpp
             intro_pi_timing.cpp
             intro_flowgraph.cpp
diff --git a/new_examples/migration/CMakeLists.txt b/new_examples/migration/CMakeLists.txt
index d52df94a67..f489f910cb 100644
--- a/new_examples/migration/CMakeLists.txt
+++ b/new_examples/migration/CMakeLists.txt
@@ -1,7 +1,5 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb -pthread")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb -pthread")
-
-foreach(tpp migrate_task_scheduler_init.cpp
+foreach(tpp migrate_atomics.cpp
+            migrate_task_scheduler_init.cpp
             migrate_parallel_do.cpp
             migrate_priorities.cpp
             migrate_task_blocking.cpp
diff --git a/new_examples/migration/migrate_atomics.cpp b/new_examples/migration/migrate_atomics.cpp
new file mode 100644
index 0000000000..95c0c551b5
--- /dev/null
+++ b/new_examples/migration/migrate_atomics.cpp
@@ -0,0 +1,80 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <vector>
+#include <iostream>
+#include <algorithm>
+#include <random>
+
+#include <tbb/tick_count.h>
+#include <tbb/parallel_for.h>
+
+#if TBB_VERSION_MAJOR > 2020
+#include <atomic>
+#else
+#include <tbb/atomic.h>
+#endif
+
+int main(int argc, char** argv) {
+  long int n = 1000000000;
+  constexpr int num_bins = 256;
+
+  // Initialize random number generator
+  std::random_device seed;    // Random device seed
+  std::mt19937 mte{seed()};   // mersenne_twister_engine
+  std::uniform_int_distribution<> uniform{0,num_bins};
+  // Initialize image  
+  std::vector<uint8_t> image; // empty vector
+  image.reserve(n);           // image vector prealocated
+  std::generate_n(std::back_inserter(image), n,
+                    [&] { return uniform(mte); }
+                 );
+  // Initialize histogram
+  std::vector<int> hist(num_bins);
+
+  // Serial execution
+  tbb::tick_count t0 = tbb::tick_count::now();
+  std::for_each(image.begin(), image.end(),
+                [&](uint8_t i){hist[i]++;});
+  tbb::tick_count t1 = tbb::tick_count::now();
+  double t_serial = (t1 - t0).seconds();
+
+  // Parallel execution
+  #if TBB_VERSION_MAJOR > 2020
+  std::vector<std::atomic<int>> hist_p(num_bins);
+  #else
+  #warning Using tbb::atomic instead of std::atomic
+  std::vector<tbb::atomic<int>> hist_p(num_bins);
+  #endif
+
+  t0 = tbb::tick_count::now();
+  parallel_for(tbb::blocked_range<size_t>{0, image.size()},
+              [&](const tbb::blocked_range<size_t>& r)
+              {
+                for (size_t i = r.begin(); i < r.end(); ++i)
+                  hist_p[image[i]]++;
+              });
+  t1 = tbb::tick_count::now();
+  double t_parallel = (t1 - t0).seconds();
+
+  std::cout << "Serial: "   << t_serial   << ", ";
+  std::cout << "Parallel: " << t_parallel << ", ";
+  std::cout << "Speed-up: " << t_serial/t_parallel << std::endl;
+
+  if (!std::equal(hist.begin(),hist.end(),hist_p.begin()))
+      std::cerr << "Parallel computation failed!!" << std::endl;
+  return 0;
+}
diff --git a/new_examples/migration/migrate_bypass_tasks.cpp b/new_examples/migration/migrate_bypass_tasks.cpp
index 64ba8f6b81..fc35d6782b 100644
--- a/new_examples/migration/migrate_bypass_tasks.cpp
+++ b/new_examples/migration/migrate_bypass_tasks.cpp
@@ -54,6 +54,7 @@ void serialFwdSubTiled(std::vector<double>& x,
   }
 }
 
+#if TBB_VERSION_MAJOR > 2020
 tbb::task_handle fwdSubTGBody(tbb::task_group& tg,
                               int N, int num_blocks, 
                               const std::pair<size_t, size_t> bi, 
@@ -86,6 +87,30 @@ tbb::task_handle fwdSubTGBody(tbb::task_group& tg,
 
   return deferred_task;
 }
+#else
+void fwdSubTGBody(tbb::task_group& tg,
+                int N, int num_blocks, 
+                const std::pair<size_t, size_t> bi, 
+                std::vector<double>& x, 
+                const std::vector<double>& a, 
+                std::vector<double>& b,
+                std::vector<std::atomic<char>>& ref_count) {
+  auto [r, c] = bi;
+  computeBlock(N, r, c, x, a, b);
+  // add successor to right if ready
+  if (c + 1 <= r && --ref_count[r*num_blocks + c + 1] == 0) {
+    tg.run([&, N, num_blocks, r, c]() { 
+      fwdSubTGBody(tg, N, num_blocks, BlockIndex(r, c+1), x, a, b, ref_count);
+    });
+  }
+  // add succesor below if ready
+  if (r + 1 < (size_t)num_blocks && --ref_count[(r+1)*num_blocks + c] == 0) {
+    tg.run([&, N, num_blocks, r, c]() { 
+      fwdSubTGBody(tg, N, num_blocks, BlockIndex(r+1, c), x, a, b, ref_count);
+    });
+  }
+}
+#endif
 
 void parallelFwdSubTaskGroup(std::vector<double>& x, 
                              const std::vector<double>& a, 
@@ -114,6 +139,7 @@ void parallelFwdSubTaskGroup(std::vector<double>& x,
 }
 
 #if TBB_VERSION_MAJOR <= 2020
+#warning Using tbb::task directly
 using RootTask = tbb::empty_task;
 
 class FwdSubTask : public tbb::task {
diff --git a/new_examples/migration/migrate_parallel_do.cpp b/new_examples/migration/migrate_parallel_do.cpp
index 8672f9108d..335d94890d 100644
--- a/new_examples/migration/migrate_parallel_do.cpp
+++ b/new_examples/migration/migrate_parallel_do.cpp
@@ -87,6 +87,7 @@ void parallelFwdSub(std::vector<double>& x,
     }
   );
 #else
+#warning Using tbb::parallel_do instead of tbb::parallel_for_each
   tbb::parallel_do( &top_left, &top_left+1, 
     [&](const BlockIndex& bi, tbb::parallel_do_feeder<BlockIndex>& f) {
       auto [r, c] = bi;
diff --git a/new_examples/migration/migrate_priorities.cpp b/new_examples/migration/migrate_priorities.cpp
index 3b6cc43e51..e65bb4f5cf 100644
--- a/new_examples/migration/migrate_priorities.cpp
+++ b/new_examples/migration/migrate_priorities.cpp
@@ -71,6 +71,7 @@ void runParallelForWithHighPriority() {
   std::printf("\n");
 }
 #else
+#warning Using tbb::task directly
 auto P = tbb::task_scheduler_init::default_num_threads();
 
 class MyTask : public tbb::task {
diff --git a/new_examples/migration/migrate_recycling_tasks.cpp b/new_examples/migration/migrate_recycling_tasks.cpp
index eae856a95b..f066c130a6 100644
--- a/new_examples/migration/migrate_recycling_tasks.cpp
+++ b/new_examples/migration/migrate_recycling_tasks.cpp
@@ -62,8 +62,8 @@ class FwdSubFunctor {
                 const std::vector<double>& a, 
                 std::vector<double>& b,
                 std::vector<std::atomic<char>>& ref_count) : 
-                my_tg(tg), my_N(N), my_num_blocks(num_blocks), 
-                my_index(new BlockIndex{bi}), 
+                my_tg(tg), my_index(new BlockIndex{bi}),
+                my_N(N), my_num_blocks(num_blocks),  
                 my_x(x), my_a(a), my_b(b), my_ref_count(ref_count) {}
 
   void operator()() const {
@@ -123,6 +123,7 @@ void parallelFwdSub(std::vector<double>& x,
   tg.wait();
 }
 #else
+#warning Using tbb::task directly with recycling 
 using RootTask = tbb::empty_task;
 
 class FwdSubTask : public tbb::task {
diff --git a/new_examples/migration/migrate_task_blocking.cpp b/new_examples/migration/migrate_task_blocking.cpp
index b09edbaca3..0852e38108 100644
--- a/new_examples/migration/migrate_task_blocking.cpp
+++ b/new_examples/migration/migrate_task_blocking.cpp
@@ -28,6 +28,7 @@ void taskBlocking() {
   g.wait();
 }
 #else
+#warning Using tbb::task directly
 const int P = tbb::task_scheduler_init::default_num_threads();
 
 class MyTask : public tbb::task {
diff --git a/new_examples/migration/migrate_task_scheduler_init.cpp b/new_examples/migration/migrate_task_scheduler_init.cpp
index 7021a38a31..94db31548f 100644
--- a/new_examples/migration/migrate_task_scheduler_init.cpp
+++ b/new_examples/migration/migrate_task_scheduler_init.cpp
@@ -19,7 +19,7 @@
 void doWork(double seconds);
 
 #if TBB_VERSION_MAJOR > 2020
-const int N = 2*tbb::info::default_concurrency();
+const int N = tbb::info::default_concurrency();
 
 void setThreadsAndSlots() {
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, N);
@@ -32,6 +32,7 @@ void setThreadsAndSlots() {
   });
 }
 #else
+#warning Using tbb::task_scheduler_init instead of tbb::global_control
 const int N = tbb::task_scheduler_init::default_num_threads();
 
 void setThreadsAndSlots() {
@@ -77,18 +78,17 @@ void clearParticipation() {
 }
 
 void dumpParticipation(int p) {
-  int end = next_tid;
   int sum = tid_participation[0];
   std::cout << "[" << tid_participation[0];
   for (int i = 1; i < p; ++i) {
     sum += tid_participation[i];
     std::cout << ", " << tid_participation[i];
   }
-  for (int i = p; i < 2*N; ++i) 
+  for (int i = p; i < N; ++i) 
     std::cout << ", -";
   std::cout << "]\n" 
             << "sum == " << sum  << "\n"
-            << "expected sum " << 2*10*N << "\n";
+            << "expected sum " << 10*N << "\n";
   clearParticipation();
 }
 
diff --git a/new_examples/migration/migrate_tasks_adding_work.cpp b/new_examples/migration/migrate_tasks_adding_work.cpp
index 01686557e6..442b7eaed7 100644
--- a/new_examples/migration/migrate_tasks_adding_work.cpp
+++ b/new_examples/migration/migrate_tasks_adding_work.cpp
@@ -102,6 +102,7 @@ void parallelFwdSubTaskGroup(std::vector<double>& x,
 }
 
 #if TBB_VERSION_MAJOR <= 2020
+#warning Using tbb::task directly
 using RootTask = tbb::empty_task;
 
 class FwdSubTask : public tbb::task {
diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt
index 6879ea2e47..a48b218af5 100644
--- a/new_examples/performance_tuning/CMakeLists.txt
+++ b/new_examples/performance_tuning/CMakeLists.txt
@@ -1,19 +1,27 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb -pthread")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb -pthread")
+if (WIN32)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb")
+  set(CMAKE_CXX_LINKER_FLAGS "-Qtbb")
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb")
+  set(CMAKE_CXX_LINKER_FLAGS "-tbb -lpthread") 
+endif()
 
-foreach(tpp global_control_and_implicit_arena.cpp 
-            global_control_and_explicit_arena.cpp 
-            global_control_and_implicit_conflict.cpp 
-            global_control_and_explicit_conflict.cpp 
-            priorities_and_conflict.cpp 
-            building_work.cpp
-            matrix_transpose.cpp
-            oblivious_transpose.cpp
-            oblivious_transpose_simple_partitioner.cpp
-            parallel_for_addition.cpp
-            timing_partitioners.cpp
-            blocked_ranges_trivial.cpp)
-	string(REPLACE ".cpp" "" texe ${tpp})
+foreach(tpp blocked_ranges_trivial.cpp
+            constraints.cpp
+            global_control_and_explicit_arena.cpp
+            global_control_and_explicit_conflict.cpp
+            global_control_and_implicit_arena.cpp
+            global_control_and_implicit_conflict.cpp
+            parallel_for_addition_partitioners.cpp
+            parallel_deterministic_reduce_partitioners.cpp
+            parallel_for_partitioners.cpp
+            parallel_for_partitioners_timed.cpp
+            parallel_for_transpose_partitioners.cpp
+            parallel_pipeline_timed.cpp
+            partitioners_imbalanced_loops.cpp
+            priorities_and_conflict.cpp
+            task_scheduler_observer.cpp)
+  string(REPLACE ".cpp" "" texe ${tpp})
   add_executable(${texe} ${tpp})
   target_include_directories(${texe} PUBLIC
     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../common>
diff --git a/new_examples/performance_tuning/arena_trace.h b/new_examples/performance_tuning/arena_trace.h
new file mode 100644
index 0000000000..2a680ed8e5
--- /dev/null
+++ b/new_examples/performance_tuning/arena_trace.h
@@ -0,0 +1,167 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#ifndef ARENA_TRACE_H
+#define ARENA_TRACE_H 1
+
+#include <tbb/combinable.h>
+#include <tbb/enumerable_thread_specific.h>
+#include <tbb/task_scheduler_observer.h>
+#include <algorithm>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <unordered_set>
+
+class arena_tracer {
+public:
+  arena_tracer(const std::string& name) : m_t0(std::chrono::steady_clock::now()), m_name(name)  { }
+  ~arena_tracer() { dump_trace(); }
+ 
+  void add_arena(const std::string& n, tbb::task_arena& a) {
+    m_observers.local().emplace_back( std::make_shared<tracing_observer>(m_events, m_arena_id++, m_t0, n, a )); 
+  }
+
+private:
+  using time_point_type = std::chrono::time_point<std::chrono::steady_clock>;
+  time_point_type m_t0;
+  const std::string m_name;
+  std::atomic<int> m_arena_id = 0;
+
+  struct trace_event {
+    time_point_type t;
+    char ph;
+    bool is_worker;
+    std::thread::id tid;
+    int slot;
+    int pid;
+    trace_event( time_point_type t_, char ph_, 
+                 bool is_worker_, std::thread::id tid_, 
+                 int slot_, int pid_ ) : t(t_), ph(ph_), is_worker(is_worker_), tid(tid_), slot(slot_), pid(pid_) {}
+  };
+
+  using ets_vector_type = std::vector<trace_event>;
+  using ets_value_type = ets_vector_type;
+  using ets_type = tbb::combinable<ets_value_type>;
+  ets_type m_events;
+ 
+  // observer class
+  class tracing_observer : public oneapi::tbb::task_scheduler_observer {
+    ets_type& m_events;
+    int m_arena_id;
+    time_point_type m_t;
+    const std::string m_name;
+
+  public:
+    tracing_observer( ets_type& e, int arena_id, time_point_type t0,
+                      const std::string& fn, oneapi::tbb::task_arena &a ) 
+        : oneapi::tbb::task_scheduler_observer(a), m_events(e), m_arena_id(arena_id), m_t(t0), m_name(fn) {
+        observe(true); // activate the observer
+    }
+
+    ~tracing_observer() { }
+
+    int get_arena_id() { return m_arena_id; }
+    const std::string& get_arena_name() { return m_name; }
+
+    void on_scheduler_entry( bool worker ) override {
+      auto& l = m_events.local();
+      std::thread::id tid = std::this_thread::get_id();
+
+      l.emplace_back(
+        std::chrono::steady_clock::now(), 
+        'B', // ph
+        worker, // is_worker
+        tid,
+        oneapi::tbb::this_task_arena::current_thread_index(),
+        m_arena_id 
+      );
+    }
+    void on_scheduler_exit( bool worker ) override {
+      auto& l = m_events.local();
+      std::thread::id tid = std::this_thread::get_id();
+
+      l.emplace_back(
+        std::chrono::steady_clock::now(), 
+        'E', // ph
+        worker, // is_worker
+        tid,
+        oneapi::tbb::this_task_arena::current_thread_index(),
+        m_arena_id 
+      );
+    }
+  };
+
+  void dump_trace() {
+    std::unordered_set<std::thread::id> was_seen;
+
+    std::ofstream out(m_name);
+    out << "[";
+
+    bool first = true;
+    for (auto& v : m_observers.range()) {
+      for (auto& obs : v) {
+        if (!first) out << ","; 
+        first = false; 
+        out << "\n{\"name\": \"process_name\", \"ph\" : \"M\"" 
+            << ", \"pid\" : " << obs->get_arena_id()  
+            << ", \"args\" : { \"name\" : \"" << obs->get_arena_name() << "\" }"  
+            << "}";
+      }
+    } 
+
+    ets_vector_type r;
+    m_events.combine_each(
+      [&r](const ets_vector_type& v) {
+        r.insert(r.end(), v.begin(), v.end());
+      });
+    
+    std::sort(r.begin(), r.end(), [](const trace_event& a, const trace_event& b) { return a.t < b.t; });
+
+    for (auto& e : r) {
+      out << ",\n{"
+          <<   "\"name\": \"" << e.tid << "\""
+          << ", \"cat\": \"arena\""
+          << ", \"ph\": \"" << e.ph << "\""
+          << ", \"ts\": " << std::chrono::duration_cast<std::chrono::microseconds>(e.t-m_t0).count()
+          << ", \"pid\": " << e.pid
+          << ", \"tid\": " << e.slot
+          << "}";
+      if (e.ph == 'B') {
+        out << ",\n{"
+            << "\"name\": \"flow\""
+            << ", \"id\": " << e.tid 
+            << ", \"cat\": \"arena\""
+            << ", \"ts\": " << std::chrono::duration_cast<std::chrono::microseconds>(e.t-m_t0).count()
+            << ", \"pid\": " << e.pid
+            << ", \"tid\": " << e.slot;
+        if (was_seen.find(e.tid) != was_seen.end()) {
+          out << ", \"ph\": \"t\", \"bp\": \"e\" }";
+        } else {
+          was_seen.insert(e.tid);
+          out << ", \"ph\": \"s\", \"bp\": \"e\" }";
+        }
+      }
+    }
+
+    out << "\n]\n";
+  }
+
+  tbb::enumerable_thread_specific<std::vector<std::shared_ptr<tracing_observer>>> m_observers;
+};
+
+#endif
diff --git a/new_examples/performance_tuning/blocked_ranges_trivial.cpp b/new_examples/performance_tuning/blocked_ranges_trivial.cpp
index 8515b1ce57..145f795575 100644
--- a/new_examples/performance_tuning/blocked_ranges_trivial.cpp
+++ b/new_examples/performance_tuning/blocked_ranges_trivial.cpp
@@ -16,9 +16,7 @@
 
 #include <cstdio>
 #include <vector>
-
-#define TBB_PREVIEW_BLOCKED_RANGE_ND 1
-#include <tbb/tbb.h>
+#include "tbb/tbb.h"
 
 double f(double v);
 
@@ -64,26 +62,10 @@ void example3d(int Z, int P, int N, int M, double* a) {
   );
 }
 
-void exampleNd(int Z, int P, int N, int M, double* a) {
-  tbb::parallel_for(tbb::blocked_rangeNd<int,4>{{0,Z},{0,P},{0,N}, {0,M}}, 
-    [=](const tbb::blocked_rangeNd<int,4>& r_ijkl) {
-      const auto& r_i = r_ijkl.dim(0);
-      const auto& r_j = r_ijkl.dim(1);
-      const auto& r_k = r_ijkl.dim(2);
-      const auto& r_l = r_ijkl.dim(3);
-      for (int i = r_i.begin(); i < r_i.end(); ++i)
-        for (int j = r_j.begin(); j < r_j.end(); ++j)
-          for (int k = r_k.begin(); k < r_k.end(); ++k) 
-            for (int l = r_l.begin(); l < r_l.end(); ++l) 
-              a[i*P*N*M+j*N*M+k*M+l] = f(a[i*P*N*M+j*N*M+k*M+l]);
-    }
-  );
-}
-
 #include <iostream>
 
 static void warmupTBB();
-static bool resultsAreValid(int, const double*, const double*, const double*, const double*, const double*);
+static bool resultsAreValid(int, const double*, const double*, const double*, const double*);
 static void serialDouble(int, double*);
 
 int main() {
@@ -98,9 +80,9 @@ int main() {
   double* a1 = new double[Size];
   double* a2 = new double[Size];
   double* a3 = new double[Size];
-  double* aN = new double[Size];
+
   for (int i = 0; i < Size; ++i)
-    a0[i] = a1[i] = a2[i] = a3[i] = aN[i] = 1.0;
+    a0[i] = a1[i] = a2[i] = a3[i] = 1.0;
 
   // Perform serial double
   tbb::tick_count t0 = tbb::tick_count::now();
@@ -127,21 +109,14 @@ int main() {
   double tbb3d_time = (tbb::tick_count::now() - t0).seconds();
   std::printf("3d done\n");
   
-  t0 = tbb::tick_count::now();
-  exampleNd(Z, P, N, M, aN);
-  double tbbNd_time = (tbb::tick_count::now() - t0).seconds();
-  std::printf("Nd done\n");
-
-  if (resultsAreValid(Size, a0, a1, a2, a3, aN)) {
+  if (resultsAreValid(Size, a0, a1, a2, a3)) {
     std::cout << "serial_time == " << serial_time << " seconds\n"
               << "tbb1d_time == " << tbb1d_time << " seconds\n"
               << "speedup == " << serial_time/tbb1d_time << "\n"
               << "tbb2d_time == " << tbb2d_time << " seconds\n"
               << "speedup == " << serial_time/tbb2d_time << "\n"
               << "tbb3d_time == " << tbb3d_time << " seconds\n"
-              << "speedup == " << serial_time/tbb3d_time << "\n"
-              << "tbbNd_time == " << tbbNd_time << " seconds\n"
-              << "speedup == " << serial_time/tbbNd_time << "\n";
+              << "speedup == " << serial_time/tbb3d_time << "\n";
     return 0;
   } else {
     std::cout << "ERROR: invalid results!\n";
@@ -181,14 +156,12 @@ double f(double v) {
 
 static bool resultsAreValid(int N, 
                             const double* a0, const double* a1, 
-                            const double* a2, const double* a3,
-                            const double* aN) {
+                            const double* a2, const double* a3) {
   for (int i = 0; i < N; ++i) {
     if (a0[i] != 2.0 
         || a1[i] != 2.0
         || a2[i] != 2.0
-        || a3[i] != 2.0
-        || aN[i] != 2.0) {
+        || a3[i] != 2.0) {
       std::printf("%d: %f, %f, %f, %f\n", i, a0[i], a1[i], a2[i], a3[i]); 
       std::cerr << "Invalid results" << std::endl;
       return false;
diff --git a/new_examples/performance_tuning/constraints.cpp b/new_examples/performance_tuning/constraints.cpp
new file mode 100644
index 0000000000..6df33f9366
--- /dev/null
+++ b/new_examples/performance_tuning/constraints.cpp
@@ -0,0 +1,108 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <vector>
+#include <tbb/tbb.h>
+
+
+int N = 1000;
+double w = 0.01;
+double f(double v);
+
+void constrain_for_numa_nodes() {
+  std::vector<tbb::numa_node_id> numa_nodes = tbb::info::numa_nodes();
+  std::vector<tbb::task_arena> arenas(numa_nodes.size());
+  std::vector<tbb::task_group> task_groups(numa_nodes.size());
+
+  // initialize each arena, each constrained to a different NUMA node
+  for (int i = 0; i < numa_nodes.size(); i++)
+    arenas[i].initialize(tbb::task_arena::constraints(numa_nodes[i]), 0);
+
+  // enqueue work to all but the first arena, using the task_group to track work
+  // by using defer, the task_group reference count is incremented immediately
+  for (int i = 1; i < numa_nodes.size(); i++)
+    arenas[i].enqueue(
+      task_groups[i].defer([] { 
+        tbb::parallel_for(0, N, [](int j) { f(w); }); 
+      })
+    );
+
+  // directly execute the work to completion in the remaining arena
+  arenas[0].execute([] {
+    tbb::parallel_for(0, N, [](int j) { f(w); });
+  });
+
+  // join the other arenas to wait on their task_groups
+  for (int i = 1; i < numa_nodes.size(); i++)
+    arenas[i].execute([&task_groups, i] { task_groups[i].wait(); });
+}
+
+void constrain_for_core_type() {
+    std::vector<tbb::core_type_id> core_types = tbb::info::core_types();
+    tbb::task_arena arena(
+      tbb::task_arena::constraints{}.set_core_type(core_types.back())
+    );
+
+    arena.execute([] {
+        tbb::parallel_for(0, N, [](int) { f(w); });
+    });
+}
+
+void constrain_for_no_hyperthreading() {
+    tbb::task_arena::constraints c;
+    std::vector<tbb::core_type_id> core_types = tbb::info::core_types();
+    c.set_core_type(core_types.back());
+    c.set_max_threads_per_core(1);
+    tbb::task_arena no_ht_arena(c);
+
+    no_ht_arena.execute( [] {
+        tbb::parallel_for(0, N, [](int) { f(w); });
+    });
+}
+
+void limit_concurrency_for_no_hyperthreading() {
+    tbb::task_arena::constraints c;
+    std::vector<tbb::core_type_id> core_types = tbb::info::core_types();
+    c.set_core_type(core_types.back());
+    c.set_max_threads_per_core(1);
+    int no_ht_concurrency = tbb::info::default_concurrency(c);
+    tbb::task_arena arena( no_ht_concurrency );
+
+    arena.execute( [] {
+        tbb::parallel_for(0, N, [](int) { f(w); });
+    });
+}
+
+#include <iostream>
+
+int main() {
+  std::cout << "Running numa node constraint example\n";
+  constrain_for_numa_nodes();
+  std::cout << "Running core type constraint example\n";
+  constrain_for_core_type();
+  std::cout << "Running one thread per core constraint example\n";
+  constrain_for_no_hyperthreading();
+    std::cout << "Running limited concurrency example\n";
+  limit_concurrency_for_no_hyperthreading();
+  std::cout << "done\n";
+  return 0;
+}
+
+double f(double v) {
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() < 0.01);
+  return 2*v;
+}
diff --git a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp
index 3bf5e90c14..65443ab64e 100644
--- a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp
+++ b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp
@@ -14,7 +14,6 @@
     limitations under the License.
 */
 
-#include <iostream>
 #include <tbb/tbb.h>
 
 const int default_P = tbb::info::default_concurrency();
@@ -33,10 +32,7 @@ void arenaGlobalControlExplicitArena(int p) {
 }
 
 #include <atomic>
-#include <cstdio>
-#include <vector>
-#include <map>
-#include <set>
+#include <iostream>
 #include <vector>
 
 std::atomic<int> next_tid;
@@ -65,7 +61,6 @@ void clearParticipation() {
 }
 
 void dumpParticipation(int p) {
-  int end = next_tid;
   int sum = tid_participation[0];
   std::cout << "[" << tid_participation[0];
   for (int i = 1; i < p; ++i) {
@@ -76,7 +71,7 @@ void dumpParticipation(int p) {
     std::cout << ", -";
   std::cout << "]\n" 
             << "sum == " << sum  << "\n"
-            << "expected sum " << 10*default_P << "\n";
+            << "expected sum " << 10*default_P << "\n\n";
   clearParticipation();
 }
 
diff --git a/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp
index a46c9d1100..35f5b5022b 100644
--- a/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp
+++ b/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp
@@ -14,44 +14,55 @@
     limitations under the License.
 */
 
-#include <iostream>
 #include <thread>
 #include <tbb/tbb.h>
 
 const int default_P = tbb::info::default_concurrency();
-void doWork(int inc, double seconds);
-void waitUntil(int N);
+using counter_t = std::atomic<int>;
+void waitUntil(int N, counter_t& c);
+void noteParticipation(int offset);
+void dumpParticipation();
 
-void arenaGlobalControlExplicitArena(int p, int inc) {
+void doWork(int offset, double seconds) {
+  noteParticipation(offset);
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() < seconds);
+}
+
+counter_t counter1 = 0, counter2 = 0;
+
+void arenaGlobalControlExplicitArena(int p, int offset) {
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p);
 
+  // we use waitUntil to force overlap of the gc lifetimes
+  waitUntil(2, counter1);
   tbb::task_arena a{2*tbb::info::default_concurrency()};
 
   a.execute([=]() {
     tbb::parallel_for(0, 
                       10*tbb::info::default_concurrency(), 
-                      [=](int) { doWork(inc, 0.01); });
+                      [=](int) { doWork(offset, 0.01); });
   });
+
+  // we prevent either gc from being destroyed until both are done
+  waitUntil(2, counter2);
 }
 
 void runTwoThreads(int p0, int p1) {
-  std::thread t0([=]() {
-    waitUntil(2);
-    arenaGlobalControlExplicitArena(p0, 1);
-  });
-  std::thread t1([=]() {
-    waitUntil(2);
-    arenaGlobalControlExplicitArena(p1, 10000);
-  });
+  std::thread t0([=]() { arenaGlobalControlExplicitArena(p0, 1); });
+  std::thread t1([=]() { arenaGlobalControlExplicitArena(p1, 10000); });
   t0.join();
   t1.join();
 }
 
+int main() {
+  runTwoThreads(default_P/2, 2*default_P);
+  dumpParticipation();
+  return 0;
+}
+
 #include <atomic>
-#include <cstdio>
-#include <vector>
-#include <map>
-#include <set>
+#include <iostream>
 #include <vector>
 
 std::atomic<int> next_tid;
@@ -66,12 +77,6 @@ void noteParticipation(int inc) {
   tid_participation[t] += inc;
 }
 
-void doWork(int inc, double seconds) {
-  noteParticipation(inc);
-  tbb::tick_count t0 = tbb::tick_count::now();
-  while ((tbb::tick_count::now() - t0).seconds() < seconds);
-}
-
 void clearParticipation() {
   next_tid = 0;
   my_tid.clear();
@@ -79,11 +84,10 @@ void clearParticipation() {
     p = 0;
 }
 
-void dumpParticipation(int p) {
-  int end = next_tid;
+void dumpParticipation() {
   int sum = tid_participation[0];
   std::cout << "[" << tid_participation[0];
-  for (int i = 1; i < 2*default_P; ++i) {
+  for (int i = 1; i < tid_participation.size(); ++i) {
     sum += tid_participation[i];
     std::cout << ", " << tid_participation[i];
   }
@@ -91,18 +95,12 @@ void dumpParticipation(int p) {
             << "sum == " << sum << "\n"
             << "expected sum == " << 10*default_P + 10*default_P*10000 << "\n";
   clearParticipation();
+  counter1 = 0; counter2 = 0;
 }
 
-std::atomic<int> count_up = 0;
-
-void waitUntil(int N) {
-  ++count_up;
-  while (count_up != N);
+void waitUntil(int N, counter_t& c) {
+  ++c;
+  while (c != N);
 }
 
-int main() {
-  runTwoThreads(default_P/2, 2*default_P);
-  dumpParticipation(2*default_P);
-  return 0;
-}
 
diff --git a/new_examples/performance_tuning/global_control_and_implicit_arena.cpp b/new_examples/performance_tuning/global_control_and_implicit_arena.cpp
index a1074dc147..e9a3dfc97a 100644
--- a/new_examples/performance_tuning/global_control_and_implicit_arena.cpp
+++ b/new_examples/performance_tuning/global_control_and_implicit_arena.cpp
@@ -14,11 +14,18 @@
     limitations under the License.
 */
 
-#include <iostream>
 #include <tbb/tbb.h>
 
 const int default_P = tbb::info::default_concurrency();
-void doWork(double seconds);
+
+void noteParticipation(); /* record info for participation vector */
+void dumpParticipation(int p); /* display participation vector */
+
+void doWork(double seconds) {
+  noteParticipation();
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() < seconds);
+}
 
 void arenaGlobalControlImplicitArena(int p) {
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p);
@@ -27,11 +34,19 @@ void arenaGlobalControlImplicitArena(int p) {
                     [](int) { doWork(0.01); });
 }
 
+
+int main() {
+  arenaGlobalControlImplicitArena(default_P);
+  dumpParticipation(default_P);
+  arenaGlobalControlImplicitArena(default_P/2);
+  dumpParticipation(default_P/2);
+  arenaGlobalControlImplicitArena(2*default_P);
+  dumpParticipation(2*default_P);
+  return 0;
+}
+
 #include <atomic>
-#include <cstdio>
-#include <vector>
-#include <map>
-#include <set>
+#include <iostream>
 #include <vector>
 
 std::atomic<int> next_tid;
@@ -46,12 +61,6 @@ void noteParticipation() {
   ++tid_participation[t];
 }
 
-void doWork(double seconds) {
-  noteParticipation();
-  tbb::tick_count t0 = tbb::tick_count::now();
-  while ((tbb::tick_count::now() - t0).seconds() < seconds);
-}
-
 void clearParticipation() {
   next_tid = 0;
   my_tid.clear();
@@ -60,7 +69,6 @@ void clearParticipation() {
 }
 
 void dumpParticipation(int p) {
-  int end = next_tid;
   int sum = tid_participation[0];
   std::cout << "[" << tid_participation[0];
   for (int i = 1; i < std::min(p, default_P); ++i) {
@@ -71,17 +79,9 @@ void dumpParticipation(int p) {
     std::cout << ", -";
   std::cout << "]\n" 
             << "sum == " << sum  << "\n"
-            << "expected sum " << 10*default_P << "\n";
+            << "expected sum " << 10*default_P << "\n\n";
   clearParticipation();
 }
 
-int main() {
-  arenaGlobalControlImplicitArena(default_P);
-  dumpParticipation(default_P);
-  arenaGlobalControlImplicitArena(default_P/2);
-  dumpParticipation(default_P/2);
-  arenaGlobalControlImplicitArena(2*default_P);
-  dumpParticipation(2*default_P);
-  return 0;
-}
+
 
diff --git a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
index bf3fa3b863..8474c57de0 100644
--- a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
+++ b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp
@@ -14,58 +14,67 @@
     limitations under the License.
 */
 
-#include <iostream>
+#include <atomic>
 #include <thread>
 #include <tbb/tbb.h>
 
 const int default_P = tbb::info::default_concurrency();
-void doWork(int inc, double seconds);
-void waitUntil(int N);
 
-void arenaGlobalControlImplicitArena(int p, int inc) {
+using counter_t = std::atomic<int>;
+void waitUntil(int N, counter_t& c);
+void noteParticipation(int offset);
+void dumpParticipation();
+
+void doWork(int offset, double seconds) {
+  noteParticipation(offset);
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() < seconds);
+}
+
+counter_t counter1 = 0, counter2 = 0;
+
+void arenaGlobalControlImplicitArena(int p, int offset) {
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p);
 
+  // we use waitUntil to force overlap of the gc lifetimes
+  waitUntil(2, counter1);
+
   tbb::parallel_for(0, 
                     10*default_P, 
-                    [=](int) { doWork(inc, 0.01); });
+                    [=](int) { 
+                      doWork(offset, 0.01); 
+                    });
+
+  // we prevent either gc from being destroyed until both are done
+  waitUntil(2, counter2);
 }
 
 void runTwoThreads(int p0, int p1) {
-  std::thread t0([=]() {
-    waitUntil(2);
-    arenaGlobalControlImplicitArena(p0, 1);
-  });
-  std::thread t1([=]() {
-    waitUntil(2);
-    arenaGlobalControlImplicitArena(p1, 10000);
-  });
+  std::thread t0([=]() { arenaGlobalControlImplicitArena(p0, 1); });
+  std::thread t1([=]() { arenaGlobalControlImplicitArena(p1, 10000); });
   t0.join();
   t1.join();
 }
 
-#include <atomic>
-#include <cstdio>
-#include <vector>
-#include <map>
-#include <set>
+int main() {
+  runTwoThreads(default_P/2, default_P);
+  dumpParticipation();
+  return 0;
+}
+
+#include <iostream>
 #include <vector>
 
 std::atomic<int> next_tid;
 tbb::enumerable_thread_specific<int> my_tid(-1);
-std::vector<std::atomic<int>> tid_participation(2*default_P);
+std::vector<std::atomic<int>> tid_participation(default_P+1);
 
-void noteParticipation(int inc) {
+void noteParticipation(int offset) {
   auto& t = my_tid.local();
   if (t == -1) {
     t = next_tid++;
   }
-  tid_participation[t] += inc;
-}
-
-void doWork(int inc, double seconds) {
-  noteParticipation(inc);
-  tbb::tick_count t0 = tbb::tick_count::now();
-  while ((tbb::tick_count::now() - t0).seconds() < seconds);
+  tid_participation[t] += offset;
 }
 
 void clearParticipation() {
@@ -75,11 +84,10 @@ void clearParticipation() {
     p = 0;
 }
 
-void dumpParticipation(int p) {
-  int end = next_tid;
+void dumpParticipation() {
   int sum = tid_participation[0];
   std::cout << "[" << tid_participation[0];
-  for (int i = 1; i < 2*default_P + 1; ++i) {
+  for (int i = 1; i < tid_participation.size(); ++i) {
     sum += tid_participation[i];
     std::cout << ", " << tid_participation[i];
   }
@@ -87,18 +95,12 @@ void dumpParticipation(int p) {
             << "sum == " << sum << "\n"
             << "expected sum == " << 10*default_P + 10*default_P*10000 << "\n";
   clearParticipation();
+  counter1 = 0; counter2 = 0;
 }
 
-std::atomic<int> count_up = 0;
-
-void waitUntil(int N) {
-  ++count_up;
-  while (count_up != N);
+void waitUntil(int N, counter_t& c) {
+  ++c;
+  while (c != N);
 }
 
-int main() {
-  runTwoThreads(default_P/2, default_P);
-  dumpParticipation(default_P);
-  return 0;
-}
 
diff --git a/new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp b/new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp
new file mode 100644
index 0000000000..bef0d67a40
--- /dev/null
+++ b/new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp
@@ -0,0 +1,136 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <iostream>
+#include <math.h>
+#include <tbb/tbb.h>
+
+double serialPiExample(int num_intervals) {
+  double dx = 1.0 / num_intervals;
+  double sum = 0.0;
+  for (int i = 0; i < num_intervals; ++i) {
+    double x = (i+0.5)*dx;
+    double h = sqrt(1-x*x);
+    sum += h*dx;
+  }
+  return 4 * sum;
+}
+
+template< typename Partitioner >
+double reducePiExample(int num_intervals, int grainsize) {
+  double dx = 1.0 / num_intervals;
+  double sum = tbb::parallel_reduce(
+    /* range = */ tbb::blocked_range<int>(0, num_intervals, grainsize), 
+    /* idenity = */ 0.0,
+    /* func */ 
+    [=](const tbb::blocked_range<int>& r, double init) -> double {
+      for (int i = r.begin(); i != r.end(); ++i) {
+        double x = (i + 0.5)*dx;
+        double h = sqrt(1 - x*x);
+        init += h*dx;
+      }
+      return init;
+    },
+    /* reduction */
+    [](double x, double y) -> double {
+        return x + y;
+    }, 
+    /* partitioner */ Partitioner()
+  );
+  return 4 * sum;
+}
+
+template< typename Partitioner >
+double deterministicReducePiExample(int num_intervals, int grainsize) {
+  double dx = 1.0 / num_intervals;
+  double sum = tbb::parallel_deterministic_reduce(
+    /* range = */ tbb::blocked_range<int>(0, num_intervals, grainsize), 
+    /* identity = */ 0.0,
+    /* func */ 
+    [=](const tbb::blocked_range<int>& r, double init) -> double {
+      for (int i = r.begin(); i != r.end(); ++i) {
+        double x = (i + 0.5)*dx;
+        double h = sqrt(1 - x*x);
+        init += h*dx;
+      }
+      return init;
+    },
+    /* reduction */
+    [](double x, double y) -> double {
+      return x + y;
+    },
+    /* partitioner */ Partitioner()
+  );
+  return 4 * sum;
+}
+
+static void warmupTBB() {
+  tbb::parallel_for(0, tbb::info::default_concurrency(), [](int) {
+    tbb::tick_count t0 = tbb::tick_count::now();
+    while ((tbb::tick_count::now() - t0).seconds() < 0.01);
+  });
+}
+
+int main() {
+  // use the most performance codes
+  // only a single NUMA node
+  // and only 1 thread per core
+  tbb::task_arena::constraints c;
+  c.set_numa_id(tbb::info::numa_nodes()[0]);
+  c.set_core_type(tbb::info::core_types().back());
+  c.set_max_threads_per_core(1);
+  c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c)));
+  tbb::task_arena a(c);
+
+  std::cout << "Using an arena with " << a.max_concurrency() << " slots\n";
+
+  a.execute([&]() {
+    int num_intervals = 1<<26;
+    tbb::tick_count ts_0 = tbb::tick_count::now();
+    double spi = serialPiExample(num_intervals);
+    tbb::tick_count ts_1 = tbb::tick_count::now();
+    double serial_time = (ts_1 - ts_0).seconds();
+    std::cout << "serial, " << spi << ", " << serial_time << std::endl;
+    warmupTBB();
+    std::cout << "speedups relative to serial:" << std::endl;
+    std::cout << "gs, r-simple, d-simple, r-static, d-static, r-auto" << std::endl;
+    for (int gs = 1; gs <= num_intervals; gs *= 2) {
+        reducePiExample<tbb::auto_partitioner>(num_intervals, gs);
+        tbb::tick_count t0 = tbb::tick_count::now();
+        double v0 = reducePiExample<tbb::auto_partitioner>(num_intervals, gs);
+        tbb::tick_count t1 = tbb::tick_count::now();
+        double v1 = reducePiExample<tbb::simple_partitioner>(num_intervals, gs);
+        tbb::tick_count t2 = tbb::tick_count::now();
+        double v2 = reducePiExample<tbb::static_partitioner>(num_intervals, gs);
+        tbb::tick_count t3 = tbb::tick_count::now();
+        double v3 = deterministicReducePiExample<tbb::simple_partitioner>(num_intervals, gs);
+        tbb::tick_count t4 = tbb::tick_count::now();
+        double v4 = deterministicReducePiExample<tbb::static_partitioner>(num_intervals, gs);
+        tbb::tick_count t5 = tbb::tick_count::now();
+        std::cout << v0 << ", " << v1 << ", " << v2 
+                  << ", " << v3 << ", " << v4 << "\n";
+        std::cout << gs 
+                << ", " << serial_time / (t2-t1).seconds()
+                << ", " << serial_time / (t4-t3).seconds()
+                << ", " << serial_time / (t3-t2).seconds()
+                << ", " << serial_time / (t5-t4).seconds() 
+                << ", " << serial_time / (t1-t0).seconds()
+                << std::endl;
+    }
+  });
+  return 0;
+}
+
diff --git a/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp
new file mode 100644
index 0000000000..30f995e269
--- /dev/null
+++ b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp
@@ -0,0 +1,134 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+#include <tbb/tbb.h>
+
+template <typename Partitioner>
+void parForAdd(double v, int N, double *a, const Partitioner& p) {
+  tbb::parallel_for( tbb::blocked_range<int>(0, N, 1), 
+    [v, a](const tbb::blocked_range<int>& r) {
+      for (int i = r.begin(); i < r.end(); ++i) {
+        a[i] += v;
+      }
+    }, p
+  );
+}
+
+template <typename Partitioner>
+void parForAdd(double v, int N, double *a, Partitioner& p) {
+  tbb::parallel_for( tbb::blocked_range<int>(0, N, 1), 
+    [v, a](const tbb::blocked_range<int>& r) {
+      for (int i = r.begin(); i < r.end(); ++i) {
+        a[i] += v;
+      }
+    }, p
+  );
+}
+
+void resetV(int N, double *v);
+void resetA(int N, double *a);
+static void warmupTBB();
+
+int main(int argc, char *argv[]) {
+  // use the most performance codes
+  // only a single NUMA node
+  // and only 1 thread per core
+  tbb::task_arena::constraints c;
+  c.set_numa_id(tbb::info::numa_nodes()[0]);
+  c.set_core_type(tbb::info::core_types().back());
+  c.set_max_threads_per_core(1);
+  c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c)));
+  tbb::task_arena arena(c);
+
+  std::cout << "Using an arena with " << arena.max_concurrency() << " slots\n";
+
+  arena.execute([&]() {
+    int M = 10000;
+    int N = 100000;
+
+    std::cout << "P = " << tbb::info::default_concurrency()
+              << std::endl << "N = " << N 
+              << std::endl << "M = " << M << std::endl;
+
+    double *v = new double[M];
+    double *a = new double[N]; 
+
+    warmupTBB();
+    resetV(M, v);
+    resetA(N, a);
+    tbb::tick_count t0 = tbb::tick_count::now();
+    for (int i = 0; i < M; ++i) {
+      parForAdd(v[i], N, a, tbb::auto_partitioner{});
+    }
+    double auto_time = (tbb::tick_count::now() - t0).seconds();
+
+    warmupTBB();
+    resetA(N, a);
+    tbb::affinity_partitioner aff_p;
+    t0 = tbb::tick_count::now();
+    for (int i = 0; i < M; ++i) {
+      parForAdd(v[i], N, a, aff_p); 
+    }
+    double affinity_time = (tbb::tick_count::now() - t0).seconds();
+
+    warmupTBB();
+    resetA(N, a);
+    t0 = tbb::tick_count::now();
+    for (int i = 0; i < M; ++i) {
+      parForAdd(v[i], N, a, tbb::static_partitioner{});
+    }
+    double static_time = (tbb::tick_count::now() - t0).seconds();
+
+    std::cout << "auto_partitioner = " << auto_time << std::endl
+              << "affinity_partitioner = " << affinity_time << std::endl
+              << "static_partitioner = " << static_time << std::endl;
+
+    delete [] v;
+    delete [] a;
+  });
+
+  return 0;
+}
+
+
+void resetV(int N, double *v) {
+  for (int i = 0; i < N; ++i) {
+    v[i] = i;
+  }
+  std::shuffle(v, v+N, std::random_device{});
+}
+
+void resetA(int N, double *a) {
+  for (int i = 0; i < N; ++i) {
+    a[i] = 0;
+  }
+}
+
+static void warmupTBB() {
+  // This is a simple loop that should get workers started.
+  // oneTBB creates workers lazily on first use of the library
+  // so this hides the startup time when looking at trivial
+  // examples that do little real work. 
+  tbb::parallel_for(0, tbb::info::default_concurrency(), 
+    [=](int) {
+      tbb::tick_count t0 = tbb::tick_count::now();
+      while ((tbb::tick_count::now() - t0).seconds() < 0.01);
+    }
+  );
+}
\ No newline at end of file
diff --git a/new_examples/performance_tuning/parallel_for_partitioners.cpp b/new_examples/performance_tuning/parallel_for_partitioners.cpp
new file mode 100644
index 0000000000..b2d57de755
--- /dev/null
+++ b/new_examples/performance_tuning/parallel_for_partitioners.cpp
@@ -0,0 +1,99 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <iostream>
+#include <tbb/tbb.h>
+
+void doWork(double sec);
+
+template <typename Partitioner>
+void pforWork(int N, const Partitioner& p) {
+  tbb::parallel_for( tbb::blocked_range<int>(0, N, 1), 
+    [](const tbb::blocked_range<int>& r) {
+      for (int i = r.begin(); i < r.end(); ++i) {
+        doWork(i);
+      }
+    }, p
+  );
+}
+
+template <typename Partitioner>
+void pforWork(int N, Partitioner& p) {
+  tbb::parallel_for( tbb::blocked_range<int>(0, N, 1), 
+    [](const tbb::blocked_range<int>& r) {
+      for (int i = r.begin(); i < r.end(); ++i) {
+        doWork(i);
+      }
+    }, p
+  );
+}
+
+static void warmupTBB();
+
+int main(int argc, char *argv[]) {
+  int N = 1000;
+  int M = 10;
+
+  std::cout << "P = " << tbb::info::default_concurrency()
+            << std::endl << "M = " << M
+            << std::endl << "N = " << N << std::endl;
+
+   warmupTBB();
+   tbb::tick_count t0 = tbb::tick_count::now();
+   for (int i = 0; i < M; ++i) {
+     pforWork(N, tbb::auto_partitioner{});
+   }
+   double auto_time = (tbb::tick_count::now() - t0).seconds();
+
+   warmupTBB();
+   tbb::affinity_partitioner aff_p;
+   t0 = tbb::tick_count::now();
+   for (int i = 0; i < M; ++i) {
+     pforWork(N, aff_p); 
+   }
+   double affinity_time = (tbb::tick_count::now() - t0).seconds();
+
+   warmupTBB();
+   t0 = tbb::tick_count::now();
+   for (int i = 0; i < M; ++i) {
+     pforWork(N, tbb::static_partitioner{});
+  }
+  double static_time = (tbb::tick_count::now() - t0).seconds();
+
+  std::cout << "auto_partitioner = " << auto_time << " seconds" << std::endl
+            << "affinity_partitioner = " << affinity_time << " seconds" << std::endl
+            << "static_partitioner = " << static_time << " seconds" << std::endl;
+  return 0;
+}
+
+static void warmupTBB() {
+  // This is a simple loop that should get workers started.
+  // oneTBB creates workers lazily on first use of the library
+  // so this hides the startup time when looking at trivial
+  // examples that do little real work. 
+  tbb::parallel_for(0, tbb::info::default_concurrency(), 
+    [=](int) {
+      tbb::tick_count t0 = tbb::tick_count::now();
+      while ((tbb::tick_count::now() - t0).seconds() < 0.01);
+    }
+  );
+}
+
+void doWork(double usec) {
+  double sec = usec*1e-06;
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() <= sec);
+}
\ No newline at end of file
diff --git a/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp b/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp
new file mode 100644
index 0000000000..7155b0caa9
--- /dev/null
+++ b/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp
@@ -0,0 +1,114 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <iostream>
+#include <string>
+#include <tbb/tbb.h>
+
+static inline void spinWaitForAtLeast(double sec=0.0);
+
+static inline double executeFor(int num_trials, int N, double tpi) {
+  tbb::tick_count t0;
+  for (int t = -1; t < num_trials; ++t) {
+    if (!t) t0 = tbb::tick_count::now();
+    for (int i = 0; i < N; ++i) {
+      spinWaitForAtLeast(tpi);
+    } 
+  }
+  tbb::tick_count t1 = tbb::tick_count::now();
+  return (t1 - t0).seconds()/num_trials;
+}
+
+template< typename P >
+static inline double executePfor(int num_trials, int N,
+		                         int gs, P& p, double tpi) {
+  tbb::tick_count t0;
+  for (int t = -1; t < num_trials; ++t) {
+    if (!t) t0 = tbb::tick_count::now();
+    tbb::parallel_for (
+      tbb::blocked_range<int>{0, N, static_cast<size_t>(gs)},
+      [tpi](const tbb::blocked_range<int>& r) {
+        for (int i = r.begin(); i < r.end(); ++i) {
+          spinWaitForAtLeast(tpi);
+        } 
+      }, 
+      p
+    );
+  }
+  tbb::tick_count t1 = tbb::tick_count::now();
+  return (t1 - t0).seconds()/num_trials;
+}
+
+int main() {
+  // use the most performance codes
+  // only a single NUMA node
+  // and only 1 thread per core
+  tbb::task_arena::constraints c;
+  c.set_numa_id(tbb::info::numa_nodes()[0]);
+  c.set_core_type(tbb::info::core_types().back());
+  c.set_max_threads_per_core(1);
+  c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c)));
+  tbb::task_arena a(c);
+
+  std::cout << "Using an arena with " << a.max_concurrency() << " slots\n";
+
+  a.execute([&]() {
+    tbb::auto_partitioner auto_p;
+    tbb::simple_partitioner simple_p;
+    tbb::static_partitioner static_p;
+    const std::string pname[4] = {"simple", "auto", "affinity", "static"};
+
+    const int N = 262144;
+    const int T = 20;
+    const double ten_ns = 0.00000001;
+    const double twenty_us = 0.00002;
+    double timing[4][19];
+
+    for (double tpi = ten_ns; tpi < twenty_us; tpi *= 10) { 
+      std::cout << "Speedups for " << tpi << " seconds per iteration" << std::endl
+                << "partitioner";
+      for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) 
+        std::cout << ", " << gs;
+      std::cout << std::endl;
+
+      double serial_time = executeFor(T, N, tpi);
+
+      for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) {
+        tbb::affinity_partitioner affinity_p;
+        spinWaitForAtLeast(0.001);
+        timing[0][i] = executePfor(T, N, gs, simple_p, tpi);
+        timing[1][i] = executePfor(T, N, gs, auto_p, tpi);
+        timing[2][i] = executePfor(T, N, gs, affinity_p, tpi);
+        timing[3][i] = executePfor(T, N, gs, static_p, tpi);
+      }
+      for (int p = 0; p < 4; ++p) {
+        std::cout << pname[p];  
+        for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) 
+          std::cout << ", " << serial_time/timing[p][i];
+        std::cout << std::endl;
+      }
+      std::cout << std::endl;
+    }
+  });
+
+  return 0;
+}
+
+static inline void spinWaitForAtLeast(double sec) {
+  if (sec == 0.0) return;
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() < sec);
+}
diff --git a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
new file mode 100644
index 0000000000..323a43de3b
--- /dev/null
+++ b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp
@@ -0,0 +1,182 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <tbb/tbb.h>
+
+
+double serialTranspose(int N, double *a, double *b) {
+  tbb::tick_count t0 = tbb::tick_count::now();
+  for (int i = 0; i < N; ++i) {
+    for (int j = 0; j < N; ++j) {
+      b[j*N+i] = a[i*N+j];
+    }
+  }
+  tbb::tick_count t1 = tbb::tick_count::now();
+
+  return (t1-t0).seconds();   
+}
+
+void obliviousTranspose(int N, int ib, int ie, int jb, int je, 
+                              double *a, double *b, int gs) {
+  int ilen = ie-ib;
+  int jlen = je-jb;
+  if (ilen > gs ||  jlen > gs) {
+     if ( ilen > jlen ) {
+       int imid = (ib+ie)/2;
+       obliviousTranspose(N, ib, imid, jb, je, a, b, gs);
+       obliviousTranspose(N, imid, ie, jb, je, a, b, gs);
+     } else {
+       int jmid = (jb+je)/2;
+       obliviousTranspose(N, ib, ie, jb, jmid, a, b, gs);
+       obliviousTranspose(N, ib, ie, jmid, je, a, b, gs);
+     }
+  } else {
+    for (int i = ib; i < ie; ++i) {
+      for (int j = jb; j < je; ++j) {
+        b[j*N+i] = a[i*N+j];
+      }
+    }
+  }
+}
+
+double serialObliviousTranspose(int N, double *a, double *b, int gs) {
+   tbb::tick_count t0 = tbb::tick_count::now();
+   obliviousTranspose(N, 0, N, 0, N, a, b, gs);
+   tbb::tick_count t1 = tbb::tick_count::now();
+   return (t1-t0).seconds();   
+}
+
+template< typename P >
+double pforTranspose(int N, double *a, double *b, int gs) {
+   tbb::tick_count t0 = tbb::tick_count::now();
+   tbb::parallel_for( tbb::blocked_range<int>(0, N, gs),
+     [N, a, b](const tbb::blocked_range<int>& r) {
+       for (int i = r.begin(); i < r.end(); ++i) {
+         for (int j = 0; j < N; ++j) {
+           b[j*N+i] = a[i*N+j];
+         }
+       }
+     }, P() 
+   );
+   tbb::tick_count t1 = tbb::tick_count::now();
+   return (t1-t0).seconds();   
+}
+
+template<typename P>
+double pforTranspose2d(int N, double *a, double *b, int gs) {
+  tbb::tick_count t0 = tbb::tick_count::now();
+  tbb::parallel_for( tbb::blocked_range2d<int,int>{
+                        0, N, static_cast<size_t>(gs), 
+                        0, N, static_cast<size_t>(gs)},
+    [N, a, b](const tbb::blocked_range2d<int,int>& r) {
+      for (int i = r.rows().begin(); i < r.rows().end(); ++i) {
+        for (int j = r.cols().begin(); j < r.cols().end(); ++j) {
+          b[j*N+i] = a[i*N+j];
+        }
+      }
+    }, P()
+  );
+  tbb::tick_count t1 = tbb::tick_count::now();
+  return (t1-t0).seconds();
+}
+
+void setArray(int N, double *a);
+void checkTranspose(int N, double *a);
+
+#include <iostream>
+
+int main() {
+  // use the most performance codes
+  // only a single NUMA node
+  // and only 1 thread per core
+  tbb::task_arena::constraints c;
+  c.set_numa_id(tbb::info::numa_nodes()[0]);
+  c.set_core_type(tbb::info::core_types().back());
+  c.set_max_threads_per_core(1);
+  c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c)));
+  tbb::task_arena a(c);
+
+  std::cout << "Using an arena with " << a.max_concurrency() << " slots\n";
+
+  a.execute([&]() {
+    int N = 2<<12; // 8192
+    double *a = new double[N*N];
+    double *b = new double[N*N];
+    setArray(N, a);
+    setArray(N, b);
+
+    serialTranspose(N, a, b);
+    double ts = serialTranspose(N, a, b);
+    checkTranspose(N, b);
+    std::cout << "Serial Time = " << ts << std::endl;
+
+    std::cout << "Parallel Times:" << std::endl
+              << "grainsize, oblivious, 1d auto, 1d simple, 2d auto, 2d simple" << std::endl;
+    for (int gs = 1; gs <= N; gs *= 2) {
+      setArray(N, a);
+      setArray(N, b);
+      serialObliviousTranspose(N, a, b, gs);
+      double to = serialObliviousTranspose(N, a, b, gs);
+      checkTranspose(N, b);
+
+      setArray(N, a);
+      setArray(N, b);
+      pforTranspose<tbb::auto_partitioner>(N, a, b, gs);
+      double t1d_auto = pforTranspose<tbb::auto_partitioner>(N, a, b, gs);
+
+      setArray(N, a);
+      setArray(N, b);
+      pforTranspose<tbb::simple_partitioner>(N, a, b, gs);
+      double t1d_simple = pforTranspose<tbb::simple_partitioner>(N, a, b, gs);
+
+      setArray(N, a);
+      setArray(N, b);
+      pforTranspose2d<tbb::auto_partitioner>(N, a, b, gs);
+      double t2d_auto = pforTranspose2d<tbb::auto_partitioner>(N, a, b, gs);
+
+      setArray(N, a);
+      setArray(N, b);
+      pforTranspose2d<tbb::simple_partitioner>(N, a, b, gs);
+      double t2d_simple = pforTranspose2d<tbb::simple_partitioner>(N, a, b, gs);
+
+      std::cout << gs 
+                << ", " << to 
+                << ", " << t1d_auto 
+                << ", " << t1d_simple 
+                << ", " << t2d_auto
+                << ", " << t2d_simple << std::endl;
+    }
+  });
+  return 0;
+}
+
+void setArray(int N, double *a) {
+  for (int i = 0; i < N; ++i) {
+    for (int j = 0; j < N; ++j) {
+      a[i*N+j] = i;
+    }
+  }
+}
+
+void checkTranspose(int N, double *a) {
+   for (int i = 0; i < N; ++i) {
+     for (int j = 0; j < N; ++j) {
+       if (a[i*N+j] != j) {
+         std::cout << "Transpose failed" << std::endl;
+       }
+     }
+   }
+}
diff --git a/new_examples/performance_tuning/parallel_pipeline_timed.cpp b/new_examples/performance_tuning/parallel_pipeline_timed.cpp
new file mode 100644
index 0000000000..5e66274627
--- /dev/null
+++ b/new_examples/performance_tuning/parallel_pipeline_timed.cpp
@@ -0,0 +1,302 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <iostream>
+#include <memory>
+#include <vector>
+#include <tbb/tbb.h>
+
+void doWork(double sec) {
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() <= sec);
+}
+
+tbb::filter<int,int> makeMiddleFilters(int num_filters, tbb::filter_mode m, double usec,
+                                       int imbalanced_filter = -1, double imbalance_factor = 1) {
+  double w = (imbalanced_filter == num_filters) ? usec*imbalance_factor : usec;
+
+  auto f = tbb::make_filter<int, int>(m,
+                                      [=](int i) -> int {
+                                        doWork(w);
+                                        return i;
+                                      }); 
+  if (num_filters > 1)
+    return f & makeMiddleFilters(num_filters-1, m, usec, 
+                                 imbalanced_filter, imbalance_factor);
+  else
+    return f;
+}
+
+tbb::filter<void,void> buildFilterChain(std::atomic<int>& counter, std::vector<int>& data, 
+                                        int num_filters, tbb::filter_mode m, double usec,
+                                        double imbalance_factor = 1) {
+  counter = data.size() - 1;
+  tbb::filter<int, int> middle{};
+  tbb::filter<int,void> end{};
+  int imbalanced_filter = (imbalance_factor == 1) ? -1 : num_filters - 3;
+
+  if (num_filters > 1) {
+    tbb::filter<void,int> start = 
+      tbb::make_filter<void, int>(m,
+                                  [&counter, &data, usec](tbb::flow_control& fc) -> int {
+                                    int i = counter--;
+                                    if (i > 0) {
+                                      doWork(usec);
+                                      return data[i];
+                                    } else {
+                                      fc.stop();
+                                      return 0;
+                                    }
+                                  });
+
+
+      tbb::filter<int, void> end = 
+        tbb::make_filter<int, void>(m,
+                                    [=](int i) {
+                                      doWork(usec);
+                                    });
+
+      if (num_filters > 2) {
+        tbb::filter<int, int> middle = makeMiddleFilters(num_filters-2, m, usec, imbalanced_filter, imbalance_factor);
+        return start & middle & end;
+      } else {
+        return start & end;
+      }
+  } else {
+    return tbb::make_filter<void, void>(m,
+                                        [&](tbb::flow_control& fc) {
+                                          int i = counter--;
+                                          if (i > 0) {
+                                            doWork(usec);
+                                          } else {
+                                            fc.stop();
+                                          }
+                                        });
+  }
+}
+
+double runSerial(std::vector<int>& data, int num_filters, double usec, int num_items,
+                 int imbalanced_filter = -1, double imbalance_factor = 1) {
+  tbb::tick_count t0 = tbb::tick_count::now();
+  for (int i = 0; i < num_items; ++i) {
+    for (int j = 0; j < num_filters; ++j) {
+      if (j == imbalanced_filter) {
+        doWork(usec*imbalance_factor);
+      } else {
+        doWork(usec);
+      }
+    }
+  }
+  double total_time = (tbb::tick_count::now() - t0).seconds();
+  return total_time;   
+}
+
+std::vector<int> makeVector(int N);
+static void warmupTBB();
+
+std::vector<tbb::filter_mode> modes = {tbb::filter_mode::serial_in_order, 
+                                        tbb::filter_mode::serial_out_of_order, 
+                                        tbb::filter_mode::parallel };
+std::vector<std::string> mode_name = {"serial_in_order", 
+                                      "serial_out_of_order", 
+                                      "parallel"};
+
+
+
+void runEightFiltersOneTokenBalanced(int N, std::vector<int>& data) {
+  std::vector<double> spin_times = {1e-7, 1e-6, 1e-5, 1e-4};
+  int num_filters = 8;
+
+  std::cout << "\nVarying the mode and spin_time for 8 filters:\n";
+
+  // Balanced serial time
+  std::vector<double> serial_time;
+  for (double spin_time : spin_times) {
+    serial_time.push_back(runSerial(data, num_filters, spin_time, N));
+  }
+
+  std::cout << "\nmode";
+  for (double spin_time : spin_times) 
+    std::cout << ", " << spin_time;
+  std::cout << std::endl;
+
+  for (int m = 0; m < modes.size(); ++m) {
+    std::atomic<int> counter = 0;
+    std::cout << mode_name[m];
+    for (int st = 0; st < spin_times.size(); ++st) {
+      double spin_time = spin_times[st];
+
+      auto chain =
+        buildFilterChain(counter, data, num_filters, modes[m], spin_time);
+
+      tbb::tick_count t0 = tbb::tick_count::now();
+      warmupTBB();
+      tbb::parallel_pipeline(1, chain);
+      double t = (tbb::tick_count::now() - t0).seconds();
+      std::cout << ", " << serial_time[st]/t;
+    }
+    std::cout << std::endl; 
+  }
+}
+
+void runEightTokensIncreasingFilters(int N, std::vector<int>& data) {
+  int max_threads = tbb::this_task_arena::max_concurrency();
+  double spin_time = 0.0001;
+
+  std::cout << "\nVarying number of filters with 100us spin and 8 tokens:\n";
+
+  // Balanced serial time
+  std::vector<double> serial_time;
+  for (int num_filters = 1; num_filters <= max_threads; ++num_filters) {
+    serial_time.push_back(runSerial(data, num_filters, spin_time, N));
+  }
+
+  std::cout << "\nmode";
+  for (int num_filters = 1; num_filters <= max_threads; ++num_filters)
+    std::cout << ", " << num_filters;
+  std::cout << std::endl;
+
+  for (int m = 0; m < modes.size(); ++m) {
+    std::atomic<int> counter = 0;
+    std::cout << mode_name[m];
+    for (int num_filters = 1; num_filters <= max_threads; ++num_filters) {
+      auto chain =
+        buildFilterChain(counter, data, num_filters, modes[m], spin_time);
+
+      tbb::tick_count t0 = tbb::tick_count::now();
+      warmupTBB();
+      tbb::parallel_pipeline(max_threads, chain);
+      double t = (tbb::tick_count::now() - t0).seconds();
+      std::cout << ", " << serial_time[num_filters-1]/t;
+    }
+    std::cout << std::endl; 
+  }
+}
+
+void runEightFiltersIncreasingTokens(int N, std::vector<int>& data) {
+  int max_threads = tbb::this_task_arena::max_concurrency();
+  int num_filters = 8;
+  double spin_time = 0.0001;
+
+  std::cout << "\nVarying number of tokens with 8 filters and 100us spin:\n";
+
+  double serial_time = runSerial(data, num_filters, spin_time, N);
+
+  std::cout << "\nmode";
+  for (int num_tokens = 1; num_tokens <= 2*max_threads; ++num_tokens)
+    std::cout << ", " << num_tokens;
+  std::cout << std::endl;
+
+  for (int m = 0; m < modes.size(); ++m) {
+    std::atomic<int> counter = 0;
+    std::cout << mode_name[m];
+    for (int num_tokens = 1; num_tokens <= 2*max_threads; ++num_tokens) {
+      auto chain =
+        buildFilterChain(counter, data, num_filters, modes[m], spin_time);
+
+      tbb::tick_count t0 = tbb::tick_count::now();
+      warmupTBB();
+      tbb::parallel_pipeline(num_tokens, chain);
+      double t = (tbb::tick_count::now() - t0).seconds();
+      std::cout << ", " << serial_time/t;
+    }
+    std::cout << std::endl; 
+  }
+}
+
+void runEightFiltersImbalanced(int N, std::vector<int>& data) {
+  std::vector<double> imbalance = {0.1, 0.5, 0.75, 1.5, 2, 10};
+  int num_filters = 8;
+  int num_tokens = 8;
+  double spin_time = 0.0001;
+
+  std::cout << "\nVarying imbalance of 1 of 8 filters:\n";
+
+  // Imbalanced serial time
+  std::vector<double> serial_time;
+  for (double imb : imbalance) {
+    serial_time.push_back(runSerial(data, num_filters, spin_time, N, imb));
+  }
+
+  std::cout << "\nmode";
+  for (double imb : imbalance)
+    std::cout << ", " << imb;
+  std::cout << std::endl;
+
+  for (int m = 0; m < modes.size(); ++m) {
+    std::atomic<int> counter = 0;
+    std::cout << mode_name[m];
+    for (int imb = 0; imb < imbalance.size(); ++imb) {
+      double imbalance_factor = imbalance[imb];
+      auto chain =
+        buildFilterChain(counter, data, num_filters, modes[m], spin_time, imbalance_factor);
+
+      tbb::tick_count t0 = tbb::tick_count::now();
+      warmupTBB();
+      tbb::parallel_pipeline(num_tokens, chain);
+      double t = (tbb::tick_count::now() - t0).seconds();
+      std::cout << ", " << serial_time[imb]/t;
+    }
+    std::cout << std::endl; 
+  }
+}
+
+int main() {
+  // use the most performance codes
+  // only a single NUMA node
+  // and only 1 thread per core
+  tbb::task_arena::constraints c;
+  c.set_numa_id(tbb::info::numa_nodes()[0]);
+  c.set_core_type(tbb::info::core_types().front());
+  c.set_max_threads_per_core(1);
+  c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c)));
+  tbb::task_arena a(c);
+
+  std::cout << "Using an arena with " << a.max_concurrency() << " slots\n";
+
+  a.execute([&]() {
+    int N = 8000;
+    std::vector<int> data = makeVector(N);
+    //runEightFiltersOneTokenBalanced(N, data);
+    //runEightTokensIncreasingFilters(N, data);
+    //runEightFiltersIncreasingTokens(N, data);
+    runEightFiltersImbalanced(N, data);
+  });
+
+  return 0;
+}
+
+std::vector<int> makeVector(int N) {
+   std::vector<int> v;
+   v.reserve(N);
+   for (int i = 0; i < N; ++i) {
+     v.emplace_back(i);
+   }
+   return v;
+}
+
+static void warmupTBB() {
+  // This is a simple loop that should get workers started.
+  // oneTBB creates workers lazily on first use of the library
+  // so this hides the startup time when looking at trivial
+  // examples that do little real work. 
+  tbb::parallel_for(0, tbb::this_task_arena::max_concurrency(), 
+    [=](int) {
+      tbb::tick_count t0 = tbb::tick_count::now();
+      while ((tbb::tick_count::now() - t0).seconds() < 0.01);
+    }
+  );
+}
diff --git a/new_examples/performance_tuning/partitioners_imbalanced_loops.cpp b/new_examples/performance_tuning/partitioners_imbalanced_loops.cpp
new file mode 100644
index 0000000000..bcedeba6df
--- /dev/null
+++ b/new_examples/performance_tuning/partitioners_imbalanced_loops.cpp
@@ -0,0 +1,111 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <iostream>
+#include <tbb/tbb.h>
+
+void doWork(double sec);
+
+template <typename Partitioner>
+void buildingWork(int N, const Partitioner& p) {
+  tbb::parallel_for( tbb::blocked_range<int>(0, N, 1), 
+    [](const tbb::blocked_range<int>& r) {
+      for (int i = r.begin(); i < r.end(); ++i) {
+        doWork(i);
+      }
+    }, p
+  );
+}
+
+template <typename Partitioner>
+void buildingWork(int N, Partitioner& p) {
+  tbb::parallel_for( tbb::blocked_range<int>(0, N, 1), 
+    [](const tbb::blocked_range<int>& r) {
+      for (int i = r.begin(); i <  r.end(); ++i) {
+        doWork(i);
+      }
+    }, p
+  );
+}
+
+static void warmupTBB() {
+  // This is a simple loop that should get workers started.
+  // oneTBB creates workers lazily on first use of the library
+  // so this hides the startup time when looking at trivial
+  // examples that do little real work. 
+  tbb::parallel_for(0, tbb::info::default_concurrency(), 
+    [=](int) {
+      tbb::tick_count t0 = tbb::tick_count::now();
+      while ((tbb::tick_count::now() - t0).seconds() < 0.01);
+    }
+  );
+}
+
+void doWork(double usec) {
+  double sec = usec*1e-06;
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() <= sec);
+}
+
+int main(int argc, char *argv[]) {
+  // use the most performance codes
+  // only a single NUMA node
+  // and only 1 thread per core
+  tbb::task_arena::constraints c;
+  c.set_numa_id(tbb::info::numa_nodes()[0]);
+  c.set_core_type(tbb::info::core_types().back());
+  c.set_max_threads_per_core(1);
+  c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c)));
+  tbb::task_arena a(c);
+
+  std::cout << "Using an arena with " << a.max_concurrency() << " slots\n";
+
+  a.execute([&]() {
+    int N = 1000;
+    int M = 10;
+
+    std::cout << std::endl << "M = " << M
+              << std::endl << "N = " << N << std::endl;
+
+    warmupTBB();
+    tbb::tick_count t0 = tbb::tick_count::now();
+    for (int i = 0; i < M; ++i) {
+      buildingWork(N, tbb::auto_partitioner{});
+    }
+    double auto_time = (tbb::tick_count::now() - t0).seconds();
+
+    warmupTBB();
+    tbb::affinity_partitioner aff_p;
+    t0 = tbb::tick_count::now();
+    for (int i = 0; i < M; ++i) {
+      buildingWork(N, aff_p); 
+    }
+    double affinity_time = (tbb::tick_count::now() - t0).seconds();
+
+    warmupTBB();
+    t0 = tbb::tick_count::now();
+    for (int i = 0; i < M; ++i) {
+      buildingWork(N, tbb::static_partitioner{});
+    }
+    double static_time = (tbb::tick_count::now() - t0).seconds();
+
+    std::cout << "auto_partitioner = " << auto_time << " seconds" << std::endl
+              << "affinity_partitioner = " << affinity_time << " seconds" << std::endl
+              << "static_partitioner = " << static_time << " seconds" << std::endl;
+  });
+  return 0;
+}
+
diff --git a/new_examples/performance_tuning/priorities_and_conflict.cpp b/new_examples/performance_tuning/priorities_and_conflict.cpp
index b8ae8117aa..e7a1771d37 100644
--- a/new_examples/performance_tuning/priorities_and_conflict.cpp
+++ b/new_examples/performance_tuning/priorities_and_conflict.cpp
@@ -14,18 +14,24 @@
     limitations under the License.
 */
 
-#include <iostream>
+
 #include <thread>
 #include <tbb/tbb.h>
 
 void printArrival(tbb::task_arena::priority priority);
-void waitUntil(int N);
+
+using counter_t = std::atomic<int>;
+counter_t counter = 0;
+void waitUntil(int N, counter_t& c) {
+  ++c;
+  while (c != N);
+}
 
 void explicitArenaWithPriority(tbb::task_arena::priority priority) {
   tbb::task_arena a{tbb::info::default_concurrency(), 1, priority};
   a.execute([=]() {
     tbb::parallel_for(0, 
-                      10*tbb::info::default_concurrency(), 
+                      2*tbb::info::default_concurrency(), 
                       [=](int) { printArrival(priority); });
   });
 }
@@ -33,11 +39,11 @@ void explicitArenaWithPriority(tbb::task_arena::priority priority) {
 void runTwoThreads(tbb::task_arena::priority priority0, 
                    tbb::task_arena::priority priority1) {
   std::thread t0([=]() {
-    waitUntil(2);
+    waitUntil(2, counter);
     explicitArenaWithPriority(priority0);
   });
   std::thread t1([=]() {
-    waitUntil(2);
+    waitUntil(2, counter);
     explicitArenaWithPriority(priority1);
   });
   t0.join();
@@ -46,16 +52,32 @@ void runTwoThreads(tbb::task_arena::priority priority0,
 
 #include <cstdio>
 
+int main() {
+  counter = 0;
+  std::printf("\n\nrunTwoThreads with low (.) and high (|)\n");
+  runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::high);
+
+  counter = 0;
+  std::printf("\n\nrunTwoThreads with low (.) and normal (:)\n");
+  runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::normal);
+
+  counter = 0;
+  std::printf("\n\nrunTwoThreads with normal (:) and high (|)\n");
+  runTwoThreads(tbb::task_arena::priority::normal, tbb::task_arena::priority::high);
+  std::printf("\n");
+  return 0;
+}
+
 void printArrival(tbb::task_arena::priority priority) {
   switch (priority) {
     case tbb::task_arena::priority::low:
-      std::printf(" low ");
+      std::printf(".");
       break;
     case tbb::task_arena::priority::normal:
-      std::printf(" normal ");
+      std::printf(":");
       break;
     case tbb::task_arena::priority::high:
-      std::printf(" high ");
+      std::printf("|");
       break;
     default:
       break;
@@ -65,24 +87,5 @@ void printArrival(tbb::task_arena::priority priority) {
   while ((tbb::tick_count::now() - t0).seconds() < 0.01);
 }
 
-std::atomic<int> count_up = 0;
-void waitUntil(int N) {
-  ++count_up;
-  while (count_up != N);
-}
-
-int main() {
-  count_up = 0;
-  std::printf("\n\n\n\nrunTwoThreads(low, high)");
-  runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::high);
-
-  count_up = 0;
-  std::printf("\n\n\n\nrunTwoThreads(low, normal)");
-  runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::normal);
 
-  count_up = 0;
-  std::printf("\n\n\n\nrunTwoThreads(normal, high)");
-  runTwoThreads(tbb::task_arena::priority::normal, tbb::task_arena::priority::high);
-  return 0;
-}
 
diff --git a/new_examples/performance_tuning/task_scheduler_observer.cpp b/new_examples/performance_tuning/task_scheduler_observer.cpp
new file mode 100644
index 0000000000..b6f319eac5
--- /dev/null
+++ b/new_examples/performance_tuning/task_scheduler_observer.cpp
@@ -0,0 +1,114 @@
+/*
+    Copyright (c) 2024 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#include <iostream>
+#include <sstream>
+#include <thread>
+#include <tbb/tbb.h>
+
+// these are placeholder for where we would put OS-specific types and calls
+using affinity_mask_t = std::string;
+void set_thread_affinity( int tid, const affinity_mask_t& mask ) {
+  std::ostringstream buffer;
+  buffer << std::this_thread::get_id()
+         << " -> (" << tid
+         << ", " << mask << ")\n";
+  std::cout << buffer.str();
+}
+void restore_thread_affinity() {
+  std::ostringstream buffer;
+  buffer <<  std::this_thread::get_id() 
+         << " -> (restored)\n";
+  std::cout << buffer.str();
+}
+
+// observer class
+class PinningObserver : public tbb::task_scheduler_observer {
+public:
+    // HW affinity mask to be used for threads in an arena
+    affinity_mask_t m_mask;
+    PinningObserver( oneapi::tbb::task_arena &a, const affinity_mask_t& mask )
+        : tbb::task_scheduler_observer(a), m_mask(mask) {
+        observe(true); // activate the observer
+    }
+    void on_scheduler_entry( bool worker ) override {
+        set_thread_affinity(
+            tbb::this_task_arena::current_thread_index(), m_mask);
+    }
+    void on_scheduler_exit( bool worker ) override {
+        restore_thread_affinity();
+    }
+};
+
+int N = 1000;
+double w = 0.01;
+double f(double v);
+
+using counter_t = std::atomic<int>;
+counter_t counter = 0;
+void waitUntil(int N, counter_t& c) {
+  ++c;
+  while (c != N);
+}
+
+void observeTwoArenas() {
+  int P = tbb::info::default_concurrency();
+
+  // two arenas, each with half the hw threads
+  tbb::task_arena a0(P/2);
+  tbb::task_arena a1(P/2);
+
+  PinningObserver obs0(a0, "mask_zero");
+  PinningObserver obs1(a1, "mask_one");
+
+  // Execute consecutive loops
+  std::cout << "Execute a0 loop\n";
+  a0.execute([] {
+    tbb::parallel_for(0, N, [](int j) { f(w); });
+  });
+  std::cout << "Execute a1 loop\n";
+  a1.execute([] {
+    tbb::parallel_for(0, N, [](int j) { f(w); });
+  });
+
+  // Execute concurrent loops
+  std::cout << "Execute a0 and a1 concurrently\n";
+  std::thread t0([&]() { 
+    waitUntil(2, counter);
+    a0.execute([] {
+      tbb::parallel_for(0, N, [](int j) { f(w); });
+    });
+  });
+  std::thread t1([&]() { 
+    waitUntil(2, counter);
+    a1.execute([] {
+      tbb::parallel_for(0, N, [](int j) { f(w); });
+    });
+  });
+  t0.join();
+  t1.join();
+}
+
+int main() {
+  observeTwoArenas();
+  return 0;
+}
+
+double f(double v) {
+  tbb::tick_count t0 = tbb::tick_count::now();
+  while ((tbb::tick_count::now() - t0).seconds() < 0.01);
+  return 2*v;
+}
diff --git a/new_examples/tasks/CMakeLists.txt b/new_examples/tasks/CMakeLists.txt
index 79d468791d..3a1af18ea0 100644
--- a/new_examples/tasks/CMakeLists.txt
+++ b/new_examples/tasks/CMakeLists.txt
@@ -1,6 +1,3 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb -pthread")
-set(CMAKE_CXX_LINKER_FLAGS "-tbb -pthread")
-
 foreach(tpp parallel_invoke_fib.cpp
             task_group_fib.cpp
             task_group_poor_scaling.cpp
diff --git a/new_examples/tasks/parallel_invoke_fib.cpp b/new_examples/tasks/parallel_invoke_fib.cpp
index a5e2f7c584..903729fc0f 100644
--- a/new_examples/tasks/parallel_invoke_fib.cpp
+++ b/new_examples/tasks/parallel_invoke_fib.cpp
@@ -73,7 +73,7 @@ int main(int argc, char** argv)
 
   std::cout << "SerialFib:   " << fib_s << " Time: " << t_s << "\n";
   std::cout << "ParallelFib: " << fib_p << " Time: " << t_p << " Speedup: " << t_s/t_p << "\n";
-  std::cout << "ParallelFibCutoff_30: " << fib_c << " Time: " << t_p << " Speedup: " << t_s/t_c << "\n";
+  std::cout << "ParallelFibCutoff_30: " << fib_c << " Time: " << t_c << " Speedup: " << t_s/t_c << "\n";
   return 0;
 }
 
diff --git a/new_examples/tasks/resumable_tasks.cpp b/new_examples/tasks/resumable_tasks.cpp
index 9cd3cc8982..c1bedaf921 100755
--- a/new_examples/tasks/resumable_tasks.cpp
+++ b/new_examples/tasks/resumable_tasks.cpp
@@ -32,7 +32,7 @@ int main() {
         tbb::task::suspend([=,&sycl_q](tbb::task::suspend_point tag) {
           auto sycl_event = sycl_q.fill(a, id, N);
           sycl_q.submit([=](sycl::handler& sycl_h) {
-            sycl_h.depends_on(sycl_event); // only run after e is done
+            sycl_h.depends_on(sycl_event); // run after sycl_event is done
             sycl_h.host_task([tag]() { 
               tbb::task::resume(tag);
             });