diff --git a/new_examples/CMakeLists.txt b/new_examples/CMakeLists.txt index 33dbebb7c4..cf5875dd35 100644 --- a/new_examples/CMakeLists.txt +++ b/new_examples/CMakeLists.txt @@ -2,8 +2,17 @@ cmake_minimum_required (VERSION 3.4) project(tbb_tutorials LANGUAGES CXX) -set(CMAKE_CXX_COMPILER "icpx") -set(CMAKE_LINKER "icpx") +if (WIN32) + set(CMAKE_CXX_COMPILER "icx-cl") + set(CMAKE_LINKER "icpx") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb") + set(CMAKE_CXX_LINKER_FLAGS "-Qtbb") +else() + set(CMAKE_CXX_COMPILER "icpx") + set(CMAKE_LINKER "icpx") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") + set(CMAKE_CXX_LINKER_FLAGS "-tbb -lpthread") +endif() set(CMAKE_CXX_STANDARD 20) include (CTest) diff --git a/new_examples/algorithms/CMakeLists.txt b/new_examples/algorithms/CMakeLists.txt index 5c5e670e30..2821d69f0b 100644 --- a/new_examples/algorithms/CMakeLists.txt +++ b/new_examples/algorithms/CMakeLists.txt @@ -1,6 +1,3 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") -set(CMAKE_CXX_LINKER_FLAGS "-tbb") - foreach(tpp parallel_invoke_recursive_quicksort.cpp parallel_invoke_two_quicksorts.cpp parallel_for_trivial.cpp parallel_for_unoptimized_mxm.cpp parallel_reduce_max.cpp parallel_reduce_pi.cpp diff --git a/new_examples/algorithms/README.md b/new_examples/algorithms/README.md new file mode 100644 index 0000000000..7cd9092e85 --- /dev/null +++ b/new_examples/algorithms/README.md @@ -0,0 +1,3 @@ +# Chapter 2: Algorithms + +This directory contains the examples for Chapter 2. \ No newline at end of file diff --git a/new_examples/cancellation/CMakeLists.txt b/new_examples/cancellation/CMakeLists.txt index 0f26004913..adebb4e100 100644 --- a/new_examples/cancellation/CMakeLists.txt +++ b/new_examples/cancellation/CMakeLists.txt @@ -1,6 +1,3 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") -set(CMAKE_CXX_LINKER_FLAGS "-tbb") - foreach(tpp cancel_group_execution1.cpp cancel_group_execution2.cpp cancel_group_execution3.cpp diff --git a/new_examples/exception/CMakeLists.txt b/new_examples/exception/CMakeLists.txt index 27d14a0939..444f2c077a 100644 --- a/new_examples/exception/CMakeLists.txt +++ b/new_examples/exception/CMakeLists.txt @@ -1,9 +1,6 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") -set(CMAKE_CXX_LINKER_FLAGS "-tbb") - -foreach(tpp cpp_exceptions.cpp - exception_catch1.cpp - exception_catch2.cpp) +foreach(tpp exception_catch1.cpp + exception_catch2.cpp + exception_catch3.cpp) string(REPLACE ".cpp" "" texe ${tpp}) add_executable(${texe} ${tpp}) target_include_directories(${texe} PUBLIC diff --git a/new_examples/graph/CMakeLists.txt b/new_examples/graph/CMakeLists.txt index 1c971bd436..5f2114c42e 100644 --- a/new_examples/graph/CMakeLists.txt +++ b/new_examples/graph/CMakeLists.txt @@ -1,6 +1,3 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") -set(CMAKE_CXX_LINKER_FLAGS "-tbb") - foreach(tpp graph_composite_node.cpp graph_execute_while_building.cpp graph_fwd_substitution.cpp @@ -10,6 +7,7 @@ foreach(tpp graph_composite_node.cpp graph_node_priorities.cpp graph_reestablish_order.cpp graph_small_nodes.cpp + graph_stereoscopic_3d.cpp graph_two_nodes.cpp graph_two_nodes_deduced.cpp graph_with_join.cpp) diff --git a/new_examples/graph/graph_execute_while_building.cpp b/new_examples/graph/graph_execute_while_building.cpp index 7a977221df..387bb9a9f9 100644 --- a/new_examples/graph/graph_execute_while_building.cpp +++ b/new_examples/graph/graph_execute_while_building.cpp @@ -16,6 +16,7 @@ #include #include +#include #include struct config_t { @@ -23,16 +24,17 @@ struct config_t { int predecessor; }; -config_t configuration[] = { { 0, 3 }, - { 1, 4 }, - { 2, 5 }, - { 3, -1 }, - { 4, -1 }, - { 5, 3 }, - { 6, 4 }, - { 7, 1 } }; +// each element defines a node and what other node it must wait for +std::vector configuration = { { 0, 3 }, + { 1, 4 }, + { 2, 5 }, + { 3, -1 }, + { 4, -1 }, + { 5, 3 }, + { 6, 4 }, + { 7, 1 } }; -int num_nodes = sizeof(configuration) / sizeof(config_t); +const int num_nodes = configuration.size(); int main() { tbb::flow::graph g; @@ -57,14 +59,18 @@ int main() { return m; } }); - // connect the new node to its future + // connect the new node to its "future" tbb::flow::make_edge(*work_nodes[c.id], future_nodes[c.id]); // start the node or link it to predecessor's promise if (c.predecessor != -1) { + // must connect to predecessor's "future" + // if the future is already written to this will start the node + // otherwise it will be started when the future is written std::printf("new %d with %d -> %d\n", c.id, c.predecessor, c.id); tbb::flow::make_edge(future_nodes[c.predecessor], *work_nodes[c.id]); } else { + // does not need to wait and can be started immediately std::printf("starting %d from main\n", c.id); work_nodes[c.id]->try_put(tbb::flow::continue_msg{}); } diff --git a/new_examples/graph/graph_loops.cpp b/new_examples/graph/graph_loops.cpp index b1dea8cb85..847cc544e6 100644 --- a/new_examples/graph/graph_loops.cpp +++ b/new_examples/graph/graph_loops.cpp @@ -27,8 +27,7 @@ void tryPutLoop() { } }; for (int count = 0; count < limit; ++count) { - int value = count; - my_node.try_put(value); + my_node.try_put(count); } g.wait_for_all(); } diff --git a/new_examples/graph/graph_small_nodes.cpp b/new_examples/graph/graph_small_nodes.cpp index 345ff196e6..6e2c9f5051 100644 --- a/new_examples/graph/graph_small_nodes.cpp +++ b/new_examples/graph/graph_small_nodes.cpp @@ -57,9 +57,9 @@ void small_nodes_lightweight() { tbb::flow::function_node< int, int > add( g, tbb::flow::unlimited, [](const int &v) { return v+1; } ); tbb::flow::function_node< int, int, tbb::flow::lightweight > multiply( g, tbb::flow::unlimited, - [](const int &v) { return v*2; } ); + [](const int &v) noexcept { return v*2; } ); tbb::flow::function_node< int, int, tbb::flow::lightweight > cube( g, tbb::flow::unlimited, - [](const int &v) { return v*v*v; } ); + [](const int &v) noexcept { return v*v*v; } ); tbb::flow::make_edge(add, multiply); tbb::flow::make_edge(multiply, cube); @@ -73,7 +73,7 @@ void small_nodes_combined_lightweight() { tbb::flow::graph g; tbb::flow::function_node< int, int, tbb::flow::lightweight > combined_node( g, tbb::flow::unlimited, - [](const int &v) { + [](const int &v) noexcept { auto v2 = (v+1)*2; return v2*v2*v2; }); diff --git a/new_examples/graph/graph_stereoscopic_3d.cpp b/new_examples/graph/graph_stereoscopic_3d.cpp index 05042a1829..35bb571b93 100644 --- a/new_examples/graph/graph_stereoscopic_3d.cpp +++ b/new_examples/graph/graph_stereoscopic_3d.cpp @@ -21,10 +21,10 @@ #include #include #include -#include "../algorithms/lodepng.h" +#include "../common/lodepng.h" #include -class PNGImage { +class Image { public: uint64_t frameNumber = -1; unsigned int width = 0, height = 0; @@ -34,26 +34,26 @@ class PNGImage { static const int greenOffset = 1; static const int blueOffset = 2; - PNGImage() {} - PNGImage(uint64_t frame_number, const std::string& file_name); - PNGImage(const PNGImage& p); - virtual ~PNGImage() {} + Image() {} + Image(uint64_t frame_number, const std::string& file_name); + Image(const Image& p); + virtual ~Image() {} void write() const; }; int getNextFrameNumber(); -PNGImage getLeftImage(uint64_t frameNumber); -PNGImage getRightImage(uint64_t frameNumber); -void increasePNGChannel(PNGImage& image, int channel_offset, int increase); -void mergePNGImages(PNGImage& right, const PNGImage& left); +Image getLeftImage(uint64_t frameNumber); +Image getRightImage(uint64_t frameNumber); +void increasePNGChannel(Image& image, int channel_offset, int increase); +void mergeImages(Image& right, const Image& left); void stereo3D() { - using Image = PNGImage; + using Image = Image; // step 1: create graph object tbb::flow::graph g; // step 2: create nodes - tbb::flow::input_node frame_no_node{g, + tbb::flow::input_node frame_no_node{g, []( tbb::flow_control &fc ) -> uint64_t { uint64_t frame_number = getNextFrameNumber(); if (frame_number) @@ -99,7 +99,7 @@ void stereo3D() { [] (std::tuple t) -> Image { auto& l = std::get<0>(t); auto& r = std::get<1>(t); - mergePNGImages(r, l); + mergeImages(r, l); return r; } }; @@ -128,7 +128,7 @@ void stereo3D() { g.wait_for_all(); } -PNGImage::PNGImage(uint64_t frame_number, const std::string& file_name) : +Image::Image(uint64_t frame_number, const std::string& file_name) : frameNumber{frame_number}, buffer{std::make_shared< std::vector >()} { if (lodepng::decode(*buffer, width, height, file_name)) { std::cerr << "Error: could not read PNG file!" << std::endl; @@ -136,11 +136,11 @@ PNGImage::PNGImage(uint64_t frame_number, const std::string& file_name) : } }; -PNGImage::PNGImage(const PNGImage& p) : frameNumber{p.frameNumber}, +Image::Image(const Image& p) : frameNumber{p.frameNumber}, width{p.width}, height{p.height}, buffer{p.buffer} {} -void PNGImage::write() const { +void Image::write() const { std::string file_name = std::string("out") + std::to_string(frameNumber) + ".png"; if (lodepng::encode(file_name, *buffer, width, height)) { std::cerr << "Error: could not write PNG file!" << std::endl; @@ -163,30 +163,30 @@ int getNextFrameNumber() { } } -PNGImage getLeftImage(uint64_t frameNumber) { - return PNGImage(frameNumber, "input1.png"); +Image getLeftImage(uint64_t frameNumber) { + return Image(frameNumber, "input1.png"); } -PNGImage getRightImage(uint64_t frameNumber) { - return PNGImage(frameNumber, "input2.png"); +Image getRightImage(uint64_t frameNumber) { + return Image(frameNumber, "input2.png"); } -void increasePNGChannel(PNGImage& image, int channel_offset, int increase) { - const int height_base = PNGImage::numChannels * image.width; +void increasePNGChannel(Image& image, int channel_offset, int increase) { + const int height_base = Image::numChannels * image.width; std::vector& buffer = *image.buffer; // Increase selected color channel by a predefined value for (unsigned int y = 0; y < image.height; y++) { const int height_offset = height_base * y; for (unsigned int x = 0; x < image.width; x++) { - int pixel_offset = height_offset + PNGImage::numChannels * x + channel_offset; + int pixel_offset = height_offset + Image::numChannels * x + channel_offset; buffer[pixel_offset] = static_cast(std::min(buffer[pixel_offset] + increase, 255)); } } } -void mergePNGImages(PNGImage& right, const PNGImage& left) { - const int channels_per_pixel = PNGImage::numChannels; +void mergeImages(Image& right, const Image& left) { + const int channels_per_pixel = Image::numChannels; const int height_base = channels_per_pixel * right.width; std::vector& left_buffer = *left.buffer; std::vector& right_buffer = *right.buffer; @@ -195,7 +195,7 @@ void mergePNGImages(PNGImage& right, const PNGImage& left) { const int height_offset = height_base * y; for (unsigned int x = 0; x < right.width; x++) { const int pixel_offset = height_offset + channels_per_pixel * x; - const int red_index = pixel_offset + PNGImage::redOffset; + const int red_index = pixel_offset + Image::redOffset; right_buffer[red_index] = left_buffer[red_index]; } } diff --git a/new_examples/graph/graph_two_nodes.cpp b/new_examples/graph/graph_two_nodes.cpp index 1a3d647744..481667e714 100644 --- a/new_examples/graph/graph_two_nodes.cpp +++ b/new_examples/graph/graph_two_nodes.cpp @@ -40,7 +40,7 @@ void graphTwoNodes() { // step 3: add edges tbb::flow::make_edge(my_first_node, my_second_node); - // step 4: send messages + // step 4: send message that eagerly starts graph execution my_first_node.try_put(10); // step 5: wait for graph to complete diff --git a/new_examples/graph/graph_with_join.cpp b/new_examples/graph/graph_with_join.cpp index 88ad9639cb..2efc46212a 100644 --- a/new_examples/graph/graph_with_join.cpp +++ b/new_examples/graph/graph_with_join.cpp @@ -55,7 +55,7 @@ void graphJoin() { make_edge(my_other_node, tbb::flow::input_port<1>(my_join_node)); make_edge(my_join_node, my_final_node); - // step 4: send messages + // step 4: send messages that eagerly start graph execution my_node.try_put(1); my_other_node.try_put(2); // step 5: wait for the graph to complete diff --git a/new_examples/intro/CMakeLists.txt b/new_examples/intro/CMakeLists.txt index 1efda6b038..9e44001630 100644 --- a/new_examples/intro/CMakeLists.txt +++ b/new_examples/intro/CMakeLists.txt @@ -1,6 +1,3 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") -set(CMAKE_CXX_LINKER_FLAGS "-tbb") - foreach(tpp intro_pi.cpp intro_pi_timing.cpp intro_flowgraph.cpp diff --git a/new_examples/migration/CMakeLists.txt b/new_examples/migration/CMakeLists.txt index d52df94a67..f489f910cb 100644 --- a/new_examples/migration/CMakeLists.txt +++ b/new_examples/migration/CMakeLists.txt @@ -1,7 +1,5 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb -pthread") -set(CMAKE_CXX_LINKER_FLAGS "-tbb -pthread") - -foreach(tpp migrate_task_scheduler_init.cpp +foreach(tpp migrate_atomics.cpp + migrate_task_scheduler_init.cpp migrate_parallel_do.cpp migrate_priorities.cpp migrate_task_blocking.cpp diff --git a/new_examples/migration/migrate_atomics.cpp b/new_examples/migration/migrate_atomics.cpp new file mode 100644 index 0000000000..95c0c551b5 --- /dev/null +++ b/new_examples/migration/migrate_atomics.cpp @@ -0,0 +1,80 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include +#include +#include + +#include +#include + +#if TBB_VERSION_MAJOR > 2020 +#include +#else +#include +#endif + +int main(int argc, char** argv) { + long int n = 1000000000; + constexpr int num_bins = 256; + + // Initialize random number generator + std::random_device seed; // Random device seed + std::mt19937 mte{seed()}; // mersenne_twister_engine + std::uniform_int_distribution<> uniform{0,num_bins}; + // Initialize image + std::vector image; // empty vector + image.reserve(n); // image vector prealocated + std::generate_n(std::back_inserter(image), n, + [&] { return uniform(mte); } + ); + // Initialize histogram + std::vector hist(num_bins); + + // Serial execution + tbb::tick_count t0 = tbb::tick_count::now(); + std::for_each(image.begin(), image.end(), + [&](uint8_t i){hist[i]++;}); + tbb::tick_count t1 = tbb::tick_count::now(); + double t_serial = (t1 - t0).seconds(); + + // Parallel execution + #if TBB_VERSION_MAJOR > 2020 + std::vector> hist_p(num_bins); + #else + #warning Using tbb::atomic instead of std::atomic + std::vector> hist_p(num_bins); + #endif + + t0 = tbb::tick_count::now(); + parallel_for(tbb::blocked_range{0, image.size()}, + [&](const tbb::blocked_range& r) + { + for (size_t i = r.begin(); i < r.end(); ++i) + hist_p[image[i]]++; + }); + t1 = tbb::tick_count::now(); + double t_parallel = (t1 - t0).seconds(); + + std::cout << "Serial: " << t_serial << ", "; + std::cout << "Parallel: " << t_parallel << ", "; + std::cout << "Speed-up: " << t_serial/t_parallel << std::endl; + + if (!std::equal(hist.begin(),hist.end(),hist_p.begin())) + std::cerr << "Parallel computation failed!!" << std::endl; + return 0; +} diff --git a/new_examples/migration/migrate_bypass_tasks.cpp b/new_examples/migration/migrate_bypass_tasks.cpp index 64ba8f6b81..fc35d6782b 100644 --- a/new_examples/migration/migrate_bypass_tasks.cpp +++ b/new_examples/migration/migrate_bypass_tasks.cpp @@ -54,6 +54,7 @@ void serialFwdSubTiled(std::vector& x, } } +#if TBB_VERSION_MAJOR > 2020 tbb::task_handle fwdSubTGBody(tbb::task_group& tg, int N, int num_blocks, const std::pair bi, @@ -86,6 +87,30 @@ tbb::task_handle fwdSubTGBody(tbb::task_group& tg, return deferred_task; } +#else +void fwdSubTGBody(tbb::task_group& tg, + int N, int num_blocks, + const std::pair bi, + std::vector& x, + const std::vector& a, + std::vector& b, + std::vector>& ref_count) { + auto [r, c] = bi; + computeBlock(N, r, c, x, a, b); + // add successor to right if ready + if (c + 1 <= r && --ref_count[r*num_blocks + c + 1] == 0) { + tg.run([&, N, num_blocks, r, c]() { + fwdSubTGBody(tg, N, num_blocks, BlockIndex(r, c+1), x, a, b, ref_count); + }); + } + // add succesor below if ready + if (r + 1 < (size_t)num_blocks && --ref_count[(r+1)*num_blocks + c] == 0) { + tg.run([&, N, num_blocks, r, c]() { + fwdSubTGBody(tg, N, num_blocks, BlockIndex(r+1, c), x, a, b, ref_count); + }); + } +} +#endif void parallelFwdSubTaskGroup(std::vector& x, const std::vector& a, @@ -114,6 +139,7 @@ void parallelFwdSubTaskGroup(std::vector& x, } #if TBB_VERSION_MAJOR <= 2020 +#warning Using tbb::task directly using RootTask = tbb::empty_task; class FwdSubTask : public tbb::task { diff --git a/new_examples/migration/migrate_parallel_do.cpp b/new_examples/migration/migrate_parallel_do.cpp index 8672f9108d..335d94890d 100644 --- a/new_examples/migration/migrate_parallel_do.cpp +++ b/new_examples/migration/migrate_parallel_do.cpp @@ -87,6 +87,7 @@ void parallelFwdSub(std::vector& x, } ); #else +#warning Using tbb::parallel_do instead of tbb::parallel_for_each tbb::parallel_do( &top_left, &top_left+1, [&](const BlockIndex& bi, tbb::parallel_do_feeder& f) { auto [r, c] = bi; diff --git a/new_examples/migration/migrate_priorities.cpp b/new_examples/migration/migrate_priorities.cpp index 3b6cc43e51..e65bb4f5cf 100644 --- a/new_examples/migration/migrate_priorities.cpp +++ b/new_examples/migration/migrate_priorities.cpp @@ -71,6 +71,7 @@ void runParallelForWithHighPriority() { std::printf("\n"); } #else +#warning Using tbb::task directly auto P = tbb::task_scheduler_init::default_num_threads(); class MyTask : public tbb::task { diff --git a/new_examples/migration/migrate_recycling_tasks.cpp b/new_examples/migration/migrate_recycling_tasks.cpp index eae856a95b..f066c130a6 100644 --- a/new_examples/migration/migrate_recycling_tasks.cpp +++ b/new_examples/migration/migrate_recycling_tasks.cpp @@ -62,8 +62,8 @@ class FwdSubFunctor { const std::vector& a, std::vector& b, std::vector>& ref_count) : - my_tg(tg), my_N(N), my_num_blocks(num_blocks), - my_index(new BlockIndex{bi}), + my_tg(tg), my_index(new BlockIndex{bi}), + my_N(N), my_num_blocks(num_blocks), my_x(x), my_a(a), my_b(b), my_ref_count(ref_count) {} void operator()() const { @@ -123,6 +123,7 @@ void parallelFwdSub(std::vector& x, tg.wait(); } #else +#warning Using tbb::task directly with recycling using RootTask = tbb::empty_task; class FwdSubTask : public tbb::task { diff --git a/new_examples/migration/migrate_task_blocking.cpp b/new_examples/migration/migrate_task_blocking.cpp index b09edbaca3..0852e38108 100644 --- a/new_examples/migration/migrate_task_blocking.cpp +++ b/new_examples/migration/migrate_task_blocking.cpp @@ -28,6 +28,7 @@ void taskBlocking() { g.wait(); } #else +#warning Using tbb::task directly const int P = tbb::task_scheduler_init::default_num_threads(); class MyTask : public tbb::task { diff --git a/new_examples/migration/migrate_task_scheduler_init.cpp b/new_examples/migration/migrate_task_scheduler_init.cpp index 7021a38a31..94db31548f 100644 --- a/new_examples/migration/migrate_task_scheduler_init.cpp +++ b/new_examples/migration/migrate_task_scheduler_init.cpp @@ -19,7 +19,7 @@ void doWork(double seconds); #if TBB_VERSION_MAJOR > 2020 -const int N = 2*tbb::info::default_concurrency(); +const int N = tbb::info::default_concurrency(); void setThreadsAndSlots() { tbb::global_control gc(tbb::global_control::max_allowed_parallelism, N); @@ -32,6 +32,7 @@ void setThreadsAndSlots() { }); } #else +#warning Using tbb::task_scheduler_init instead of tbb::global_control const int N = tbb::task_scheduler_init::default_num_threads(); void setThreadsAndSlots() { @@ -77,18 +78,17 @@ void clearParticipation() { } void dumpParticipation(int p) { - int end = next_tid; int sum = tid_participation[0]; std::cout << "[" << tid_participation[0]; for (int i = 1; i < p; ++i) { sum += tid_participation[i]; std::cout << ", " << tid_participation[i]; } - for (int i = p; i < 2*N; ++i) + for (int i = p; i < N; ++i) std::cout << ", -"; std::cout << "]\n" << "sum == " << sum << "\n" - << "expected sum " << 2*10*N << "\n"; + << "expected sum " << 10*N << "\n"; clearParticipation(); } diff --git a/new_examples/migration/migrate_tasks_adding_work.cpp b/new_examples/migration/migrate_tasks_adding_work.cpp index 01686557e6..442b7eaed7 100644 --- a/new_examples/migration/migrate_tasks_adding_work.cpp +++ b/new_examples/migration/migrate_tasks_adding_work.cpp @@ -102,6 +102,7 @@ void parallelFwdSubTaskGroup(std::vector& x, } #if TBB_VERSION_MAJOR <= 2020 +#warning Using tbb::task directly using RootTask = tbb::empty_task; class FwdSubTask : public tbb::task { diff --git a/new_examples/performance_tuning/CMakeLists.txt b/new_examples/performance_tuning/CMakeLists.txt index 6879ea2e47..a48b218af5 100644 --- a/new_examples/performance_tuning/CMakeLists.txt +++ b/new_examples/performance_tuning/CMakeLists.txt @@ -1,19 +1,27 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb -pthread") -set(CMAKE_CXX_LINKER_FLAGS "-tbb -pthread") +if (WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qtbb") + set(CMAKE_CXX_LINKER_FLAGS "-Qtbb") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb") + set(CMAKE_CXX_LINKER_FLAGS "-tbb -lpthread") +endif() -foreach(tpp global_control_and_implicit_arena.cpp - global_control_and_explicit_arena.cpp - global_control_and_implicit_conflict.cpp - global_control_and_explicit_conflict.cpp - priorities_and_conflict.cpp - building_work.cpp - matrix_transpose.cpp - oblivious_transpose.cpp - oblivious_transpose_simple_partitioner.cpp - parallel_for_addition.cpp - timing_partitioners.cpp - blocked_ranges_trivial.cpp) - string(REPLACE ".cpp" "" texe ${tpp}) +foreach(tpp blocked_ranges_trivial.cpp + constraints.cpp + global_control_and_explicit_arena.cpp + global_control_and_explicit_conflict.cpp + global_control_and_implicit_arena.cpp + global_control_and_implicit_conflict.cpp + parallel_for_addition_partitioners.cpp + parallel_deterministic_reduce_partitioners.cpp + parallel_for_partitioners.cpp + parallel_for_partitioners_timed.cpp + parallel_for_transpose_partitioners.cpp + parallel_pipeline_timed.cpp + partitioners_imbalanced_loops.cpp + priorities_and_conflict.cpp + task_scheduler_observer.cpp) + string(REPLACE ".cpp" "" texe ${tpp}) add_executable(${texe} ${tpp}) target_include_directories(${texe} PUBLIC $ diff --git a/new_examples/performance_tuning/arena_trace.h b/new_examples/performance_tuning/arena_trace.h new file mode 100644 index 0000000000..2a680ed8e5 --- /dev/null +++ b/new_examples/performance_tuning/arena_trace.h @@ -0,0 +1,167 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ARENA_TRACE_H +#define ARENA_TRACE_H 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class arena_tracer { +public: + arena_tracer(const std::string& name) : m_t0(std::chrono::steady_clock::now()), m_name(name) { } + ~arena_tracer() { dump_trace(); } + + void add_arena(const std::string& n, tbb::task_arena& a) { + m_observers.local().emplace_back( std::make_shared(m_events, m_arena_id++, m_t0, n, a )); + } + +private: + using time_point_type = std::chrono::time_point; + time_point_type m_t0; + const std::string m_name; + std::atomic m_arena_id = 0; + + struct trace_event { + time_point_type t; + char ph; + bool is_worker; + std::thread::id tid; + int slot; + int pid; + trace_event( time_point_type t_, char ph_, + bool is_worker_, std::thread::id tid_, + int slot_, int pid_ ) : t(t_), ph(ph_), is_worker(is_worker_), tid(tid_), slot(slot_), pid(pid_) {} + }; + + using ets_vector_type = std::vector; + using ets_value_type = ets_vector_type; + using ets_type = tbb::combinable; + ets_type m_events; + + // observer class + class tracing_observer : public oneapi::tbb::task_scheduler_observer { + ets_type& m_events; + int m_arena_id; + time_point_type m_t; + const std::string m_name; + + public: + tracing_observer( ets_type& e, int arena_id, time_point_type t0, + const std::string& fn, oneapi::tbb::task_arena &a ) + : oneapi::tbb::task_scheduler_observer(a), m_events(e), m_arena_id(arena_id), m_t(t0), m_name(fn) { + observe(true); // activate the observer + } + + ~tracing_observer() { } + + int get_arena_id() { return m_arena_id; } + const std::string& get_arena_name() { return m_name; } + + void on_scheduler_entry( bool worker ) override { + auto& l = m_events.local(); + std::thread::id tid = std::this_thread::get_id(); + + l.emplace_back( + std::chrono::steady_clock::now(), + 'B', // ph + worker, // is_worker + tid, + oneapi::tbb::this_task_arena::current_thread_index(), + m_arena_id + ); + } + void on_scheduler_exit( bool worker ) override { + auto& l = m_events.local(); + std::thread::id tid = std::this_thread::get_id(); + + l.emplace_back( + std::chrono::steady_clock::now(), + 'E', // ph + worker, // is_worker + tid, + oneapi::tbb::this_task_arena::current_thread_index(), + m_arena_id + ); + } + }; + + void dump_trace() { + std::unordered_set was_seen; + + std::ofstream out(m_name); + out << "["; + + bool first = true; + for (auto& v : m_observers.range()) { + for (auto& obs : v) { + if (!first) out << ","; + first = false; + out << "\n{\"name\": \"process_name\", \"ph\" : \"M\"" + << ", \"pid\" : " << obs->get_arena_id() + << ", \"args\" : { \"name\" : \"" << obs->get_arena_name() << "\" }" + << "}"; + } + } + + ets_vector_type r; + m_events.combine_each( + [&r](const ets_vector_type& v) { + r.insert(r.end(), v.begin(), v.end()); + }); + + std::sort(r.begin(), r.end(), [](const trace_event& a, const trace_event& b) { return a.t < b.t; }); + + for (auto& e : r) { + out << ",\n{" + << "\"name\": \"" << e.tid << "\"" + << ", \"cat\": \"arena\"" + << ", \"ph\": \"" << e.ph << "\"" + << ", \"ts\": " << std::chrono::duration_cast(e.t-m_t0).count() + << ", \"pid\": " << e.pid + << ", \"tid\": " << e.slot + << "}"; + if (e.ph == 'B') { + out << ",\n{" + << "\"name\": \"flow\"" + << ", \"id\": " << e.tid + << ", \"cat\": \"arena\"" + << ", \"ts\": " << std::chrono::duration_cast(e.t-m_t0).count() + << ", \"pid\": " << e.pid + << ", \"tid\": " << e.slot; + if (was_seen.find(e.tid) != was_seen.end()) { + out << ", \"ph\": \"t\", \"bp\": \"e\" }"; + } else { + was_seen.insert(e.tid); + out << ", \"ph\": \"s\", \"bp\": \"e\" }"; + } + } + } + + out << "\n]\n"; + } + + tbb::enumerable_thread_specific>> m_observers; +}; + +#endif diff --git a/new_examples/performance_tuning/blocked_ranges_trivial.cpp b/new_examples/performance_tuning/blocked_ranges_trivial.cpp index 8515b1ce57..145f795575 100644 --- a/new_examples/performance_tuning/blocked_ranges_trivial.cpp +++ b/new_examples/performance_tuning/blocked_ranges_trivial.cpp @@ -16,9 +16,7 @@ #include #include - -#define TBB_PREVIEW_BLOCKED_RANGE_ND 1 -#include +#include "tbb/tbb.h" double f(double v); @@ -64,26 +62,10 @@ void example3d(int Z, int P, int N, int M, double* a) { ); } -void exampleNd(int Z, int P, int N, int M, double* a) { - tbb::parallel_for(tbb::blocked_rangeNd{{0,Z},{0,P},{0,N}, {0,M}}, - [=](const tbb::blocked_rangeNd& r_ijkl) { - const auto& r_i = r_ijkl.dim(0); - const auto& r_j = r_ijkl.dim(1); - const auto& r_k = r_ijkl.dim(2); - const auto& r_l = r_ijkl.dim(3); - for (int i = r_i.begin(); i < r_i.end(); ++i) - for (int j = r_j.begin(); j < r_j.end(); ++j) - for (int k = r_k.begin(); k < r_k.end(); ++k) - for (int l = r_l.begin(); l < r_l.end(); ++l) - a[i*P*N*M+j*N*M+k*M+l] = f(a[i*P*N*M+j*N*M+k*M+l]); - } - ); -} - #include static void warmupTBB(); -static bool resultsAreValid(int, const double*, const double*, const double*, const double*, const double*); +static bool resultsAreValid(int, const double*, const double*, const double*, const double*); static void serialDouble(int, double*); int main() { @@ -98,9 +80,9 @@ int main() { double* a1 = new double[Size]; double* a2 = new double[Size]; double* a3 = new double[Size]; - double* aN = new double[Size]; + for (int i = 0; i < Size; ++i) - a0[i] = a1[i] = a2[i] = a3[i] = aN[i] = 1.0; + a0[i] = a1[i] = a2[i] = a3[i] = 1.0; // Perform serial double tbb::tick_count t0 = tbb::tick_count::now(); @@ -127,21 +109,14 @@ int main() { double tbb3d_time = (tbb::tick_count::now() - t0).seconds(); std::printf("3d done\n"); - t0 = tbb::tick_count::now(); - exampleNd(Z, P, N, M, aN); - double tbbNd_time = (tbb::tick_count::now() - t0).seconds(); - std::printf("Nd done\n"); - - if (resultsAreValid(Size, a0, a1, a2, a3, aN)) { + if (resultsAreValid(Size, a0, a1, a2, a3)) { std::cout << "serial_time == " << serial_time << " seconds\n" << "tbb1d_time == " << tbb1d_time << " seconds\n" << "speedup == " << serial_time/tbb1d_time << "\n" << "tbb2d_time == " << tbb2d_time << " seconds\n" << "speedup == " << serial_time/tbb2d_time << "\n" << "tbb3d_time == " << tbb3d_time << " seconds\n" - << "speedup == " << serial_time/tbb3d_time << "\n" - << "tbbNd_time == " << tbbNd_time << " seconds\n" - << "speedup == " << serial_time/tbbNd_time << "\n"; + << "speedup == " << serial_time/tbb3d_time << "\n"; return 0; } else { std::cout << "ERROR: invalid results!\n"; @@ -181,14 +156,12 @@ double f(double v) { static bool resultsAreValid(int N, const double* a0, const double* a1, - const double* a2, const double* a3, - const double* aN) { + const double* a2, const double* a3) { for (int i = 0; i < N; ++i) { if (a0[i] != 2.0 || a1[i] != 2.0 || a2[i] != 2.0 - || a3[i] != 2.0 - || aN[i] != 2.0) { + || a3[i] != 2.0) { std::printf("%d: %f, %f, %f, %f\n", i, a0[i], a1[i], a2[i], a3[i]); std::cerr << "Invalid results" << std::endl; return false; diff --git a/new_examples/performance_tuning/constraints.cpp b/new_examples/performance_tuning/constraints.cpp new file mode 100644 index 0000000000..6df33f9366 --- /dev/null +++ b/new_examples/performance_tuning/constraints.cpp @@ -0,0 +1,108 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include + + +int N = 1000; +double w = 0.01; +double f(double v); + +void constrain_for_numa_nodes() { + std::vector numa_nodes = tbb::info::numa_nodes(); + std::vector arenas(numa_nodes.size()); + std::vector task_groups(numa_nodes.size()); + + // initialize each arena, each constrained to a different NUMA node + for (int i = 0; i < numa_nodes.size(); i++) + arenas[i].initialize(tbb::task_arena::constraints(numa_nodes[i]), 0); + + // enqueue work to all but the first arena, using the task_group to track work + // by using defer, the task_group reference count is incremented immediately + for (int i = 1; i < numa_nodes.size(); i++) + arenas[i].enqueue( + task_groups[i].defer([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }) + ); + + // directly execute the work to completion in the remaining arena + arenas[0].execute([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }); + + // join the other arenas to wait on their task_groups + for (int i = 1; i < numa_nodes.size(); i++) + arenas[i].execute([&task_groups, i] { task_groups[i].wait(); }); +} + +void constrain_for_core_type() { + std::vector core_types = tbb::info::core_types(); + tbb::task_arena arena( + tbb::task_arena::constraints{}.set_core_type(core_types.back()) + ); + + arena.execute([] { + tbb::parallel_for(0, N, [](int) { f(w); }); + }); +} + +void constrain_for_no_hyperthreading() { + tbb::task_arena::constraints c; + std::vector core_types = tbb::info::core_types(); + c.set_core_type(core_types.back()); + c.set_max_threads_per_core(1); + tbb::task_arena no_ht_arena(c); + + no_ht_arena.execute( [] { + tbb::parallel_for(0, N, [](int) { f(w); }); + }); +} + +void limit_concurrency_for_no_hyperthreading() { + tbb::task_arena::constraints c; + std::vector core_types = tbb::info::core_types(); + c.set_core_type(core_types.back()); + c.set_max_threads_per_core(1); + int no_ht_concurrency = tbb::info::default_concurrency(c); + tbb::task_arena arena( no_ht_concurrency ); + + arena.execute( [] { + tbb::parallel_for(0, N, [](int) { f(w); }); + }); +} + +#include + +int main() { + std::cout << "Running numa node constraint example\n"; + constrain_for_numa_nodes(); + std::cout << "Running core type constraint example\n"; + constrain_for_core_type(); + std::cout << "Running one thread per core constraint example\n"; + constrain_for_no_hyperthreading(); + std::cout << "Running limited concurrency example\n"; + limit_concurrency_for_no_hyperthreading(); + std::cout << "done\n"; + return 0; +} + +double f(double v) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + return 2*v; +} diff --git a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp index 3bf5e90c14..65443ab64e 100644 --- a/new_examples/performance_tuning/global_control_and_explicit_arena.cpp +++ b/new_examples/performance_tuning/global_control_and_explicit_arena.cpp @@ -14,7 +14,6 @@ limitations under the License. */ -#include #include const int default_P = tbb::info::default_concurrency(); @@ -33,10 +32,7 @@ void arenaGlobalControlExplicitArena(int p) { } #include -#include -#include -#include -#include +#include #include std::atomic next_tid; @@ -65,7 +61,6 @@ void clearParticipation() { } void dumpParticipation(int p) { - int end = next_tid; int sum = tid_participation[0]; std::cout << "[" << tid_participation[0]; for (int i = 1; i < p; ++i) { @@ -76,7 +71,7 @@ void dumpParticipation(int p) { std::cout << ", -"; std::cout << "]\n" << "sum == " << sum << "\n" - << "expected sum " << 10*default_P << "\n"; + << "expected sum " << 10*default_P << "\n\n"; clearParticipation(); } diff --git a/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp index a46c9d1100..35f5b5022b 100644 --- a/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp +++ b/new_examples/performance_tuning/global_control_and_explicit_conflict.cpp @@ -14,44 +14,55 @@ limitations under the License. */ -#include #include #include const int default_P = tbb::info::default_concurrency(); -void doWork(int inc, double seconds); -void waitUntil(int N); +using counter_t = std::atomic; +void waitUntil(int N, counter_t& c); +void noteParticipation(int offset); +void dumpParticipation(); -void arenaGlobalControlExplicitArena(int p, int inc) { +void doWork(int offset, double seconds) { + noteParticipation(offset); + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < seconds); +} + +counter_t counter1 = 0, counter2 = 0; + +void arenaGlobalControlExplicitArena(int p, int offset) { tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p); + // we use waitUntil to force overlap of the gc lifetimes + waitUntil(2, counter1); tbb::task_arena a{2*tbb::info::default_concurrency()}; a.execute([=]() { tbb::parallel_for(0, 10*tbb::info::default_concurrency(), - [=](int) { doWork(inc, 0.01); }); + [=](int) { doWork(offset, 0.01); }); }); + + // we prevent either gc from being destroyed until both are done + waitUntil(2, counter2); } void runTwoThreads(int p0, int p1) { - std::thread t0([=]() { - waitUntil(2); - arenaGlobalControlExplicitArena(p0, 1); - }); - std::thread t1([=]() { - waitUntil(2); - arenaGlobalControlExplicitArena(p1, 10000); - }); + std::thread t0([=]() { arenaGlobalControlExplicitArena(p0, 1); }); + std::thread t1([=]() { arenaGlobalControlExplicitArena(p1, 10000); }); t0.join(); t1.join(); } +int main() { + runTwoThreads(default_P/2, 2*default_P); + dumpParticipation(); + return 0; +} + #include -#include -#include -#include -#include +#include #include std::atomic next_tid; @@ -66,12 +77,6 @@ void noteParticipation(int inc) { tid_participation[t] += inc; } -void doWork(int inc, double seconds) { - noteParticipation(inc); - tbb::tick_count t0 = tbb::tick_count::now(); - while ((tbb::tick_count::now() - t0).seconds() < seconds); -} - void clearParticipation() { next_tid = 0; my_tid.clear(); @@ -79,11 +84,10 @@ void clearParticipation() { p = 0; } -void dumpParticipation(int p) { - int end = next_tid; +void dumpParticipation() { int sum = tid_participation[0]; std::cout << "[" << tid_participation[0]; - for (int i = 1; i < 2*default_P; ++i) { + for (int i = 1; i < tid_participation.size(); ++i) { sum += tid_participation[i]; std::cout << ", " << tid_participation[i]; } @@ -91,18 +95,12 @@ void dumpParticipation(int p) { << "sum == " << sum << "\n" << "expected sum == " << 10*default_P + 10*default_P*10000 << "\n"; clearParticipation(); + counter1 = 0; counter2 = 0; } -std::atomic count_up = 0; - -void waitUntil(int N) { - ++count_up; - while (count_up != N); +void waitUntil(int N, counter_t& c) { + ++c; + while (c != N); } -int main() { - runTwoThreads(default_P/2, 2*default_P); - dumpParticipation(2*default_P); - return 0; -} diff --git a/new_examples/performance_tuning/global_control_and_implicit_arena.cpp b/new_examples/performance_tuning/global_control_and_implicit_arena.cpp index a1074dc147..e9a3dfc97a 100644 --- a/new_examples/performance_tuning/global_control_and_implicit_arena.cpp +++ b/new_examples/performance_tuning/global_control_and_implicit_arena.cpp @@ -14,11 +14,18 @@ limitations under the License. */ -#include #include const int default_P = tbb::info::default_concurrency(); -void doWork(double seconds); + +void noteParticipation(); /* record info for participation vector */ +void dumpParticipation(int p); /* display participation vector */ + +void doWork(double seconds) { + noteParticipation(); + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < seconds); +} void arenaGlobalControlImplicitArena(int p) { tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p); @@ -27,11 +34,19 @@ void arenaGlobalControlImplicitArena(int p) { [](int) { doWork(0.01); }); } + +int main() { + arenaGlobalControlImplicitArena(default_P); + dumpParticipation(default_P); + arenaGlobalControlImplicitArena(default_P/2); + dumpParticipation(default_P/2); + arenaGlobalControlImplicitArena(2*default_P); + dumpParticipation(2*default_P); + return 0; +} + #include -#include -#include -#include -#include +#include #include std::atomic next_tid; @@ -46,12 +61,6 @@ void noteParticipation() { ++tid_participation[t]; } -void doWork(double seconds) { - noteParticipation(); - tbb::tick_count t0 = tbb::tick_count::now(); - while ((tbb::tick_count::now() - t0).seconds() < seconds); -} - void clearParticipation() { next_tid = 0; my_tid.clear(); @@ -60,7 +69,6 @@ void clearParticipation() { } void dumpParticipation(int p) { - int end = next_tid; int sum = tid_participation[0]; std::cout << "[" << tid_participation[0]; for (int i = 1; i < std::min(p, default_P); ++i) { @@ -71,17 +79,9 @@ void dumpParticipation(int p) { std::cout << ", -"; std::cout << "]\n" << "sum == " << sum << "\n" - << "expected sum " << 10*default_P << "\n"; + << "expected sum " << 10*default_P << "\n\n"; clearParticipation(); } -int main() { - arenaGlobalControlImplicitArena(default_P); - dumpParticipation(default_P); - arenaGlobalControlImplicitArena(default_P/2); - dumpParticipation(default_P/2); - arenaGlobalControlImplicitArena(2*default_P); - dumpParticipation(2*default_P); - return 0; -} + diff --git a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp index bf3fa3b863..8474c57de0 100644 --- a/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp +++ b/new_examples/performance_tuning/global_control_and_implicit_conflict.cpp @@ -14,58 +14,67 @@ limitations under the License. */ -#include +#include #include #include const int default_P = tbb::info::default_concurrency(); -void doWork(int inc, double seconds); -void waitUntil(int N); -void arenaGlobalControlImplicitArena(int p, int inc) { +using counter_t = std::atomic; +void waitUntil(int N, counter_t& c); +void noteParticipation(int offset); +void dumpParticipation(); + +void doWork(int offset, double seconds) { + noteParticipation(offset); + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < seconds); +} + +counter_t counter1 = 0, counter2 = 0; + +void arenaGlobalControlImplicitArena(int p, int offset) { tbb::global_control gc(tbb::global_control::max_allowed_parallelism, p); + // we use waitUntil to force overlap of the gc lifetimes + waitUntil(2, counter1); + tbb::parallel_for(0, 10*default_P, - [=](int) { doWork(inc, 0.01); }); + [=](int) { + doWork(offset, 0.01); + }); + + // we prevent either gc from being destroyed until both are done + waitUntil(2, counter2); } void runTwoThreads(int p0, int p1) { - std::thread t0([=]() { - waitUntil(2); - arenaGlobalControlImplicitArena(p0, 1); - }); - std::thread t1([=]() { - waitUntil(2); - arenaGlobalControlImplicitArena(p1, 10000); - }); + std::thread t0([=]() { arenaGlobalControlImplicitArena(p0, 1); }); + std::thread t1([=]() { arenaGlobalControlImplicitArena(p1, 10000); }); t0.join(); t1.join(); } -#include -#include -#include -#include -#include +int main() { + runTwoThreads(default_P/2, default_P); + dumpParticipation(); + return 0; +} + +#include #include std::atomic next_tid; tbb::enumerable_thread_specific my_tid(-1); -std::vector> tid_participation(2*default_P); +std::vector> tid_participation(default_P+1); -void noteParticipation(int inc) { +void noteParticipation(int offset) { auto& t = my_tid.local(); if (t == -1) { t = next_tid++; } - tid_participation[t] += inc; -} - -void doWork(int inc, double seconds) { - noteParticipation(inc); - tbb::tick_count t0 = tbb::tick_count::now(); - while ((tbb::tick_count::now() - t0).seconds() < seconds); + tid_participation[t] += offset; } void clearParticipation() { @@ -75,11 +84,10 @@ void clearParticipation() { p = 0; } -void dumpParticipation(int p) { - int end = next_tid; +void dumpParticipation() { int sum = tid_participation[0]; std::cout << "[" << tid_participation[0]; - for (int i = 1; i < 2*default_P + 1; ++i) { + for (int i = 1; i < tid_participation.size(); ++i) { sum += tid_participation[i]; std::cout << ", " << tid_participation[i]; } @@ -87,18 +95,12 @@ void dumpParticipation(int p) { << "sum == " << sum << "\n" << "expected sum == " << 10*default_P + 10*default_P*10000 << "\n"; clearParticipation(); + counter1 = 0; counter2 = 0; } -std::atomic count_up = 0; - -void waitUntil(int N) { - ++count_up; - while (count_up != N); +void waitUntil(int N, counter_t& c) { + ++c; + while (c != N); } -int main() { - runTwoThreads(default_P/2, default_P); - dumpParticipation(default_P); - return 0; -} diff --git a/new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp b/new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp new file mode 100644 index 0000000000..bef0d67a40 --- /dev/null +++ b/new_examples/performance_tuning/parallel_deterministic_reduce_partitioners.cpp @@ -0,0 +1,136 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include +#include + +double serialPiExample(int num_intervals) { + double dx = 1.0 / num_intervals; + double sum = 0.0; + for (int i = 0; i < num_intervals; ++i) { + double x = (i+0.5)*dx; + double h = sqrt(1-x*x); + sum += h*dx; + } + return 4 * sum; +} + +template< typename Partitioner > +double reducePiExample(int num_intervals, int grainsize) { + double dx = 1.0 / num_intervals; + double sum = tbb::parallel_reduce( + /* range = */ tbb::blocked_range(0, num_intervals, grainsize), + /* idenity = */ 0.0, + /* func */ + [=](const tbb::blocked_range& r, double init) -> double { + for (int i = r.begin(); i != r.end(); ++i) { + double x = (i + 0.5)*dx; + double h = sqrt(1 - x*x); + init += h*dx; + } + return init; + }, + /* reduction */ + [](double x, double y) -> double { + return x + y; + }, + /* partitioner */ Partitioner() + ); + return 4 * sum; +} + +template< typename Partitioner > +double deterministicReducePiExample(int num_intervals, int grainsize) { + double dx = 1.0 / num_intervals; + double sum = tbb::parallel_deterministic_reduce( + /* range = */ tbb::blocked_range(0, num_intervals, grainsize), + /* identity = */ 0.0, + /* func */ + [=](const tbb::blocked_range& r, double init) -> double { + for (int i = r.begin(); i != r.end(); ++i) { + double x = (i + 0.5)*dx; + double h = sqrt(1 - x*x); + init += h*dx; + } + return init; + }, + /* reduction */ + [](double x, double y) -> double { + return x + y; + }, + /* partitioner */ Partitioner() + ); + return 4 * sum; +} + +static void warmupTBB() { + tbb::parallel_for(0, tbb::info::default_concurrency(), [](int) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + }); +} + +int main() { + // use the most performance codes + // only a single NUMA node + // and only 1 thread per core + tbb::task_arena::constraints c; + c.set_numa_id(tbb::info::numa_nodes()[0]); + c.set_core_type(tbb::info::core_types().back()); + c.set_max_threads_per_core(1); + c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c))); + tbb::task_arena a(c); + + std::cout << "Using an arena with " << a.max_concurrency() << " slots\n"; + + a.execute([&]() { + int num_intervals = 1<<26; + tbb::tick_count ts_0 = tbb::tick_count::now(); + double spi = serialPiExample(num_intervals); + tbb::tick_count ts_1 = tbb::tick_count::now(); + double serial_time = (ts_1 - ts_0).seconds(); + std::cout << "serial, " << spi << ", " << serial_time << std::endl; + warmupTBB(); + std::cout << "speedups relative to serial:" << std::endl; + std::cout << "gs, r-simple, d-simple, r-static, d-static, r-auto" << std::endl; + for (int gs = 1; gs <= num_intervals; gs *= 2) { + reducePiExample(num_intervals, gs); + tbb::tick_count t0 = tbb::tick_count::now(); + double v0 = reducePiExample(num_intervals, gs); + tbb::tick_count t1 = tbb::tick_count::now(); + double v1 = reducePiExample(num_intervals, gs); + tbb::tick_count t2 = tbb::tick_count::now(); + double v2 = reducePiExample(num_intervals, gs); + tbb::tick_count t3 = tbb::tick_count::now(); + double v3 = deterministicReducePiExample(num_intervals, gs); + tbb::tick_count t4 = tbb::tick_count::now(); + double v4 = deterministicReducePiExample(num_intervals, gs); + tbb::tick_count t5 = tbb::tick_count::now(); + std::cout << v0 << ", " << v1 << ", " << v2 + << ", " << v3 << ", " << v4 << "\n"; + std::cout << gs + << ", " << serial_time / (t2-t1).seconds() + << ", " << serial_time / (t4-t3).seconds() + << ", " << serial_time / (t3-t2).seconds() + << ", " << serial_time / (t5-t4).seconds() + << ", " << serial_time / (t1-t0).seconds() + << std::endl; + } + }); + return 0; +} + diff --git a/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp new file mode 100644 index 0000000000..30f995e269 --- /dev/null +++ b/new_examples/performance_tuning/parallel_for_addition_partitioners.cpp @@ -0,0 +1,134 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include +#include +#include + +template +void parForAdd(double v, int N, double *a, const Partitioner& p) { + tbb::parallel_for( tbb::blocked_range(0, N, 1), + [v, a](const tbb::blocked_range& r) { + for (int i = r.begin(); i < r.end(); ++i) { + a[i] += v; + } + }, p + ); +} + +template +void parForAdd(double v, int N, double *a, Partitioner& p) { + tbb::parallel_for( tbb::blocked_range(0, N, 1), + [v, a](const tbb::blocked_range& r) { + for (int i = r.begin(); i < r.end(); ++i) { + a[i] += v; + } + }, p + ); +} + +void resetV(int N, double *v); +void resetA(int N, double *a); +static void warmupTBB(); + +int main(int argc, char *argv[]) { + // use the most performance codes + // only a single NUMA node + // and only 1 thread per core + tbb::task_arena::constraints c; + c.set_numa_id(tbb::info::numa_nodes()[0]); + c.set_core_type(tbb::info::core_types().back()); + c.set_max_threads_per_core(1); + c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c))); + tbb::task_arena arena(c); + + std::cout << "Using an arena with " << arena.max_concurrency() << " slots\n"; + + arena.execute([&]() { + int M = 10000; + int N = 100000; + + std::cout << "P = " << tbb::info::default_concurrency() + << std::endl << "N = " << N + << std::endl << "M = " << M << std::endl; + + double *v = new double[M]; + double *a = new double[N]; + + warmupTBB(); + resetV(M, v); + resetA(N, a); + tbb::tick_count t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + parForAdd(v[i], N, a, tbb::auto_partitioner{}); + } + double auto_time = (tbb::tick_count::now() - t0).seconds(); + + warmupTBB(); + resetA(N, a); + tbb::affinity_partitioner aff_p; + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + parForAdd(v[i], N, a, aff_p); + } + double affinity_time = (tbb::tick_count::now() - t0).seconds(); + + warmupTBB(); + resetA(N, a); + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + parForAdd(v[i], N, a, tbb::static_partitioner{}); + } + double static_time = (tbb::tick_count::now() - t0).seconds(); + + std::cout << "auto_partitioner = " << auto_time << std::endl + << "affinity_partitioner = " << affinity_time << std::endl + << "static_partitioner = " << static_time << std::endl; + + delete [] v; + delete [] a; + }); + + return 0; +} + + +void resetV(int N, double *v) { + for (int i = 0; i < N; ++i) { + v[i] = i; + } + std::shuffle(v, v+N, std::random_device{}); +} + +void resetA(int N, double *a) { + for (int i = 0; i < N; ++i) { + a[i] = 0; + } +} + +static void warmupTBB() { + // This is a simple loop that should get workers started. + // oneTBB creates workers lazily on first use of the library + // so this hides the startup time when looking at trivial + // examples that do little real work. + tbb::parallel_for(0, tbb::info::default_concurrency(), + [=](int) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + } + ); +} \ No newline at end of file diff --git a/new_examples/performance_tuning/parallel_for_partitioners.cpp b/new_examples/performance_tuning/parallel_for_partitioners.cpp new file mode 100644 index 0000000000..b2d57de755 --- /dev/null +++ b/new_examples/performance_tuning/parallel_for_partitioners.cpp @@ -0,0 +1,99 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include + +void doWork(double sec); + +template +void pforWork(int N, const Partitioner& p) { + tbb::parallel_for( tbb::blocked_range(0, N, 1), + [](const tbb::blocked_range& r) { + for (int i = r.begin(); i < r.end(); ++i) { + doWork(i); + } + }, p + ); +} + +template +void pforWork(int N, Partitioner& p) { + tbb::parallel_for( tbb::blocked_range(0, N, 1), + [](const tbb::blocked_range& r) { + for (int i = r.begin(); i < r.end(); ++i) { + doWork(i); + } + }, p + ); +} + +static void warmupTBB(); + +int main(int argc, char *argv[]) { + int N = 1000; + int M = 10; + + std::cout << "P = " << tbb::info::default_concurrency() + << std::endl << "M = " << M + << std::endl << "N = " << N << std::endl; + + warmupTBB(); + tbb::tick_count t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + pforWork(N, tbb::auto_partitioner{}); + } + double auto_time = (tbb::tick_count::now() - t0).seconds(); + + warmupTBB(); + tbb::affinity_partitioner aff_p; + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + pforWork(N, aff_p); + } + double affinity_time = (tbb::tick_count::now() - t0).seconds(); + + warmupTBB(); + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + pforWork(N, tbb::static_partitioner{}); + } + double static_time = (tbb::tick_count::now() - t0).seconds(); + + std::cout << "auto_partitioner = " << auto_time << " seconds" << std::endl + << "affinity_partitioner = " << affinity_time << " seconds" << std::endl + << "static_partitioner = " << static_time << " seconds" << std::endl; + return 0; +} + +static void warmupTBB() { + // This is a simple loop that should get workers started. + // oneTBB creates workers lazily on first use of the library + // so this hides the startup time when looking at trivial + // examples that do little real work. + tbb::parallel_for(0, tbb::info::default_concurrency(), + [=](int) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + } + ); +} + +void doWork(double usec) { + double sec = usec*1e-06; + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() <= sec); +} \ No newline at end of file diff --git a/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp b/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp new file mode 100644 index 0000000000..7155b0caa9 --- /dev/null +++ b/new_examples/performance_tuning/parallel_for_partitioners_timed.cpp @@ -0,0 +1,114 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include +#include + +static inline void spinWaitForAtLeast(double sec=0.0); + +static inline double executeFor(int num_trials, int N, double tpi) { + tbb::tick_count t0; + for (int t = -1; t < num_trials; ++t) { + if (!t) t0 = tbb::tick_count::now(); + for (int i = 0; i < N; ++i) { + spinWaitForAtLeast(tpi); + } + } + tbb::tick_count t1 = tbb::tick_count::now(); + return (t1 - t0).seconds()/num_trials; +} + +template< typename P > +static inline double executePfor(int num_trials, int N, + int gs, P& p, double tpi) { + tbb::tick_count t0; + for (int t = -1; t < num_trials; ++t) { + if (!t) t0 = tbb::tick_count::now(); + tbb::parallel_for ( + tbb::blocked_range{0, N, static_cast(gs)}, + [tpi](const tbb::blocked_range& r) { + for (int i = r.begin(); i < r.end(); ++i) { + spinWaitForAtLeast(tpi); + } + }, + p + ); + } + tbb::tick_count t1 = tbb::tick_count::now(); + return (t1 - t0).seconds()/num_trials; +} + +int main() { + // use the most performance codes + // only a single NUMA node + // and only 1 thread per core + tbb::task_arena::constraints c; + c.set_numa_id(tbb::info::numa_nodes()[0]); + c.set_core_type(tbb::info::core_types().back()); + c.set_max_threads_per_core(1); + c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c))); + tbb::task_arena a(c); + + std::cout << "Using an arena with " << a.max_concurrency() << " slots\n"; + + a.execute([&]() { + tbb::auto_partitioner auto_p; + tbb::simple_partitioner simple_p; + tbb::static_partitioner static_p; + const std::string pname[4] = {"simple", "auto", "affinity", "static"}; + + const int N = 262144; + const int T = 20; + const double ten_ns = 0.00000001; + const double twenty_us = 0.00002; + double timing[4][19]; + + for (double tpi = ten_ns; tpi < twenty_us; tpi *= 10) { + std::cout << "Speedups for " << tpi << " seconds per iteration" << std::endl + << "partitioner"; + for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) + std::cout << ", " << gs; + std::cout << std::endl; + + double serial_time = executeFor(T, N, tpi); + + for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) { + tbb::affinity_partitioner affinity_p; + spinWaitForAtLeast(0.001); + timing[0][i] = executePfor(T, N, gs, simple_p, tpi); + timing[1][i] = executePfor(T, N, gs, auto_p, tpi); + timing[2][i] = executePfor(T, N, gs, affinity_p, tpi); + timing[3][i] = executePfor(T, N, gs, static_p, tpi); + } + for (int p = 0; p < 4; ++p) { + std::cout << pname[p]; + for (int gs = 1, i = 0; gs <= N; gs *= 2, ++i) + std::cout << ", " << serial_time/timing[p][i]; + std::cout << std::endl; + } + std::cout << std::endl; + } + }); + + return 0; +} + +static inline void spinWaitForAtLeast(double sec) { + if (sec == 0.0) return; + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < sec); +} diff --git a/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp new file mode 100644 index 0000000000..323a43de3b --- /dev/null +++ b/new_examples/performance_tuning/parallel_for_transpose_partitioners.cpp @@ -0,0 +1,182 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include + + +double serialTranspose(int N, double *a, double *b) { + tbb::tick_count t0 = tbb::tick_count::now(); + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + b[j*N+i] = a[i*N+j]; + } + } + tbb::tick_count t1 = tbb::tick_count::now(); + + return (t1-t0).seconds(); +} + +void obliviousTranspose(int N, int ib, int ie, int jb, int je, + double *a, double *b, int gs) { + int ilen = ie-ib; + int jlen = je-jb; + if (ilen > gs || jlen > gs) { + if ( ilen > jlen ) { + int imid = (ib+ie)/2; + obliviousTranspose(N, ib, imid, jb, je, a, b, gs); + obliviousTranspose(N, imid, ie, jb, je, a, b, gs); + } else { + int jmid = (jb+je)/2; + obliviousTranspose(N, ib, ie, jb, jmid, a, b, gs); + obliviousTranspose(N, ib, ie, jmid, je, a, b, gs); + } + } else { + for (int i = ib; i < ie; ++i) { + for (int j = jb; j < je; ++j) { + b[j*N+i] = a[i*N+j]; + } + } + } +} + +double serialObliviousTranspose(int N, double *a, double *b, int gs) { + tbb::tick_count t0 = tbb::tick_count::now(); + obliviousTranspose(N, 0, N, 0, N, a, b, gs); + tbb::tick_count t1 = tbb::tick_count::now(); + return (t1-t0).seconds(); +} + +template< typename P > +double pforTranspose(int N, double *a, double *b, int gs) { + tbb::tick_count t0 = tbb::tick_count::now(); + tbb::parallel_for( tbb::blocked_range(0, N, gs), + [N, a, b](const tbb::blocked_range& r) { + for (int i = r.begin(); i < r.end(); ++i) { + for (int j = 0; j < N; ++j) { + b[j*N+i] = a[i*N+j]; + } + } + }, P() + ); + tbb::tick_count t1 = tbb::tick_count::now(); + return (t1-t0).seconds(); +} + +template +double pforTranspose2d(int N, double *a, double *b, int gs) { + tbb::tick_count t0 = tbb::tick_count::now(); + tbb::parallel_for( tbb::blocked_range2d{ + 0, N, static_cast(gs), + 0, N, static_cast(gs)}, + [N, a, b](const tbb::blocked_range2d& r) { + for (int i = r.rows().begin(); i < r.rows().end(); ++i) { + for (int j = r.cols().begin(); j < r.cols().end(); ++j) { + b[j*N+i] = a[i*N+j]; + } + } + }, P() + ); + tbb::tick_count t1 = tbb::tick_count::now(); + return (t1-t0).seconds(); +} + +void setArray(int N, double *a); +void checkTranspose(int N, double *a); + +#include + +int main() { + // use the most performance codes + // only a single NUMA node + // and only 1 thread per core + tbb::task_arena::constraints c; + c.set_numa_id(tbb::info::numa_nodes()[0]); + c.set_core_type(tbb::info::core_types().back()); + c.set_max_threads_per_core(1); + c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c))); + tbb::task_arena a(c); + + std::cout << "Using an arena with " << a.max_concurrency() << " slots\n"; + + a.execute([&]() { + int N = 2<<12; // 8192 + double *a = new double[N*N]; + double *b = new double[N*N]; + setArray(N, a); + setArray(N, b); + + serialTranspose(N, a, b); + double ts = serialTranspose(N, a, b); + checkTranspose(N, b); + std::cout << "Serial Time = " << ts << std::endl; + + std::cout << "Parallel Times:" << std::endl + << "grainsize, oblivious, 1d auto, 1d simple, 2d auto, 2d simple" << std::endl; + for (int gs = 1; gs <= N; gs *= 2) { + setArray(N, a); + setArray(N, b); + serialObliviousTranspose(N, a, b, gs); + double to = serialObliviousTranspose(N, a, b, gs); + checkTranspose(N, b); + + setArray(N, a); + setArray(N, b); + pforTranspose(N, a, b, gs); + double t1d_auto = pforTranspose(N, a, b, gs); + + setArray(N, a); + setArray(N, b); + pforTranspose(N, a, b, gs); + double t1d_simple = pforTranspose(N, a, b, gs); + + setArray(N, a); + setArray(N, b); + pforTranspose2d(N, a, b, gs); + double t2d_auto = pforTranspose2d(N, a, b, gs); + + setArray(N, a); + setArray(N, b); + pforTranspose2d(N, a, b, gs); + double t2d_simple = pforTranspose2d(N, a, b, gs); + + std::cout << gs + << ", " << to + << ", " << t1d_auto + << ", " << t1d_simple + << ", " << t2d_auto + << ", " << t2d_simple << std::endl; + } + }); + return 0; +} + +void setArray(int N, double *a) { + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + a[i*N+j] = i; + } + } +} + +void checkTranspose(int N, double *a) { + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + if (a[i*N+j] != j) { + std::cout << "Transpose failed" << std::endl; + } + } + } +} diff --git a/new_examples/performance_tuning/parallel_pipeline_timed.cpp b/new_examples/performance_tuning/parallel_pipeline_timed.cpp new file mode 100644 index 0000000000..5e66274627 --- /dev/null +++ b/new_examples/performance_tuning/parallel_pipeline_timed.cpp @@ -0,0 +1,302 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include +#include +#include + +void doWork(double sec) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() <= sec); +} + +tbb::filter makeMiddleFilters(int num_filters, tbb::filter_mode m, double usec, + int imbalanced_filter = -1, double imbalance_factor = 1) { + double w = (imbalanced_filter == num_filters) ? usec*imbalance_factor : usec; + + auto f = tbb::make_filter(m, + [=](int i) -> int { + doWork(w); + return i; + }); + if (num_filters > 1) + return f & makeMiddleFilters(num_filters-1, m, usec, + imbalanced_filter, imbalance_factor); + else + return f; +} + +tbb::filter buildFilterChain(std::atomic& counter, std::vector& data, + int num_filters, tbb::filter_mode m, double usec, + double imbalance_factor = 1) { + counter = data.size() - 1; + tbb::filter middle{}; + tbb::filter end{}; + int imbalanced_filter = (imbalance_factor == 1) ? -1 : num_filters - 3; + + if (num_filters > 1) { + tbb::filter start = + tbb::make_filter(m, + [&counter, &data, usec](tbb::flow_control& fc) -> int { + int i = counter--; + if (i > 0) { + doWork(usec); + return data[i]; + } else { + fc.stop(); + return 0; + } + }); + + + tbb::filter end = + tbb::make_filter(m, + [=](int i) { + doWork(usec); + }); + + if (num_filters > 2) { + tbb::filter middle = makeMiddleFilters(num_filters-2, m, usec, imbalanced_filter, imbalance_factor); + return start & middle & end; + } else { + return start & end; + } + } else { + return tbb::make_filter(m, + [&](tbb::flow_control& fc) { + int i = counter--; + if (i > 0) { + doWork(usec); + } else { + fc.stop(); + } + }); + } +} + +double runSerial(std::vector& data, int num_filters, double usec, int num_items, + int imbalanced_filter = -1, double imbalance_factor = 1) { + tbb::tick_count t0 = tbb::tick_count::now(); + for (int i = 0; i < num_items; ++i) { + for (int j = 0; j < num_filters; ++j) { + if (j == imbalanced_filter) { + doWork(usec*imbalance_factor); + } else { + doWork(usec); + } + } + } + double total_time = (tbb::tick_count::now() - t0).seconds(); + return total_time; +} + +std::vector makeVector(int N); +static void warmupTBB(); + +std::vector modes = {tbb::filter_mode::serial_in_order, + tbb::filter_mode::serial_out_of_order, + tbb::filter_mode::parallel }; +std::vector mode_name = {"serial_in_order", + "serial_out_of_order", + "parallel"}; + + + +void runEightFiltersOneTokenBalanced(int N, std::vector& data) { + std::vector spin_times = {1e-7, 1e-6, 1e-5, 1e-4}; + int num_filters = 8; + + std::cout << "\nVarying the mode and spin_time for 8 filters:\n"; + + // Balanced serial time + std::vector serial_time; + for (double spin_time : spin_times) { + serial_time.push_back(runSerial(data, num_filters, spin_time, N)); + } + + std::cout << "\nmode"; + for (double spin_time : spin_times) + std::cout << ", " << spin_time; + std::cout << std::endl; + + for (int m = 0; m < modes.size(); ++m) { + std::atomic counter = 0; + std::cout << mode_name[m]; + for (int st = 0; st < spin_times.size(); ++st) { + double spin_time = spin_times[st]; + + auto chain = + buildFilterChain(counter, data, num_filters, modes[m], spin_time); + + tbb::tick_count t0 = tbb::tick_count::now(); + warmupTBB(); + tbb::parallel_pipeline(1, chain); + double t = (tbb::tick_count::now() - t0).seconds(); + std::cout << ", " << serial_time[st]/t; + } + std::cout << std::endl; + } +} + +void runEightTokensIncreasingFilters(int N, std::vector& data) { + int max_threads = tbb::this_task_arena::max_concurrency(); + double spin_time = 0.0001; + + std::cout << "\nVarying number of filters with 100us spin and 8 tokens:\n"; + + // Balanced serial time + std::vector serial_time; + for (int num_filters = 1; num_filters <= max_threads; ++num_filters) { + serial_time.push_back(runSerial(data, num_filters, spin_time, N)); + } + + std::cout << "\nmode"; + for (int num_filters = 1; num_filters <= max_threads; ++num_filters) + std::cout << ", " << num_filters; + std::cout << std::endl; + + for (int m = 0; m < modes.size(); ++m) { + std::atomic counter = 0; + std::cout << mode_name[m]; + for (int num_filters = 1; num_filters <= max_threads; ++num_filters) { + auto chain = + buildFilterChain(counter, data, num_filters, modes[m], spin_time); + + tbb::tick_count t0 = tbb::tick_count::now(); + warmupTBB(); + tbb::parallel_pipeline(max_threads, chain); + double t = (tbb::tick_count::now() - t0).seconds(); + std::cout << ", " << serial_time[num_filters-1]/t; + } + std::cout << std::endl; + } +} + +void runEightFiltersIncreasingTokens(int N, std::vector& data) { + int max_threads = tbb::this_task_arena::max_concurrency(); + int num_filters = 8; + double spin_time = 0.0001; + + std::cout << "\nVarying number of tokens with 8 filters and 100us spin:\n"; + + double serial_time = runSerial(data, num_filters, spin_time, N); + + std::cout << "\nmode"; + for (int num_tokens = 1; num_tokens <= 2*max_threads; ++num_tokens) + std::cout << ", " << num_tokens; + std::cout << std::endl; + + for (int m = 0; m < modes.size(); ++m) { + std::atomic counter = 0; + std::cout << mode_name[m]; + for (int num_tokens = 1; num_tokens <= 2*max_threads; ++num_tokens) { + auto chain = + buildFilterChain(counter, data, num_filters, modes[m], spin_time); + + tbb::tick_count t0 = tbb::tick_count::now(); + warmupTBB(); + tbb::parallel_pipeline(num_tokens, chain); + double t = (tbb::tick_count::now() - t0).seconds(); + std::cout << ", " << serial_time/t; + } + std::cout << std::endl; + } +} + +void runEightFiltersImbalanced(int N, std::vector& data) { + std::vector imbalance = {0.1, 0.5, 0.75, 1.5, 2, 10}; + int num_filters = 8; + int num_tokens = 8; + double spin_time = 0.0001; + + std::cout << "\nVarying imbalance of 1 of 8 filters:\n"; + + // Imbalanced serial time + std::vector serial_time; + for (double imb : imbalance) { + serial_time.push_back(runSerial(data, num_filters, spin_time, N, imb)); + } + + std::cout << "\nmode"; + for (double imb : imbalance) + std::cout << ", " << imb; + std::cout << std::endl; + + for (int m = 0; m < modes.size(); ++m) { + std::atomic counter = 0; + std::cout << mode_name[m]; + for (int imb = 0; imb < imbalance.size(); ++imb) { + double imbalance_factor = imbalance[imb]; + auto chain = + buildFilterChain(counter, data, num_filters, modes[m], spin_time, imbalance_factor); + + tbb::tick_count t0 = tbb::tick_count::now(); + warmupTBB(); + tbb::parallel_pipeline(num_tokens, chain); + double t = (tbb::tick_count::now() - t0).seconds(); + std::cout << ", " << serial_time[imb]/t; + } + std::cout << std::endl; + } +} + +int main() { + // use the most performance codes + // only a single NUMA node + // and only 1 thread per core + tbb::task_arena::constraints c; + c.set_numa_id(tbb::info::numa_nodes()[0]); + c.set_core_type(tbb::info::core_types().front()); + c.set_max_threads_per_core(1); + c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c))); + tbb::task_arena a(c); + + std::cout << "Using an arena with " << a.max_concurrency() << " slots\n"; + + a.execute([&]() { + int N = 8000; + std::vector data = makeVector(N); + //runEightFiltersOneTokenBalanced(N, data); + //runEightTokensIncreasingFilters(N, data); + //runEightFiltersIncreasingTokens(N, data); + runEightFiltersImbalanced(N, data); + }); + + return 0; +} + +std::vector makeVector(int N) { + std::vector v; + v.reserve(N); + for (int i = 0; i < N; ++i) { + v.emplace_back(i); + } + return v; +} + +static void warmupTBB() { + // This is a simple loop that should get workers started. + // oneTBB creates workers lazily on first use of the library + // so this hides the startup time when looking at trivial + // examples that do little real work. + tbb::parallel_for(0, tbb::this_task_arena::max_concurrency(), + [=](int) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + } + ); +} diff --git a/new_examples/performance_tuning/partitioners_imbalanced_loops.cpp b/new_examples/performance_tuning/partitioners_imbalanced_loops.cpp new file mode 100644 index 0000000000..bcedeba6df --- /dev/null +++ b/new_examples/performance_tuning/partitioners_imbalanced_loops.cpp @@ -0,0 +1,111 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include + +void doWork(double sec); + +template +void buildingWork(int N, const Partitioner& p) { + tbb::parallel_for( tbb::blocked_range(0, N, 1), + [](const tbb::blocked_range& r) { + for (int i = r.begin(); i < r.end(); ++i) { + doWork(i); + } + }, p + ); +} + +template +void buildingWork(int N, Partitioner& p) { + tbb::parallel_for( tbb::blocked_range(0, N, 1), + [](const tbb::blocked_range& r) { + for (int i = r.begin(); i < r.end(); ++i) { + doWork(i); + } + }, p + ); +} + +static void warmupTBB() { + // This is a simple loop that should get workers started. + // oneTBB creates workers lazily on first use of the library + // so this hides the startup time when looking at trivial + // examples that do little real work. + tbb::parallel_for(0, tbb::info::default_concurrency(), + [=](int) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + } + ); +} + +void doWork(double usec) { + double sec = usec*1e-06; + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() <= sec); +} + +int main(int argc, char *argv[]) { + // use the most performance codes + // only a single NUMA node + // and only 1 thread per core + tbb::task_arena::constraints c; + c.set_numa_id(tbb::info::numa_nodes()[0]); + c.set_core_type(tbb::info::core_types().back()); + c.set_max_threads_per_core(1); + c.set_max_concurrency(std::min(8, tbb::info::default_concurrency(c))); + tbb::task_arena a(c); + + std::cout << "Using an arena with " << a.max_concurrency() << " slots\n"; + + a.execute([&]() { + int N = 1000; + int M = 10; + + std::cout << std::endl << "M = " << M + << std::endl << "N = " << N << std::endl; + + warmupTBB(); + tbb::tick_count t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + buildingWork(N, tbb::auto_partitioner{}); + } + double auto_time = (tbb::tick_count::now() - t0).seconds(); + + warmupTBB(); + tbb::affinity_partitioner aff_p; + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + buildingWork(N, aff_p); + } + double affinity_time = (tbb::tick_count::now() - t0).seconds(); + + warmupTBB(); + t0 = tbb::tick_count::now(); + for (int i = 0; i < M; ++i) { + buildingWork(N, tbb::static_partitioner{}); + } + double static_time = (tbb::tick_count::now() - t0).seconds(); + + std::cout << "auto_partitioner = " << auto_time << " seconds" << std::endl + << "affinity_partitioner = " << affinity_time << " seconds" << std::endl + << "static_partitioner = " << static_time << " seconds" << std::endl; + }); + return 0; +} + diff --git a/new_examples/performance_tuning/priorities_and_conflict.cpp b/new_examples/performance_tuning/priorities_and_conflict.cpp index b8ae8117aa..e7a1771d37 100644 --- a/new_examples/performance_tuning/priorities_and_conflict.cpp +++ b/new_examples/performance_tuning/priorities_and_conflict.cpp @@ -14,18 +14,24 @@ limitations under the License. */ -#include + #include #include void printArrival(tbb::task_arena::priority priority); -void waitUntil(int N); + +using counter_t = std::atomic; +counter_t counter = 0; +void waitUntil(int N, counter_t& c) { + ++c; + while (c != N); +} void explicitArenaWithPriority(tbb::task_arena::priority priority) { tbb::task_arena a{tbb::info::default_concurrency(), 1, priority}; a.execute([=]() { tbb::parallel_for(0, - 10*tbb::info::default_concurrency(), + 2*tbb::info::default_concurrency(), [=](int) { printArrival(priority); }); }); } @@ -33,11 +39,11 @@ void explicitArenaWithPriority(tbb::task_arena::priority priority) { void runTwoThreads(tbb::task_arena::priority priority0, tbb::task_arena::priority priority1) { std::thread t0([=]() { - waitUntil(2); + waitUntil(2, counter); explicitArenaWithPriority(priority0); }); std::thread t1([=]() { - waitUntil(2); + waitUntil(2, counter); explicitArenaWithPriority(priority1); }); t0.join(); @@ -46,16 +52,32 @@ void runTwoThreads(tbb::task_arena::priority priority0, #include +int main() { + counter = 0; + std::printf("\n\nrunTwoThreads with low (.) and high (|)\n"); + runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::high); + + counter = 0; + std::printf("\n\nrunTwoThreads with low (.) and normal (:)\n"); + runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::normal); + + counter = 0; + std::printf("\n\nrunTwoThreads with normal (:) and high (|)\n"); + runTwoThreads(tbb::task_arena::priority::normal, tbb::task_arena::priority::high); + std::printf("\n"); + return 0; +} + void printArrival(tbb::task_arena::priority priority) { switch (priority) { case tbb::task_arena::priority::low: - std::printf(" low "); + std::printf("."); break; case tbb::task_arena::priority::normal: - std::printf(" normal "); + std::printf(":"); break; case tbb::task_arena::priority::high: - std::printf(" high "); + std::printf("|"); break; default: break; @@ -65,24 +87,5 @@ void printArrival(tbb::task_arena::priority priority) { while ((tbb::tick_count::now() - t0).seconds() < 0.01); } -std::atomic count_up = 0; -void waitUntil(int N) { - ++count_up; - while (count_up != N); -} - -int main() { - count_up = 0; - std::printf("\n\n\n\nrunTwoThreads(low, high)"); - runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::high); - - count_up = 0; - std::printf("\n\n\n\nrunTwoThreads(low, normal)"); - runTwoThreads(tbb::task_arena::priority::low, tbb::task_arena::priority::normal); - count_up = 0; - std::printf("\n\n\n\nrunTwoThreads(normal, high)"); - runTwoThreads(tbb::task_arena::priority::normal, tbb::task_arena::priority::high); - return 0; -} diff --git a/new_examples/performance_tuning/task_scheduler_observer.cpp b/new_examples/performance_tuning/task_scheduler_observer.cpp new file mode 100644 index 0000000000..b6f319eac5 --- /dev/null +++ b/new_examples/performance_tuning/task_scheduler_observer.cpp @@ -0,0 +1,114 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include +#include +#include + +// these are placeholder for where we would put OS-specific types and calls +using affinity_mask_t = std::string; +void set_thread_affinity( int tid, const affinity_mask_t& mask ) { + std::ostringstream buffer; + buffer << std::this_thread::get_id() + << " -> (" << tid + << ", " << mask << ")\n"; + std::cout << buffer.str(); +} +void restore_thread_affinity() { + std::ostringstream buffer; + buffer << std::this_thread::get_id() + << " -> (restored)\n"; + std::cout << buffer.str(); +} + +// observer class +class PinningObserver : public tbb::task_scheduler_observer { +public: + // HW affinity mask to be used for threads in an arena + affinity_mask_t m_mask; + PinningObserver( oneapi::tbb::task_arena &a, const affinity_mask_t& mask ) + : tbb::task_scheduler_observer(a), m_mask(mask) { + observe(true); // activate the observer + } + void on_scheduler_entry( bool worker ) override { + set_thread_affinity( + tbb::this_task_arena::current_thread_index(), m_mask); + } + void on_scheduler_exit( bool worker ) override { + restore_thread_affinity(); + } +}; + +int N = 1000; +double w = 0.01; +double f(double v); + +using counter_t = std::atomic; +counter_t counter = 0; +void waitUntil(int N, counter_t& c) { + ++c; + while (c != N); +} + +void observeTwoArenas() { + int P = tbb::info::default_concurrency(); + + // two arenas, each with half the hw threads + tbb::task_arena a0(P/2); + tbb::task_arena a1(P/2); + + PinningObserver obs0(a0, "mask_zero"); + PinningObserver obs1(a1, "mask_one"); + + // Execute consecutive loops + std::cout << "Execute a0 loop\n"; + a0.execute([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }); + std::cout << "Execute a1 loop\n"; + a1.execute([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }); + + // Execute concurrent loops + std::cout << "Execute a0 and a1 concurrently\n"; + std::thread t0([&]() { + waitUntil(2, counter); + a0.execute([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }); + }); + std::thread t1([&]() { + waitUntil(2, counter); + a1.execute([] { + tbb::parallel_for(0, N, [](int j) { f(w); }); + }); + }); + t0.join(); + t1.join(); +} + +int main() { + observeTwoArenas(); + return 0; +} + +double f(double v) { + tbb::tick_count t0 = tbb::tick_count::now(); + while ((tbb::tick_count::now() - t0).seconds() < 0.01); + return 2*v; +} diff --git a/new_examples/tasks/CMakeLists.txt b/new_examples/tasks/CMakeLists.txt index 79d468791d..3a1af18ea0 100644 --- a/new_examples/tasks/CMakeLists.txt +++ b/new_examples/tasks/CMakeLists.txt @@ -1,6 +1,3 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -tbb -pthread") -set(CMAKE_CXX_LINKER_FLAGS "-tbb -pthread") - foreach(tpp parallel_invoke_fib.cpp task_group_fib.cpp task_group_poor_scaling.cpp diff --git a/new_examples/tasks/parallel_invoke_fib.cpp b/new_examples/tasks/parallel_invoke_fib.cpp index a5e2f7c584..903729fc0f 100644 --- a/new_examples/tasks/parallel_invoke_fib.cpp +++ b/new_examples/tasks/parallel_invoke_fib.cpp @@ -73,7 +73,7 @@ int main(int argc, char** argv) std::cout << "SerialFib: " << fib_s << " Time: " << t_s << "\n"; std::cout << "ParallelFib: " << fib_p << " Time: " << t_p << " Speedup: " << t_s/t_p << "\n"; - std::cout << "ParallelFibCutoff_30: " << fib_c << " Time: " << t_p << " Speedup: " << t_s/t_c << "\n"; + std::cout << "ParallelFibCutoff_30: " << fib_c << " Time: " << t_c << " Speedup: " << t_s/t_c << "\n"; return 0; } diff --git a/new_examples/tasks/resumable_tasks.cpp b/new_examples/tasks/resumable_tasks.cpp index 9cd3cc8982..c1bedaf921 100755 --- a/new_examples/tasks/resumable_tasks.cpp +++ b/new_examples/tasks/resumable_tasks.cpp @@ -32,7 +32,7 @@ int main() { tbb::task::suspend([=,&sycl_q](tbb::task::suspend_point tag) { auto sycl_event = sycl_q.fill(a, id, N); sycl_q.submit([=](sycl::handler& sycl_h) { - sycl_h.depends_on(sycl_event); // only run after e is done + sycl_h.depends_on(sycl_event); // run after sycl_event is done sycl_h.host_task([tag]() { tbb::task::resume(tag); });