diff --git a/doc/main/tbb_userguide/Advanced_Topic_Other_Kinds_of_Iteration_Spaces.rst b/doc/main/tbb_userguide/Advanced_Topic_Other_Kinds_of_Iteration_Spaces.rst index 3352dd8d32..99446ab659 100644 --- a/doc/main/tbb_userguide/Advanced_Topic_Other_Kinds_of_Iteration_Spaces.rst +++ b/doc/main/tbb_userguide/Advanced_Topic_Other_Kinds_of_Iteration_Spaces.rst @@ -72,4 +72,40 @@ along its longest axis. When used with ``parallel_for``, it causes the loop to be "recursively blocked" in a way that improves cache usage. This nice cache behavior means that using ``parallel_for`` over a ``blocked_range2d`` can make a loop run faster than the sequential -equivalent, even on a single processor. +equivalent, even on a single processor. + +The ``blocked_range2d`` allows you to use different value types for +its first dimension, *rows*, and the second one, *columns*. +That means you can combine indexes, pointers, and iterators into a joint +iteration space. Use the methods ``rows()`` and ``cols()`` to obtain +``blocked_range`` objects that represent the respective dimensions. + +The ``blocked_range3d`` class template extends this approach to 3D by adding +``pages()`` as the first dimension, followed by ``rows()`` and ``cols()``. + +The ``blocked_nd_range`` class template represents a blocked iteration +space of any dimensionality. Unlike the previously described 2D and 3D ranges, +``blocked_nd_range`` uses the same value type for all its axes, and its +constructor requires you to pass N instances of ``blocked_range`` instead of +individual boundary values. The change in the naming pattern reflects these +differences. + + +Example of a Multidimensional Iteration Space +------------------------------------------------ + +The example demonstrates the calculation of a 3-dimensional filter over a pack +of feature maps. 
+ +The ``convolution3d`` function iterates over the output cells, assigning to +each cell the result of the ``kernel3d`` function that combines the values +from a range in the feature maps. + +To run the computation in parallel, ``tbb::parallel_for`` is called with +``tbb::blocked_nd_range`` as an argument. The body function processes +the received 3D subrange in nested loops, using the method ``dim`` to get +the loop boundaries for each dimension. + + +.. literalinclude:: ./snippets/blocked_nd_range_example.h + :language: c++ diff --git a/doc/main/tbb_userguide/parallel_for_os.rst b/doc/main/tbb_userguide/parallel_for_os.rst index fed07af68b..cbc7578f4c 100644 --- a/doc/main/tbb_userguide/parallel_for_os.rst +++ b/doc/main/tbb_userguide/parallel_for_os.rst @@ -55,8 +55,9 @@ before each identifier. The rest of the examples assume that such a Note the argument to ``operator()``. A ``blocked_range<T>`` is a template class provided by the library. It describes a one-dimensional iteration space over type ``T``. Class ``parallel_for`` works with other -kinds of iteration spaces too. The library provides ``blocked_range2d`` -for two-dimensional spaces. You can define your own spaces as explained +kinds of iteration spaces too. The library provides ``blocked_range2d``, +``blocked_range3d``, and ``blocked_nd_range`` for multidimensional spaces. +You can define your own spaces as explained in :ref:`Advanced_Topic_Other_Kinds_of_Iteration_Spaces`. 
diff --git a/doc/main/tbb_userguide/snippets/blocked_nd_range_example.cpp b/doc/main/tbb_userguide/snippets/blocked_nd_range_example.cpp
new file mode 100644
index 0000000000..7417123999
--- /dev/null
+++ b/doc/main/tbb_userguide/snippets/blocked_nd_range_example.cpp
@@ -0,0 +1,46 @@
+#include "blocked_nd_range_example.h"
+#include <vector>
+#include <cassert>
+
+// Runs convolution3d on feature maps filled with ones and checks every output cell.
+int main() {
+    // The kernel extents are pairwise distinct so that confusing two dimensions
+    // inside the convolution changes the sums and trips the check below.
+    const int kernel_length = 9;
+    const int kernel_width = 5;
+    const int kernel_height = 3;
+
+    const int feature_maps_length = 128;
+    const int feature_maps_width = 16;
+    const int feature_maps_height = 16;
+
+    // Number of kernel positions that fit entirely inside the feature maps, per dimension.
+    const int out_length = feature_maps_length - kernel_length + 1;
+    const int out_width = feature_maps_width - kernel_width + 1;
+    const int out_height = feature_maps_height - kernel_height + 1;
+
+    using row_t = std::vector<float>;
+    using plane_t = std::vector<row_t>;
+    using volume_t = std::vector<plane_t>;
+
+    // Initializes feature maps with 1 in each cell and out with zeros.
+    volume_t feature_maps(feature_maps_length, plane_t(feature_maps_width, row_t(feature_maps_height, 1.0f)));
+    volume_t out(out_length, plane_t(out_width, row_t(out_height, 0.f)));
+
+    // 3D convolution calculates the sum of all elements in the kernel
+    convolution3d(feature_maps, out,
+                  out_length, out_width, out_height,
+                  kernel_length, kernel_width, kernel_height);
+
+    // Checks correctness of convolution by equality to the expected sum of elements.
+    // Exact comparison is safe here: a sum of ones is a small integer that float holds exactly.
+    const float expected = static_cast<float>(kernel_length * kernel_width * kernel_height);
+    for (const auto& plane : out) {
+        for (const auto& row : plane) {
+            for (float value : row) {
+                assert(value == expected && "convolution failed to calculate correctly");
+            }
+        }
+    }
+    return 0;
+}
diff --git a/doc/main/tbb_userguide/snippets/blocked_nd_range_example.h b/doc/main/tbb_userguide/snippets/blocked_nd_range_example.h
new file mode 100644
index 0000000000..ded2a09c57
--- /dev/null
+++ b/doc/main/tbb_userguide/snippets/blocked_nd_range_example.h
@@ -0,0 +1,46 @@
+#include "oneapi/tbb/blocked_nd_range.h"
+#include "oneapi/tbb/parallel_for.h"
+
+// Sums the feature map values inside the kernel window whose first cell is (i, j, k).
+// The window spans kernel_length cells along the first index, kernel_width along
+// the second, and kernel_height along the third.
+template<typename Features>
+float kernel3d(const Features& feature_maps, int i, int j, int k,
+               int kernel_length, int kernel_width, int kernel_height) {
+    float result = 0.f;
+
+    for (int feature_i = i; feature_i < i + kernel_length; ++feature_i)
+        for (int feature_j = j; feature_j < j + kernel_width; ++feature_j)
+            // feature_k walks the third index, whose extent is kernel_height.
+            for (int feature_k = k; feature_k < k + kernel_height; ++feature_k)
+                result += feature_maps[feature_i][feature_j][feature_k];
+
+    return result;
+}
+
+// Fills every cell of out with kernel3d evaluated at that cell's indices.
+// The output index space [0, out_length) x [0, out_width) x [0, out_height) is
+// passed to parallel_for as a blocked_nd_range; each invocation of the body
+// processes the 3D subrange it receives.
+template<typename Features, typename Output>
+void convolution3d(const Features& feature_maps, Output& out,
+                   int out_length, int out_width, int out_height,
+                   int kernel_length, int kernel_width, int kernel_height) {
+    using range_t = oneapi::tbb::blocked_nd_range<int, 3>;
+
+    oneapi::tbb::parallel_for(
+        range_t({0, out_length}, {0, out_width}, {0, out_height}),
+        [&](const range_t& out_range) {
+            // dim(n) supplies the loop boundaries of the subrange along dimension n.
+            const auto& out_x = out_range.dim(0);
+            const auto& out_y = out_range.dim(1);
+            const auto& out_z = out_range.dim(2);
+
+            for (int i = out_x.begin(); i < out_x.end(); ++i)
+                for (int j = out_y.begin(); j < out_y.end(); ++j)
+                    for (int k = out_z.begin(); k < out_z.end(); ++k)
+                        out[i][j][k] = kernel3d(feature_maps, i, j, k,
+                                                kernel_length, kernel_width, kernel_height);
+        }
+    );
+}