From be057496153b5038edec8b272d3fe41a6302c20d Mon Sep 17 00:00:00 2001
From: Markus Vieth <mvieth@techfak.uni-bielefeld.de>
Date: Sun, 30 Jul 2023 16:55:22 +0200
Subject: [PATCH] NormalEstimationOMP: use dynamic scheduling for faster
 computation

So far, no scheduling was specified, which seems to result in behaviour
similar to static scheduling. However, this is suboptimal, as the workload
is not balanced well between the threads, especially when using radius
search. With dynamic scheduling (default chunk size of 256), the speedup
(ratio of the computation time of NormalEstimation to that of
NormalEstimationOMP) is better. The speedup for organized datasets is
slightly higher than for unorganized datasets, possibly because FLANN (used
for unorganized datasets) already uses some parallelization, while
OrganizedNeighbor does not.

Laptop 1 (6 physical cores, 12 logical cores, number of threads set to 6):

dataset | cloud       | search   | #/mm | speedup before | speedup after
-----|-------------|----------|------|----------------|--------------
mug  | organized   | radius   | 10   | 3.4857         | 5.2508
mug  | organized   | radius   | 20   | 3.3441         | 5.1059
mug  | organized   | nearestk | 50   | 4.7033         | 5.0594
mug  | organized   | nearestk | 100  | 4.5808         | 4.9751
mug  | unorganized | radius   | 10   | 3.3374         | 4.8992
mug  | unorganized | radius   | 20   | 3.0206         | 4.7978
mug  | unorganized | nearestk | 50   | 4.5841         | 4.9189
mug  | unorganized | nearestk | 100  | 4.7062         | 4.8844
milk | organized   | radius   | 10   | 3.5140         | 5.1686
milk | organized   | radius   | 20   | 3.2605         | 5.1719
milk | organized   | nearestk | 50   | 4.3245         | 4.9924
milk | organized   | nearestk | 100  | 4.4170         | 4.9207
milk | unorganized | radius   | 10   | 3.4451         | 4.8029
milk | unorganized | radius   | 20   | 3.1887         | 4.8810
milk | unorganized | nearestk | 50   | 4.3789         | 4.6894
milk | unorganized | nearestk | 100  | 4.2717         | 4.7473

Laptop 2 (4 physical cores, 8 logical cores, number of threads set to 4):

dataset | cloud       | search   | #/mm | speedup before | speedup after
-----|-------------|----------|------|----------------|--------------
mug  | organized   | radius   | 10   | 2.3783         | 3.9812
mug  | organized   | radius   | 20   | 2.3080         | 3.9753
mug  | organized   | nearestk | 50   | 3.6190         | 3.9595
mug  | organized   | nearestk | 100  | 3.6100         | 3.9590
mug  | unorganized | radius   | 10   | 2.4181         | 3.7466
mug  | unorganized | radius   | 20   | 2.2157         | 3.8890
mug  | unorganized | nearestk | 50   | 3.4894         | 3.6551
mug  | unorganized | nearestk | 100  | 3.4293         | 3.7825
milk | organized   | radius   | 10   | 2.8174         | 3.8209
milk | organized   | radius   | 20   | 2.6911         | 3.9722
milk | organized   | nearestk | 50   | 3.3346         | 3.9433
milk | organized   | nearestk | 100  | 3.3275         | 3.9798
milk | unorganized | radius   | 10   | 2.8815         | 3.5443
milk | unorganized | radius   | 20   | 2.6467         | 3.7990
milk | unorganized | nearestk | 50   | 3.1602         | 3.6469
milk | unorganized | nearestk | 100  | 3.6460         | 3.7981
---
 features/include/pcl/features/impl/normal_3d_omp.hpp | 6 ++++--
 features/include/pcl/features/normal_3d_omp.h        | 5 ++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/features/include/pcl/features/impl/normal_3d_omp.hpp b/features/include/pcl/features/impl/normal_3d_omp.hpp
index a40b2bb2c2a..736b5c8d2fe 100644
--- a/features/include/pcl/features/impl/normal_3d_omp.hpp
+++ b/features/include/pcl/features/impl/normal_3d_omp.hpp
@@ -77,7 +77,8 @@ pcl::NormalEstimationOMP<PointInT, PointOutT>::computeFeature (PointCloudOut &ou
   default(none) \
   shared(output) \
   firstprivate(nn_indices, nn_dists) \
-  num_threads(threads_)
+  num_threads(threads_) \
+  schedule(dynamic, chunk_size_)
     // Iterating over the entire index vector
     for (std::ptrdiff_t idx = 0; idx < static_cast<std::ptrdiff_t> (indices_->size ()); ++idx)
     {
@@ -106,7 +107,8 @@ pcl::NormalEstimationOMP<PointInT, PointOutT>::computeFeature (PointCloudOut &ou
   default(none) \
   shared(output) \
   firstprivate(nn_indices, nn_dists) \
-  num_threads(threads_)
+  num_threads(threads_) \
+  schedule(dynamic, chunk_size_)
     // Iterating over the entire index vector
     for (std::ptrdiff_t idx = 0; idx < static_cast<std::ptrdiff_t> (indices_->size ()); ++idx)
     {
diff --git a/features/include/pcl/features/normal_3d_omp.h b/features/include/pcl/features/normal_3d_omp.h
index a8ae45b0a37..ba10bb76d8d 100644
--- a/features/include/pcl/features/normal_3d_omp.h
+++ b/features/include/pcl/features/normal_3d_omp.h
@@ -72,8 +72,9 @@ namespace pcl
     public:
       /** \brief Initialize the scheduler and set the number of threads to use.
         * \param nr_threads the number of hardware threads to use (0 sets the value back to automatic)
+        * \param chunk_size PCL will use dynamic scheduling with this chunk size. Setting it too low leads to more parallelization overhead; setting it too high leads to worse load balancing between the threads.
         */
-      NormalEstimationOMP (unsigned int nr_threads = 0)
+      NormalEstimationOMP (unsigned int nr_threads = 0, int chunk_size = 256): chunk_size_(chunk_size)
       {
         feature_name_ = "NormalEstimationOMP";
 
@@ -90,6 +91,8 @@ namespace pcl
       /** \brief The number of threads the scheduler should use. */
       unsigned int threads_;
 
+      /** \brief Chunk size for (dynamic) scheduling. */
+      int chunk_size_;
     private:
       /** \brief Estimate normals for all points given in <setInputCloud (), setIndices ()> using the surface in
         * setSearchSurface () and the spatial locator in setSearchMethod ()