From be057496153b5038edec8b272d3fe41a6302c20d Mon Sep 17 00:00:00 2001 From: Markus Vieth Date: Sun, 30 Jul 2023 16:55:22 +0200 Subject: [PATCH] NormalEstimationOMP: use dynamic scheduling for faster computation So far, no scheduling was specified, which seems to result in a behaviour similar to static scheduling. However, this is suboptimal, as the workload is not balanced well between the threads, especially when using radius search. With dynamic scheduling (default chunk size of 256), the speedup (ratio of computation time of NormalEstimation and NormalEstimationOMP) is better. The speedup for organized datasets is slightly higher than for unorganized datasets, possibly because FLANN (used for unorganized datasets) already uses some parallelization, while OrganizedNeighbor does not. Laptop 1 (6 physical cores, 12 logical cores, number of threads set to 6): dataset | | | #/mm | speedup before | speedup after -----|-------------|----------|------|----------------|-------------- mug | organized | radius | 10 | 3.4857 | 5.2508 mug | organized | radius | 20 | 3.3441 | 5.1059 mug | organized | nearestk | 50 | 4.7033 | 5.0594 mug | organized | nearestk | 100 | 4.5808 | 4.9751 mug | unorganized | radius | 10 | 3.3374 | 4.8992 mug | unorganized | radius | 20 | 3.0206 | 4.7978 mug | unorganized | nearestk | 50 | 4.5841 | 4.9189 mug | unorganized | nearestk | 100 | 4.7062 | 4.8844 milk | organized | radius | 10 | 3.5140 | 5.1686 milk | organized | radius | 20 | 3.2605 | 5.1719 milk | organized | nearestk | 50 | 4.3245 | 4.9924 milk | organized | nearestk | 100 | 4.4170 | 4.9207 milk | unorganized | radius | 10 | 3.4451 | 4.8029 milk | unorganized | radius | 20 | 3.1887 | 4.8810 milk | unorganized | nearestk | 50 | 4.3789 | 4.6894 milk | unorganized | nearestk | 100 | 4.2717 | 4.7473 Laptop 2 (4 physical cores, 8 logical cores, number of threads set to 4): dataset | | | #/mm | speedup before | speedup after 
-----|-------------|----------|------|----------------|-------------- mug | organized | radius | 10 | 2.3783 | 3.9812 mug | organized | radius | 20 | 2.3080 | 3.9753 mug | organized | nearestk | 50 | 3.6190 | 3.9595 mug | organized | nearestk | 100 | 3.6100 | 3.9590 mug | unorganized | radius | 10 | 2.4181 | 3.7466 mug | unorganized | radius | 20 | 2.2157 | 3.8890 mug | unorganized | nearestk | 50 | 3.4894 | 3.6551 mug | unorganized | nearestk | 100 | 3.4293 | 3.7825 milk | organized | radius | 10 | 2.8174 | 3.8209 milk | organized | radius | 20 | 2.6911 | 3.9722 milk | organized | nearestk | 50 | 3.3346 | 3.9433 milk | organized | nearestk | 100 | 3.3275 | 3.9798 milk | unorganized | radius | 10 | 2.8815 | 3.5443 milk | unorganized | radius | 20 | 2.6467 | 3.7990 milk | unorganized | nearestk | 50 | 3.1602 | 3.6469 milk | unorganized | nearestk | 100 | 3.6460 | 3.7981 --- features/include/pcl/features/impl/normal_3d_omp.hpp | 6 ++++-- features/include/pcl/features/normal_3d_omp.h | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/features/include/pcl/features/impl/normal_3d_omp.hpp b/features/include/pcl/features/impl/normal_3d_omp.hpp index a40b2bb2c2a..736b5c8d2fe 100644 --- a/features/include/pcl/features/impl/normal_3d_omp.hpp +++ b/features/include/pcl/features/impl/normal_3d_omp.hpp @@ -77,7 +77,8 @@ pcl::NormalEstimationOMP<PointInT, PointOutT>::computeFeature (PointCloudOut &ou default(none) \ shared(output) \ firstprivate(nn_indices, nn_dists) \ - num_threads(threads_) + num_threads(threads_) \ + schedule(dynamic, chunk_size_) // Iterating over the entire index vector for (std::ptrdiff_t idx = 0; idx < static_cast<std::ptrdiff_t> (indices_->size ()); ++idx) { @@ -106,7 +107,8 @@ pcl::NormalEstimationOMP<PointInT, PointOutT>::computeFeature (PointCloudOut &ou default(none) \ shared(output) \ firstprivate(nn_indices, nn_dists) \ - num_threads(threads_) + num_threads(threads_) \ + schedule(dynamic, chunk_size_) // Iterating over the entire index vector for (std::ptrdiff_t idx = 0; idx < 
static_cast<std::ptrdiff_t> (indices_->size ()); ++idx) { diff --git a/features/include/pcl/features/normal_3d_omp.h b/features/include/pcl/features/normal_3d_omp.h index a8ae45b0a37..ba10bb76d8d 100644 --- a/features/include/pcl/features/normal_3d_omp.h +++ b/features/include/pcl/features/normal_3d_omp.h @@ -72,8 +72,9 @@ namespace pcl public: /** \brief Initialize the scheduler and set the number of threads to use. * \param nr_threads the number of hardware threads to use (0 sets the value back to automatic) + * \param chunk_size PCL will use dynamic scheduling with this chunk size. Setting it too low will lead to more parallelization overhead. Setting it too high will lead to a worse balancing between the threads. */ - NormalEstimationOMP (unsigned int nr_threads = 0) + NormalEstimationOMP (unsigned int nr_threads = 0, int chunk_size = 256): chunk_size_(chunk_size) { feature_name_ = "NormalEstimationOMP"; @@ -90,6 +91,8 @@ namespace pcl /** \brief The number of threads the scheduler should use. */ unsigned int threads_; + /** \brief Chunk size for (dynamic) scheduling. */ + int chunk_size_; private: /** \brief Estimate normals for all points given in <setInputCloud (), setIndices ()> using the surface in * setSearchSurface () and the spatial locator in setSearchMethod ()