Skip to content

Commit

Permalink
add tunable nthreads parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
albestro committed Apr 18, 2023
1 parent 3ca979e commit 26d72aa
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 14 deletions.
32 changes: 32 additions & 0 deletions include/dlaf/eigensolver/get_tfactor_nworkers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2023, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//
#pragma once

#include <algorithm>
#include <cmath>

#include <pika/runtime.hpp>

#include "dlaf/common/assert.h"
#include "dlaf/tune.h"

namespace dlaf::factorization::internal {

/// Returns the number of worker threads to use for computing the T factor.
///
/// Reads the tunable parameter `tfactor_nworkers` (see dlaf/tune.h) and clamps
/// it to what the runtime can actually provide.
///
/// @return a value in the range [1, max_workers], where
///         max_workers = os_thread_count("default" pool) - 1.
inline std::size_t getTFactorNWorkers() noexcept {
  const std::size_t nworkers = getTuneParameters().tfactor_nworkers;

  // Note: precautionarily we leave at least 1 thread "free" to do other stuff
  const std::size_t max_workers = pika::resource::get_thread_pool("default").get_os_thread_count() - 1;

  // Clamp the tuned value into [1, max_workers]. The outer max guarantees at
  // least one worker even when the pool has a single OS thread (in which case
  // max_workers is 0).
  return std::max<std::size_t>(1, std::min<std::size_t>(max_workers, nworkers));
}

}
28 changes: 16 additions & 12 deletions include/dlaf/factorization/qr/t_factor_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#pragma once

#include <cstddef>
#include <memory>

#include <blas.hh>
Expand All @@ -34,6 +35,7 @@
#include "dlaf/common/vector.h"
#include "dlaf/communication/kernels/all_reduce.h"
#include "dlaf/communication/sync/all_reduce.h"
#include "dlaf/eigensolver/get_tfactor_nworkers.h"
#include "dlaf/lapack/tile.h"
#include "dlaf/matrix/matrix.h"
#include "dlaf/matrix/views.h"
Expand Down Expand Up @@ -281,7 +283,7 @@ void QR_Tfactor<Backend::MC, Device::CPU, T>::call(matrix::Panel<Coord::Col, T,
const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows();
const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row();

const size_t nthreads = std::max<size_t>(1, (pika::get_num_worker_threads() / 2));
const std::size_t nthreads = getTFactorNWorkers();
ex::start_detached(
ex::when_all(ex::just(std::make_shared<pika::barrier<>>(nthreads)),
ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) |
Expand All @@ -294,7 +296,8 @@ void QR_Tfactor<Backend::MC, Device::CPU, T>::call(matrix::Panel<Coord::Col, T,
ex::transfer(
dlaf::internal::getBackendScheduler<B>(pika::execution::thread_priority::high)) |
ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus,
&panel](const size_t index, std::vector<matrix::Tile<T, D>>& t_all) {
&panel](const std::size_t index,
std::vector<matrix::Tile<T, D>>& t_all) {
using Helpers = tfactor_l::Helpers<B, D, T>;

tile::internal::set0<T>(index == 0 ? t : t_all[index - 1]);
Expand All @@ -303,11 +306,11 @@ void QR_Tfactor<Backend::MC, Device::CPU, T>::call(matrix::Panel<Coord::Col, T,
// compute the column partial result `t` (multi-threaded)
// First we compute the matrix vector multiplication for each column
// -tau(j) . V(j:, 0:j)* . V(j:, j)
const size_t chunk_size = util::ceilDiv(panel.size(), nthreads);
const size_t begin = index * chunk_size;
const size_t end = std::min(index * chunk_size + chunk_size, panel.size());
const std::size_t chunk_size = util::ceilDiv(panel.size(), nthreads);
const std::size_t begin = index * chunk_size;
const std::size_t end = std::min(index * chunk_size + chunk_size, panel.size());

for (size_t i = begin; i < end; ++i) {
for (std::size_t i = begin; i < end; ++i) {
const matrix::Tile<const T, D>& tile_v = panel[i].get();

const SizeType first_row_tile =
Expand Down Expand Up @@ -432,7 +435,7 @@ void QR_Tfactor<Backend::MC, Device::CPU, T>::call(
const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows();
const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row();

const size_t nthreads = std::max<size_t>(1, (pika::get_num_worker_threads() / 2));
const std::size_t nthreads = getTFactorNWorkers();
ex::start_detached(
ex::when_all(ex::just(std::make_shared<pika::barrier<>>(nthreads)),
ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t),
Expand All @@ -446,7 +449,8 @@ void QR_Tfactor<Backend::MC, Device::CPU, T>::call(
ex::transfer(
dlaf::internal::getBackendScheduler<B>(pika::execution::thread_priority::high)) |
ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus, &panel,
&pcomm](const size_t index, std::vector<matrix::Tile<T, D>>& t_all) {
&pcomm](const std::size_t index,
std::vector<matrix::Tile<T, D>>& t_all) {
using Helpers = tfactor_l::Helpers<B, D, T>;

tile::internal::set0<T>(index == 0 ? t : t_all[index - 1]);
Expand All @@ -455,11 +459,11 @@ void QR_Tfactor<Backend::MC, Device::CPU, T>::call(
// compute the column partial result `t` (multi-threaded)
// First we compute the matrix vector multiplication for each column
// -tau(j) . V(j:, 0:j)* . V(j:, j)
const size_t chunk_size = util::ceilDiv(panel.size(), nthreads);
const size_t begin = index * chunk_size;
const size_t end = std::min(index * chunk_size + chunk_size, panel.size());
const std::size_t chunk_size = util::ceilDiv(panel.size(), nthreads);
const std::size_t begin = index * chunk_size;
const std::size_t end = std::min(index * chunk_size + chunk_size, panel.size());

for (size_t i = begin; i < end; ++i) {
for (std::size_t i = begin; i < end; ++i) {
const matrix::Tile<const T, D>& tile_v = panel[i].get();

const SizeType first_row_tile =
Expand Down
10 changes: 8 additions & 2 deletions include/dlaf/tune.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
//
#pragma once

#include <cstddef>

#include <pika/runtime.hpp>
#include <dlaf/types.h>

Expand All @@ -17,6 +19,7 @@ namespace dlaf {
///
/// Holds the value of the parameters that can be used to tune DLA-Future.
/// - red2band_panel_nworkers: number of threads to use for computing the panel in the reduction to band algorithm.
/// - tfactor_nworkers: number of threads to use for computing the T factor
/// - eigensolver_min_band: The minimum value to start looking for a divisor of the block size.
/// Set with --dlaf:eigensolver-min-band or env variable DLAF_EIGENSOLVER_MIN_BAND.
/// - band_to_tridiag_1d_block_size_base:
Expand All @@ -29,8 +32,11 @@ namespace dlaf {
/// DLAF_BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE.
/// Note to developers: Users can change these values, therefore consistency has to be ensured by algorithms.
struct TuneParameters {
size_t red2band_panel_nworkers =
std::max<size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);
std::size_t red2band_panel_nworkers =
std::max<std::size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);

std::size_t tfactor_nworkers =
std::max<std::size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);

SizeType eigensolver_min_band = 100;
SizeType band_to_tridiag_1d_block_size_base = 8192;
Expand Down

0 comments on commit 26d72aa

Please sign in to comment.