Skip to content

Commit

Permalink
add tunable nthreads parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
albestro committed Apr 18, 2023
1 parent 3ca979e commit 26d72aa
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 14 deletions.
32 changes: 32 additions & 0 deletions include/dlaf/eigensolver/get_tfactor_nworkers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2023, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//
#pragma once

#include <algorithm>
#include <cmath>

#include <pika/runtime.hpp>

#include "dlaf/common/assert.h"
#include "dlaf/tune.h"

namespace dlaf::factorization::internal {

/// Returns the number of worker threads to use for computing the T factor.
///
/// Reads the tunable parameter `tfactor_nworkers` (see dlaf/tune.h) and clamps
/// it to what the runtime can actually provide.
///
/// @return a value in the range [1, max_workers], where
///         max_workers = os_thread_count("default" pool) - 1.
inline std::size_t getTFactorNWorkers() noexcept {
  const std::size_t nworkers = getTuneParameters().tfactor_nworkers;

  // Note: precautionarily we leave at least 1 thread "free" to do other stuff
  const std::size_t max_workers = pika::resource::get_thread_pool("default").get_os_thread_count() - 1;

  // Clamp the tuned value into [1, max_workers]. The outer max guarantees at
  // least one worker even when the pool has a single OS thread (in which case
  // max_workers is 0).
  return std::max<std::size_t>(1, std::min<std::size_t>(max_workers, nworkers));
}

}
28 changes: 16 additions & 12 deletions include/dlaf/factorization/qr/t_factor_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#pragma once

#include <cstddef>
#include <memory>

#include <blas.hh>
Expand All @@ -34,6 +35,7 @@
#include "dlaf/common/vector.h"
#include "dlaf/communication/kernels/all_reduce.h"
#include "dlaf/communication/sync/all_reduce.h"
#include "dlaf/eigensolver/get_tfactor_nworkers.h"
#include "dlaf/lapack/tile.h"
#include "dlaf/matrix/matrix.h"
#include "dlaf/matrix/views.h"
Expand Down Expand Up @@ -281,7 +283,7 @@ void QR_Tfactor<Backend::MC, Device::CPU, T>::call(matrix::Panel<Coord::Col, T,
const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows();
const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row();

const size_t nthreads = std::max<size_t>(1, (pika::get_num_worker_threads() / 2));
const std::size_t nthreads = getTFactorNWorkers();
ex::start_detached(
ex::when_all(ex::just(std::make_shared<pika::barrier<>>(nthreads)),
ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) |
Expand All @@ -294,7 +296,8 @@ void QR_Tfactor<Backend::MC, Device::CPU, T>::call(matrix::Panel<Coord::Col, T,
ex::transfer(
dlaf::internal::getBackendScheduler<B>(pika::execution::thread_priority::high)) |
ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus,
&panel](const size_t index, std::vector<matrix::Tile<T, D>>& t_all) {
&panel](const std::size_t index,
std::vector<matrix::Tile<T, D>>& t_all) {
using Helpers = tfactor_l::Helpers<B, D, T>;

tile::internal::set0<T>(index == 0 ? t : t_all[index - 1]);
Expand All @@ -303,11 +306,11 @@ void QR_Tfactor<Backend::MC, Device::CPU, T>::call(matrix::Panel<Coord::Col, T,
// compute the column partial result `t` (multi-threaded)
// First we compute the matrix vector multiplication for each column
// -tau(j) . V(j:, 0:j)* . V(j:, j)
const size_t chunk_size = util::ceilDiv(panel.size(), nthreads);
const size_t begin = index * chunk_size;
const size_t end = std::min(index * chunk_size + chunk_size, panel.size());
const std::size_t chunk_size = util::ceilDiv(panel.size(), nthreads);
const std::size_t begin = index * chunk_size;
const std::size_t end = std::min(index * chunk_size + chunk_size, panel.size());

for (size_t i = begin; i < end; ++i) {
for (std::size_t i = begin; i < end; ++i) {
const matrix::Tile<const T, D>& tile_v = panel[i].get();

const SizeType first_row_tile =
Expand Down Expand Up @@ -432,7 +435,7 @@ void QR_Tfactor<Backend::MC, Device::CPU, T>::call(
const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows();
const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row();

const size_t nthreads = std::max<size_t>(1, (pika::get_num_worker_threads() / 2));
const std::size_t nthreads = getTFactorNWorkers();
ex::start_detached(
ex::when_all(ex::just(std::make_shared<pika::barrier<>>(nthreads)),
ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t),
Expand All @@ -446,7 +449,8 @@ void QR_Tfactor<Backend::MC, Device::CPU, T>::call(
ex::transfer(
dlaf::internal::getBackendScheduler<B>(pika::execution::thread_priority::high)) |
ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus, &panel,
&pcomm](const size_t index, std::vector<matrix::Tile<T, D>>& t_all) {
&pcomm](const std::size_t index,
std::vector<matrix::Tile<T, D>>& t_all) {
using Helpers = tfactor_l::Helpers<B, D, T>;

tile::internal::set0<T>(index == 0 ? t : t_all[index - 1]);
Expand All @@ -455,11 +459,11 @@ void QR_Tfactor<Backend::MC, Device::CPU, T>::call(
// compute the column partial result `t` (multi-threaded)
// First we compute the matrix vector multiplication for each column
// -tau(j) . V(j:, 0:j)* . V(j:, j)
const size_t chunk_size = util::ceilDiv(panel.size(), nthreads);
const size_t begin = index * chunk_size;
const size_t end = std::min(index * chunk_size + chunk_size, panel.size());
const std::size_t chunk_size = util::ceilDiv(panel.size(), nthreads);
const std::size_t begin = index * chunk_size;
const std::size_t end = std::min(index * chunk_size + chunk_size, panel.size());

for (size_t i = begin; i < end; ++i) {
for (std::size_t i = begin; i < end; ++i) {
const matrix::Tile<const T, D>& tile_v = panel[i].get();

const SizeType first_row_tile =
Expand Down
10 changes: 8 additions & 2 deletions include/dlaf/tune.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
//
#pragma once

#include <cstddef>

#include <pika/runtime.hpp>
#include <dlaf/types.h>

Expand All @@ -17,6 +19,7 @@ namespace dlaf {
///
/// Holds the value of the parameters that can be used to tune DLA-Future.
/// - red2band_panel_nworkers: number of threads to use for computing the panel in the reduction to band algorithm.
/// - tfactor_nworkers: number of threads to use for computing the T factor
/// - eigensolver_min_band: The minimum value to start looking for a divisor of the block size.
/// Set with --dlaf:eigensolver-min-band or env variable DLAF_EIGENSOLVER_MIN_BAND.
/// - band_to_tridiag_1d_block_size_base:
Expand All @@ -29,8 +32,11 @@ namespace dlaf {
/// DLAF_BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE.
/// Note to developers: Users can change these values, therefore consistency has to be ensured by algorithms.
struct TuneParameters {
size_t red2band_panel_nworkers =
std::max<size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);
std::size_t red2band_panel_nworkers =
std::max<std::size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);

std::size_t tfactor_nworkers =
std::max<std::size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);

SizeType eigensolver_min_band = 100;
SizeType band_to_tridiag_1d_block_size_base = 8192;
Expand Down

0 comments on commit 26d72aa

Please sign in to comment.