From 535d6c8d9136eb7a0a9f0a644cc25ff46de4d1c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Raffaele=20Solc=C3=A0?=
Date: Wed, 19 Apr 2023 13:21:13 +0200
Subject: [PATCH] computeTFactor parallelise computation with bulk (MC Local
 and Distributed) (#798)

---
 .../dlaf/eigensolver/get_tfactor_nworkers.h   |  32 ++
 include/dlaf/factorization/qr.h               |  55 ++-
 include/dlaf/factorization/qr/api.h           |  74 +---
 include/dlaf/factorization/qr/t_factor_impl.h | 347 ++++++++++++++----
 include/dlaf/tune.h                           |  10 +-
 5 files changed, 396 insertions(+), 122 deletions(-)
 create mode 100644 include/dlaf/eigensolver/get_tfactor_nworkers.h

diff --git a/include/dlaf/eigensolver/get_tfactor_nworkers.h b/include/dlaf/eigensolver/get_tfactor_nworkers.h
new file mode 100644
index 0000000000..ca61e9d0d2
--- /dev/null
+++ b/include/dlaf/eigensolver/get_tfactor_nworkers.h
@@ -0,0 +1,32 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2023, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+
+#include <pika/runtime.hpp>
+
+#include "dlaf/common/assert.h"
+#include "dlaf/tune.h"
+
+namespace dlaf::factorization::internal {
+
+inline size_t getTFactorNWorkers() noexcept {
+  const size_t nworkers = getTuneParameters().tfactor_nworkers;
+
+  // Note: as a precaution we leave at least 1 thread "free" to do other stuff
+  const size_t max_workers = pika::resource::get_thread_pool("default").get_os_thread_count() - 1;
+
+  // 1 <= number of workers <= max_workers
+  return std::max<std::size_t>(1, std::min(max_workers, nworkers));
+}
+
+}
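getTFactorNWorkers() clamps the tuned value into the range [1, os_threads - 1]. The clamping can be exercised in isolation; the sketch below models only that logic (the function name and the values are illustrative, not part of DLA-Future):

    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    // Model of the clamping in getTFactorNWorkers(): keep the requested worker
    // count between 1 and os_threads - 1 (one thread is left free on purpose).
    std::size_t clamp_nworkers(std::size_t requested, std::size_t os_threads) {
      const std::size_t max_workers = os_threads - 1;
      return std::max<std::size_t>(1, std::min(max_workers, requested));
    }

    int main() {
      std::cout << clamp_nworkers(32, 16) << '\n';  // 15: capped at os_threads - 1
      std::cout << clamp_nworkers(0, 16) << '\n';   // 1: at least one worker
    }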
diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h
index 189987cc6f..f351217732 100644
--- a/include/dlaf/factorization/qr.h
+++ b/include/dlaf/factorization/qr.h
@@ -27,13 +27,27 @@ namespace dlaf::factorization::internal {
 /// H0 H1 H2 ... HK-1
 /// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1,
 ///       it has to be set correctly in the panel (0s as well).
-
+///
+/// It is similar to what xLARFT in LAPACK does.
+/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values
+/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that
+///
+/// H = I - V . T . V*
+///
+/// where H = H1 . H2 . ... . Hk
+///
+/// in which Hi represents a single elementary reflector transformation.
+///
 /// A Storage-Efficient WY Representation for Products of Householder Transformations.
 /// Schreiber, Robert & VanLoan, Charles. (1989)
 /// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005.
 ///
-/// @pre taus contains a vector with k elements
-/// @pre t contains a (k x k) tile
+/// @param hh_panel where the elementary reflectors are stored
+/// @param taus array of taus, associated with the related elementary reflector
+/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
+/// TileElementSize(k, k)
+///
+/// @pre hh_panel.getWidth() <= t.get().size().rows() && hh_panel.getWidth() <= t.get().size().cols()
 template <Backend backend, Device device, class T>
 void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
                     pika::shared_future<common::internal::vector<T>> taus,
@@ -41,6 +55,41 @@ void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
   QR_Tfactor<backend, device, T>::call(hh_panel, taus, std::move(t));
 }
 
+/// Forms the triangular factor T of a block of reflectors H, which is defined as a product of
+/// k := hh_panel.getWidth() elementary reflectors.
+///
+/// hh_panel should have the following form
+/// H0  0  0 ...    0
+///  . H1  0 ...    0
+///  .  . H2 ...    0
+///  .  .  . ...    0
+///  .  .  . ... HK-1
+///  .  .  . ...    .
+/// H0 H1 H2 ... HK-1
+/// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1,
+///       it has to be set correctly in the panel (0s as well).
+///
+/// It is similar to what xLARFT in LAPACK does.
+/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values
+/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that
+///
+/// H = I - V . T . V*
+///
+/// where H = H1 . H2 . ... . Hk
+///
+/// in which Hi represents a single elementary reflector transformation.
+///
+/// A Storage-Efficient WY Representation for Products of Householder Transformations.
+/// Schreiber, Robert & VanLoan, Charles. (1989)
+/// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005.
+///
+/// @param hh_panel where the elementary reflectors are stored
+/// @param taus array of taus, associated with the related elementary reflector
+/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
+/// TileElementSize(k, k)
+/// @param mpi_col_task_chain where internal communications are issued
+///
+/// @pre hh_panel.getWidth() <= t.get().size().rows() && hh_panel.getWidth() <= t.get().size().cols()
 template <Backend backend, Device device, class T>
 void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
                     pika::shared_future<common::internal::vector<T>> taus,
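For k = 2 the construction can be checked by hand, using the same notation as the comments above. Step 1 gives t12 = -tau2 . (v1* . v2); step 2 multiplies it by the already formed T(0:1, 0:1) = tau1, so

    T = | tau1   -tau1 . tau2 . (v1* . v2) |
        |    0                        tau2 |

and expanding (I - tau1 . v1 . v1*) . (I - tau2 . v2 . v2*) term by term reproduces I - V . T . V* with V = [v1 v2], which is exactly the compact-WY identity cited from Schreiber & Van Loan.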
diff --git a/include/dlaf/factorization/qr/api.h b/include/dlaf/factorization/qr/api.h
index e803219597..79add6fbce 100644
--- a/include/dlaf/factorization/qr/api.h
+++ b/include/dlaf/factorization/qr/api.h
@@ -24,65 +24,31 @@ template <Backend backend, Device device, class T>
 struct QR {};
 
 template <Backend backend, Device device, class T>
-struct QR_Tfactor {
-  /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k
-  /// elementary reflectors.
-  ///
-  /// It is similar to what xLARFT in LAPACK does.
-  /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start,
-  /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H
-  /// block of reflector, such that
-  ///
-  /// H = I - V . T . V*
-  ///
-  /// where H = H1 . H2 . ... . Hk
-  ///
-  /// in which Hi represents a single elementary reflector transformation
-  ///
-  /// @param k the number of elementary reflectors to use (from the beginning of the tile)
-  /// @param v where the elementary reflectors are stored
-  /// @param v_start tile in @p v where the column of reflectors starts
-  /// @param taus array of taus, associated with the related elementary reflector
-  /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
-  /// TileElementSize(k, k)
-  ///
-  /// @pre k <= t.get().size().rows && k <= t.get().size().cols()
-  /// @pre k >= 0
-  /// @pre v_start.isIn(v.nrTiles())
-  static void call(matrix::Panel<Coord::Col, T, device>& panel_view,
+struct QR_Tfactor;
+
+template <class T>
+struct QR_Tfactor<Backend::MC, Device::CPU, T> {
+  static void call(matrix::Panel<Coord::Col, T, Device::CPU>& panel_view,
+                   pika::shared_future<common::internal::vector<T>> taus,
+                   pika::future<matrix::Tile<T, Device::CPU>> t);
+  static void call(matrix::Panel<Coord::Col, T, Device::CPU>& hh_panel,
                    pika::shared_future<common::internal::vector<T>> taus,
-                   pika::future<matrix::Tile<T, device>> t);
+                   pika::future<matrix::Tile<T, Device::CPU>> t,
+                   common::Pipeline<comm::Communicator>& mpi_col_task_chain);
+};
 
-  /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k
-  /// elementary reflectors.
-  ///
-  /// It is similar to what xLARFT in LAPACK does.
-  /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start,
-  /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H
-  /// block of reflector, such that
-  ///
-  /// H = I - V . T . V*
-  ///
-  /// where H = H1 . H2 . ... . Hk
-  ///
-  /// in which Hi represents a single elementary reflector transformation
-  ///
-  /// @param k the number of elementary reflectors to use (from the beginning of the tile)
-  /// @param v where the elementary reflectors are stored
-  /// @param v_start tile in @p v where the column of reflectors starts
-  /// @param taus array of taus, associated with the related elementary reflector
-  /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
-  /// TileElementSize(k, k)
-  /// @param mpi_col_task_chain where internal communications are issued
-  ///
-  /// @pre k <= t.get().size().rows && k <= t.get().size().cols()
-  /// @pre k >= 0
-  /// @pre v_start.isIn(v.nrTiles())
-  static void call(matrix::Panel<Coord::Col, T, device>& hh_panel,
+#ifdef DLAF_WITH_GPU
+template <class T>
+struct QR_Tfactor<Backend::GPU, Device::GPU, T> {
+  static void call(matrix::Panel<Coord::Col, T, Device::GPU>& panel_view,
+                   pika::shared_future<common::internal::vector<T>> taus,
+                   pika::future<matrix::Tile<T, Device::GPU>> t);
+  static void call(matrix::Panel<Coord::Col, T, Device::GPU>& hh_panel,
                    pika::shared_future<common::internal::vector<T>> taus,
-                   pika::future<matrix::Tile<T, device>> t,
+                   pika::future<matrix::Tile<T, Device::GPU>> t,
                    common::Pipeline<comm::Communicator>& mpi_col_task_chain);
 };
+#endif
 
 /// ---- ETI
 #define DLAF_FACTORIZATION_QR_TFACTOR_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \
diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h
index eb1e9bdfcf..1dbe267a49 100644
--- a/include/dlaf/factorization/qr/t_factor_impl.h
+++ b/include/dlaf/factorization/qr/t_factor_impl.h
@@ -10,9 +10,16 @@
 
 #pragma once
 
-#include <pika/future.hpp>
+#include <cstddef>
+#include <memory>
 #include <vector>
+#include <pika/barrier.hpp>
+#include <pika/execution.hpp>
+
+#include "dlaf/blas/tile_extensions.h"
+#include "dlaf/matrix/tile.h"
+#include "dlaf/util_matrix.h"
 
 #ifdef DLAF_WITH_GPU
 #include <whip.hpp>
@@ -27,6 +34,8 @@
 #include "dlaf/common/range2d.h"
 #include "dlaf/common/vector.h"
 #include "dlaf/communication/kernels/all_reduce.h"
+#include "dlaf/communication/sync/all_reduce.h"
+#include "dlaf/eigensolver/get_tfactor_nworkers.h"
 #include "dlaf/lapack/tile.h"
 #include "dlaf/matrix/matrix.h"
 #include "dlaf/matrix/views.h"
@@ -59,74 +68,76 @@ struct Helpers {
         std::forward<TSender>(t));
   }
 
-  template <class TSender>
-  static auto gemvColumnT(SizeType first_row_tile,
-                          pika::shared_future<matrix::Tile<const T, Device::CPU>> tile_vi,
-                          pika::shared_future<common::internal::vector<T>>& taus, TSender&& tile_t) {
-    namespace ex = pika::execution::experimental;
+  static auto gemv_func(const SizeType first_row_tile, const matrix::Tile<const T, Device::CPU>& tile_v,
+                        const common::internal::vector<T>& taus,
+                        matrix::Tile<T, Device::CPU> tile_t) noexcept {
+    const SizeType k = tile_t.size().cols();
+    DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k);
+    DLAF_ASSERT(taus.size() == k, taus.size(), k);
 
-    auto gemv_func = [first_row_tile](const auto& tile_v, const auto& taus, auto&& tile_t) noexcept {
-      const SizeType k = tile_t.size().cols();
-      DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k);
-      DLAF_ASSERT(taus.size() == k, taus.size(), k);
+    for (SizeType j = 0; j < k; ++j) {
+      const T tau = taus[j];
 
-      for (SizeType j = 0; j < k; ++j) {
-        const T tau = taus[j];
+      const TileElementIndex t_start{0, j};
 
-        const TileElementIndex t_start{0, j};
+      // Position of the 1 in the diagonal in the current column.
+      SizeType i_diag = j - first_row_tile;
+      const SizeType first_element_in_col = std::max<SizeType>(0, i_diag);
 
-        // Position of the 1 in the diagonal in the current column.
-        SizeType i_diag = j - first_row_tile;
-        const SizeType first_element_in_col = std::max<SizeType>(0, i_diag);
+      // Break if the reflector starts in the next tile.
+      if (i_diag >= tile_v.size().rows())
+        break;
 
-        // Break if the reflector starts in the next tile.
-        if (i_diag >= tile_v.size().rows())
-          break;
+      // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j)
+      // [j x 1] = [(n-j) x j]* . [(n-j) x 1]
+      TileElementIndex va_start{first_element_in_col, 0};
+      TileElementIndex vb_start{first_element_in_col, j};
+      TileElementSize va_size{tile_v.size().rows() - first_element_in_col, j};
 
-        // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j)
-        // [j x 1] = [(n-j) x j]* . [(n-j) x 1]
-        TileElementIndex va_start{first_element_in_col, 0};
-        TileElementIndex vb_start{first_element_in_col, j};
-        TileElementSize va_size{tile_v.size().rows() - first_element_in_col, j};
+      if (i_diag >= 0) {
+        tile_t({j, j}) = tau;
+      }
 
-        if (i_diag >= 0) {
-          tile_t({j, j}) = tau;
-        }
+      blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau,
+                 tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start), 1);
+    }
+    return tile_t;
+  };
+
+  template <class TSender>
+  static auto gemvColumnT(SizeType first_row_tile,
+                          pika::shared_future<matrix::Tile<const T, Device::CPU>> tile_vi,
+                          pika::shared_future<common::internal::vector<T>>& taus, TSender&& tile_t) {
+    namespace ex = pika::execution::experimental;
 
-        blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau,
-                   tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start),
-                   1);
-      }
-      return std::move(tile_t);
-    };
     return dlaf::internal::transform(dlaf::internal::Policy<Backend::MC>(
                                          pika::execution::thread_priority::high),
-                                     std::move(gemv_func),
-                                     ex::when_all(dlaf::internal::keepFuture(tile_vi),
+                                     gemv_func,
+                                     ex::when_all(ex::just(first_row_tile),
+                                                  dlaf::internal::keepFuture(tile_vi),
                                                   dlaf::internal::keepFuture(taus),
                                                   std::forward<TSender>(tile_t)));
   }
 
+  // Update each column (in order) t = T . t
+  // remember that T is upper triangular, so it is possible to use TRMV
+  static void trmv_func(matrix::Tile<T, Device::CPU>& tile_t) {
+    for (SizeType j = 0; j < tile_t.size().cols(); ++j) {
+      const TileElementIndex t_start{0, j};
+      const TileElementSize t_size{j, 1};
+
+      blas::trmv(blas::Layout::ColMajor, blas::Uplo::Upper, blas::Op::NoTrans, blas::Diag::NonUnit,
+                 t_size.rows(), tile_t.ptr(), tile_t.ld(), tile_t.ptr(t_start), 1);
+    }
+  }
+
   template <class TSender>
   static auto trmvUpdateColumn(TSender&& tile_t) noexcept {
     namespace ex = pika::execution::experimental;
-    // Update each column (in order) t = T . t
-    // remember that T is upper triangular, so it is possible to use TRMV
-    auto trmv_func = [](matrix::Tile<T, Device::CPU>&& tile_t) {
-      for (SizeType j = 0; j < tile_t.size().cols(); ++j) {
-        const TileElementIndex t_start{0, j};
-        const TileElementSize t_size{j, 1};
-
-        blas::trmv(blas::Layout::ColMajor, blas::Uplo::Upper, blas::Op::NoTrans, blas::Diag::NonUnit,
-                   t_size.rows(), tile_t.ptr(), tile_t.ld(), tile_t.ptr(t_start), 1);
-      }
-      // TODO: Why return if the tile is unused?
-      return std::move(tile_t);
-    };
     return dlaf::internal::transform(dlaf::internal::Policy<Backend::MC>(
                                          pika::execution::thread_priority::high),
-                                     std::move(trmv_func), std::forward<TSender>(tile_t));
+                                     trmv_func, std::forward<TSender>(tile_t));
   }
 };
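Taken together, gemv_func and trmv_func implement the classic xLARFT recurrence. The standalone sketch below mirrors the same two steps on a plain column-major array, with no DLA-Future or BLAS dependency (function and variable names are illustrative only). As the panel documentation above requires, it assumes V stores the unit diagonal of each reflector explicitly:

    #include <cstddef>
    #include <vector>

    // Single-threaded model of the T-factor construction:
    //   step 1 (gemv): T(0:j, j)  = -tau(j) . V(:, 0:j)^T . V(:, j)
    //   step 2 (trmv): T(0:j, j) := T(0:j, 0:j) . T(0:j, j)
    // V is column-major m-by-k; T is returned column-major k-by-k, upper triangular.
    std::vector<double> t_factor_reference(const std::vector<double>& V, std::size_t m,
                                           std::size_t k, const std::vector<double>& tau) {
      std::vector<double> T(k * k, 0.0);
      for (std::size_t j = 0; j < k; ++j) {
        T[j + j * k] = tau[j];  // T(j,j) = tau(j)
        // step 1: matrix-vector product against the earlier reflectors
        for (std::size_t c = 0; c < j; ++c)
          for (std::size_t r = 0; r < m; ++r)
            T[c + j * k] -= tau[j] * V[r + c * m] * V[r + j * m];
        // step 2: multiply by the already-final upper triangular block T(0:j, 0:j);
        // increasing r is safe in place because row r only reads rows >= r
        for (std::size_t r = 0; r < j; ++r) {
          double acc = 0.0;
          for (std::size_t c = r; c < j; ++c)
            acc += T[r + c * k] * T[c + j * k];
          T[r + j * k] = acc;
        }
      }
      return T;
    }

Each worker in the bulk below accumulates step 1 for its share of the panel tiles into a private copy of t; the copies are then summed and step 2 runs once, which is why trmv_func stays single-threaded.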
@@ -234,20 +245,118 @@
 #endif
 }
 
-template <Backend backend, Device device, class T>
-void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>& hh_panel,
-                                          pika::shared_future<common::internal::vector<T>> taus,
-                                          pika::future<matrix::Tile<T, device>> t) {
+template <class T>
+void QR_Tfactor<Backend::MC, Device::CPU, T>::call(matrix::Panel<Coord::Col, T, Device::CPU>& hh_panel,
+                                                   pika::shared_future<common::internal::vector<T>> taus,
+                                                   pika::future<matrix::Tile<T, Device::CPU>> t) {
+  constexpr auto B = Backend::MC;
+  constexpr auto D = Device::CPU;
+
+  namespace ex = pika::execution::experimental;
+
+  // Fast return in case of no reflectors
+  if (hh_panel.getWidth() == 0)
+    return;
+
+  std::vector<decltype(hh_panel.read_sender(std::declval<LocalTileIndex>()))> panel_tiles;
+  for (const auto idx : hh_panel.iteratorLocal())
+    panel_tiles.push_back(hh_panel.read_sender(idx));
+
+  // Note:
+  // T factor is an upper triangular square matrix, built column by column
+  // with taus values on the diagonal
+  //
+  // T(j,j) = tau(j)
+  //
+  // and in the upper triangular part the following formula applies
+  //
+  // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j)
+  //
+  //
+  // The result is achieved in two main steps:
+  // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j)
+  // 2) T(0:j, j) = T(0:j, 0:j) . t
+
+  const SizeType v_start = hh_panel.offsetElement();
+  const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows();
+  const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row();
+
+  const std::size_t nthreads = getTFactorNWorkers();
+  ex::start_detached(
+      ex::when_all(ex::just(std::make_shared<pika::barrier<>>(nthreads)),
+                   ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) |
+      ex::let_value([=](auto& barrier_ptr, auto& panel, const common::internal::vector<T>& taus,
+                        matrix::Tile<T, D>& t) {
+        matrix::Matrix<T, D> t_all({t.size().rows() * to_SizeType(nthreads - 1), t.size().cols()},
+                                   t.size());
+        return ex::when_all_vector(
+                   select(t_all, common::iterate_range2d(t_all.distribution().localNrTiles()))) |
+               ex::transfer(
+                   dlaf::internal::getBackendScheduler<B>(pika::execution::thread_priority::high)) |
+               ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus,
+                                   &panel](const std::size_t index,
+                                           std::vector<matrix::Tile<T, D>>& t_all) {
+                 using Helpers = tfactor_l::Helpers<B, D, T>;
+
+                 tile::internal::set0(index == 0 ? t : t_all[index - 1]);
+
+                 // 1st step
+                 // compute the column partial result `t` (multi-threaded)
+                 // First we compute the matrix-vector multiplication for each column
+                 // -tau(j) . V(j:, 0:j)* . V(j:, j)
+                 const std::size_t chunk_size = util::ceilDiv(panel.size(), nthreads);
+                 const std::size_t begin = index * chunk_size;
+                 const std::size_t end = std::min(index * chunk_size + chunk_size, panel.size());
+
+                 for (std::size_t i = begin; i < end; ++i) {
+                   const matrix::Tile<const T, D>& tile_v = panel[i].get();
+
+                   const SizeType first_row_tile =
+                       std::max<SizeType>(0, (panelRowBegin + to_SizeType(i)) * bsRows - v_start);
+
+                   if (index == 0)
+                     t = Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t));
+                   else
+                     t_all[index - 1] =
+                         Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t_all[index - 1]));
+                 }
+
+                 barrier_ptr->arrive_and_wait();
+
+                 // (single-threaded)
+                 if (index == 0) {
+                   // reduce
+                   for (auto& partial_t : t_all)
+                     tile::internal::add(T(1), partial_t, t);
+
+                   // 2nd step
+                   // compute the T factor, by performing the last step on each column
+                   // (single-threaded) each column depends on the previous part (all reflectors
+                   // that come before) so it is performed sequentially
+                   Helpers::trmv_func(t);
+                 }
+               });
+      }));
+}
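The bulk splits the panel tiles into contiguous chunks, one per worker; e.g. with panel.size() == 10 and nthreads == 4, ceilDiv gives chunk_size == 3 and the workers cover [0,3), [3,6), [6,9), [9,10). A standalone sketch of exactly that partitioning (values illustrative):

    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    // Model of the chunk partitioning performed inside ex::bulk above.
    int main() {
      const std::size_t ntiles = 10, nworkers = 4;
      const std::size_t chunk_size = (ntiles + nworkers - 1) / nworkers;  // util::ceilDiv
      for (std::size_t index = 0; index < nworkers; ++index) {
        const std::size_t begin = index * chunk_size;
        const std::size_t end = std::min(begin + chunk_size, ntiles);
        std::cout << "worker " << index << ": tiles [" << begin << ", " << end << ")\n";
      }
    }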
+
+#ifdef DLAF_WITH_GPU
+template <class T>
+void QR_Tfactor<Backend::GPU, Device::GPU, T>::call(matrix::Panel<Coord::Col, T, Device::GPU>& hh_panel,
+                                                    pika::shared_future<common::internal::vector<T>> taus,
+                                                    pika::future<matrix::Tile<T, Device::GPU>> t) {
+  constexpr auto B = Backend::GPU;
+  constexpr auto D = Device::GPU;
+
   namespace ex = pika::execution::experimental;
 
-  using Helpers = tfactor_l::Helpers<backend, device, T>;
+  using Helpers = tfactor_l::Helpers<B, D, T>;
 
   // Fast return in case of no reflectors
   if (hh_panel.getWidth() == 0)
     return;
 
   const auto v_start = hh_panel.offsetElement();
 
-  ex::unique_any_sender<matrix::Tile<T, device>> t_local = Helpers::set0(std::move(t));
+  ex::unique_any_sender<matrix::Tile<T, D>> t_local = Helpers::set0(std::move(t));
 
   // Note:
   // T factor is an upper triangular square matrix, built column by column
@@ -283,15 +392,127 @@ void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>&
   // so it is performed sequentially
   ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local)));
 }
+#endif
+
+template <class T>
+void QR_Tfactor<Backend::MC, Device::CPU, T>::call(
+    matrix::Panel<Coord::Col, T, Device::CPU>& hh_panel,
+    pika::shared_future<common::internal::vector<T>> taus, pika::future<matrix::Tile<T, Device::CPU>> t,
+    common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
+  constexpr auto B = Backend::MC;
+  constexpr auto D = Device::CPU;
 
-template <Backend backend, Device device, class T>
-void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>& hh_panel,
-                                          pika::shared_future<common::internal::vector<T>> taus,
-                                          pika::future<matrix::Tile<T, device>> t,
-                                          common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
   namespace ex = pika::execution::experimental;
 
-  using Helpers = tfactor_l::Helpers<backend, device, T>;
+  // Fast return in case of no reflectors
+  if (hh_panel.getWidth() == 0)
+    return;
+
+  std::vector<decltype(hh_panel.read_sender(std::declval<LocalTileIndex>()))> panel_tiles;
+  for (const auto idx : hh_panel.iteratorLocal())
+    panel_tiles.push_back(hh_panel.read_sender(idx));
+
+  // Note:
+  // T factor is an upper triangular square matrix, built column by column
+  // with taus values on the diagonal
+  //
+  // T(j,j) = tau(j)
+  //
+  // and in the upper triangular part the following formula applies
+  //
+  // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j)
+  //
+  //
+  // The result is achieved in two main steps:
+  // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j)
+  // 2) T(0:j, j) = T(0:j, 0:j) . t
+
+  const auto dist = hh_panel.parentDistribution();
+
+  const SizeType v_start = hh_panel.offsetElement();
+  const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows();
+  const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row();
+
+  const std::size_t nthreads = getTFactorNWorkers();
+  ex::start_detached(
+      ex::when_all(ex::just(std::make_shared<pika::barrier<>>(nthreads)),
+                   ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t),
+                   mpi_col_task_chain()) |
+      ex::let_value([=](auto& barrier_ptr, auto&& panel, const common::internal::vector<T>& taus,
+                        matrix::Tile<T, D>& t, auto&& pcomm) {
+        matrix::Matrix<T, D> t_all({t.size().rows() * to_SizeType(nthreads - 1), t.size().cols()},
+                                   t.size());
+        return ex::when_all_vector(
+                   select(t_all, common::iterate_range2d(t_all.distribution().localNrTiles()))) |
+               ex::transfer(
+                   dlaf::internal::getBackendScheduler<B>(pika::execution::thread_priority::high)) |
+               ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus, &panel,
+                                   &pcomm](const std::size_t index,
+                                           std::vector<matrix::Tile<T, D>>& t_all) {
+                 using Helpers = tfactor_l::Helpers<B, D, T>;
+
+                 tile::internal::set0(index == 0 ? t : t_all[index - 1]);
+
+                 // 1st step
+                 // compute the column partial result `t` (multi-threaded)
+                 // First we compute the matrix-vector multiplication for each column
+                 // -tau(j) . V(j:, 0:j)* . V(j:, j)
+                 const std::size_t chunk_size = util::ceilDiv(panel.size(), nthreads);
+                 const std::size_t begin = index * chunk_size;
+                 const std::size_t end = std::min(index * chunk_size + chunk_size, panel.size());
+
+                 for (std::size_t i = begin; i < end; ++i) {
+                   const matrix::Tile<const T, D>& tile_v = panel[i].get();
+
+                   const SizeType first_row_tile =
+                       std::max<SizeType>(0, dist.template globalTileFromLocalTile<Coord::Row>(
+                                                 panelRowBegin + to_SizeType(i)) *
+                                                     bsRows -
+                                                 v_start);
+
+                   if (index == 0)
+                     t = Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t));
+                   else
+                     t_all[index - 1] =
+                         Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t_all[index - 1]));
+                 }
+
+                 barrier_ptr->arrive_and_wait();
+
+                 // (single-threaded)
+                 if (index == 0) {
+                   // reduce
+                   for (auto& partial_t : t_all)
+                     tile::internal::add(T(1), partial_t, t);
+
+                   // at this point each rank has its partial result for each column
+                   // so, let's reduce the results (on all ranks, so that everyone can
+                   // independently compute the T factor)
+                   if (pcomm.ref().size() > 1)
+                     comm::sync::allReduceInPlace(pcomm.ref(), MPI_SUM, common::make_data(t));
+
+                   // 2nd step
+                   // compute the T factor, by performing the last step on each column
+                   // (single-threaded) each column depends on the previous part (all reflectors
+                   // that come before) so it is performed sequentially
+                   Helpers::trmv_func(t);
+                 }
+               });
+      }));
+}
 
+#ifdef DLAF_WITH_GPU
+template <class T>
+void QR_Tfactor<Backend::GPU, Device::GPU, T>::call(
+    matrix::Panel<Coord::Col, T, Device::GPU>& hh_panel,
+    pika::shared_future<common::internal::vector<T>> taus, pika::future<matrix::Tile<T, Device::GPU>> t,
+    common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
+  constexpr auto B = Backend::GPU;
+  constexpr auto D = Device::GPU;
+
+  namespace ex = pika::execution::experimental;
+
+  using Helpers = tfactor_l::Helpers<B, D, T>;
 
   // Fast return in case of no reflectors
   if (hh_panel.getWidth() == 0)
@@ -300,7 +521,7 @@ void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>&
   const auto v_start = hh_panel.offsetElement();
   auto dist = hh_panel.parentDistribution();
 
-  ex::unique_any_sender<matrix::Tile<T, device>> t_local = Helpers::set0(std::move(t));
+  ex::unique_any_sender<matrix::Tile<T, D>> t_local = Helpers::set0(std::move(t));
 
   // Note:
   // T factor is an upper triangular square matrix, built column by column
@@ -342,5 +563,5 @@ void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>&
   // so it is performed sequentially
   ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local)));
 }
-
+#endif
 }
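comm::sync::allReduceInPlace above follows the usual MPI allreduce pattern: after the node-local sums, every rank contributes its partial T and receives the global sum, so the final trmv step can run redundantly on each rank instead of being broadcast afterwards. A minimal standalone sketch of that pattern in plain MPI (buffer size illustrative):

    #include <mpi.h>

    #include <vector>

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      // Each rank holds a partial sum of the k x k entries of T (k = 4 here).
      std::vector<double> t_partial(4 * 4, 1.0);
      // After this call every rank owns the complete sum and can run trmv locally.
      MPI_Allreduce(MPI_IN_PLACE, t_partial.data(), static_cast<int>(t_partial.size()),
                    MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
      MPI_Finalize();
    }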
diff --git a/include/dlaf/tune.h b/include/dlaf/tune.h
index 072a8cf0e6..548e659d2d 100644
--- a/include/dlaf/tune.h
+++ b/include/dlaf/tune.h
@@ -9,6 +9,8 @@
 //
 #pragma once
 
+#include <cstddef>
+
 #include <pika/runtime.hpp>
 
 #include <dlaf/types.h>
 
@@ -17,6 +19,7 @@ namespace dlaf {
 ///
 /// Holds the value of the parameters that can be used to tune DLA-Future.
 /// - red2band_panel_nworkers: number of threads to use for computing the panel in the reduction to band algorithm.
+/// - tfactor_nworkers: number of threads to use for computing the T factor.
 /// - eigensolver_min_band: The minimum value to start looking for a divisor of the block size.
 ///   Set with --dlaf:eigensolver-min-band or env variable DLAF_EIGENSOLVER_MIN_BAND.
 /// - band_to_tridiag_1d_block_size_base:
@@ -29,8 +32,11 @@ namespace dlaf {
 ///   DLAF_BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE.
 /// Note to developers: Users can change these values, therefore consistency has to be ensured by algorithms.
 struct TuneParameters {
-  size_t red2band_panel_nworkers =
-      std::max<size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);
+  std::size_t red2band_panel_nworkers =
+      std::max<std::size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);
+
+  std::size_t tfactor_nworkers =
+      std::max<std::size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);
 
   SizeType eigensolver_min_band = 100;
   SizeType band_to_tridiag_1d_block_size_base = 8192;
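The new tfactor_nworkers member follows the same convention as red2band_panel_nworkers: a plain field on TuneParameters, read once per factorization through getTFactorNWorkers(). Assuming the usual mutable dlaf::getTuneParameters() accessor declared alongside this struct, it can be overridden at runtime before the solver runs; a hedged sketch:

    #include "dlaf/tune.h"

    // Illustrative only: cap the T-factor computation at 4 worker threads.
    // getTFactorNWorkers() will still clamp the value to [1, os_threads - 1].
    void limit_tfactor_workers() {
      dlaf::getTuneParameters().tfactor_nworkers = 4;
    }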