From 200a1bd1c30f2fbf3c4a090c853729a93d9dd121 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Wed, 8 Feb 2023 16:04:14 +0100 Subject: [PATCH 01/13] start factoring out computing functions --- include/dlaf/factorization/qr/t_factor_impl.h | 104 +++++++++--------- 1 file changed, 54 insertions(+), 50 deletions(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index eb1e9bdfcf..c366b720e2 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -59,74 +59,78 @@ struct Helpers { std::forward(t)); } + static auto gemv_func(const SizeType first_row_tile, const matrix::Tile& tile_v, + const common::internal::vector& taus, + matrix::Tile tile_t) noexcept { + const SizeType k = tile_t.size().cols(); + DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k); + DLAF_ASSERT(taus.size() == k, taus.size(), k); + + for (SizeType j = 0; j < k; ++j) { + const T tau = taus[j]; + + const TileElementIndex t_start{0, j}; + + // Position of the 1 in the diagonal in the current column. + SizeType i_diag = j - first_row_tile; + const SizeType first_element_in_col = std::max(0, i_diag); + + // Break if the reflector starts in the next tile. + if (i_diag >= tile_v.size().rows()) + break; + + // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j) + // [j x 1] = [(n-j) x j]* . [(n-j) x 1] + TileElementIndex va_start{first_element_in_col, 0}; + TileElementIndex vb_start{first_element_in_col, j}; + TileElementSize va_size{tile_v.size().rows() - first_element_in_col, j}; + + if (i_diag >= 0) { + tile_t({j, j}) = tau; + } + + blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau, + tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start), 1); + } + return std::move(tile_t); + }; + template static auto gemvColumnT(SizeType first_row_tile, pika::shared_future> tile_vi, pika::shared_future>& taus, TSender&& tile_t) { namespace ex = pika::execution::experimental; - auto gemv_func = [first_row_tile](const auto& tile_v, const auto& taus, auto&& tile_t) noexcept { - const SizeType k = tile_t.size().cols(); - DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k); - DLAF_ASSERT(taus.size() == k, taus.size(), k); - - for (SizeType j = 0; j < k; ++j) { - const T tau = taus[j]; - - const TileElementIndex t_start{0, j}; - - // Position of the 1 in the diagonal in the current column. - SizeType i_diag = j - first_row_tile; - const SizeType first_element_in_col = std::max(0, i_diag); - - // Break if the reflector starts in the next tile. - if (i_diag >= tile_v.size().rows()) - break; - - // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j) - // [j x 1] = [(n-j) x j]* . 
[(n-j) x 1] - TileElementIndex va_start{first_element_in_col, 0}; - TileElementIndex vb_start{first_element_in_col, j}; - TileElementSize va_size{tile_v.size().rows() - first_element_in_col, j}; - - if (i_diag >= 0) { - tile_t({j, j}) = tau; - } - - blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau, - tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start), - 1); - } - return std::move(tile_t); - }; return dlaf::internal::transform(dlaf::internal::Policy( pika::execution::thread_priority::high), - std::move(gemv_func), - ex::when_all(dlaf::internal::keepFuture(tile_vi), + gemv_func, + ex::when_all(ex::just(first_row_tile), + dlaf::internal::keepFuture(tile_vi), dlaf::internal::keepFuture(taus), std::forward(tile_t))); } + // Update each column (in order) t = T . t + // remember that T is upper triangular, so it is possible to use TRMV + static auto trmv_func(matrix::Tile&& tile_t) { + for (SizeType j = 0; j < tile_t.size().cols(); ++j) { + const TileElementIndex t_start{0, j}; + const TileElementSize t_size{j, 1}; + + blas::trmv(blas::Layout::ColMajor, blas::Uplo::Upper, blas::Op::NoTrans, blas::Diag::NonUnit, + t_size.rows(), tile_t.ptr(), tile_t.ld(), tile_t.ptr(t_start), 1); + } + // TODO: Why return if the tile is unused? + return std::move(tile_t); + } + template static auto trmvUpdateColumn(TSender&& tile_t) noexcept { namespace ex = pika::execution::experimental; - // Update each column (in order) t = T . t - // remember that T is upper triangular, so it is possible to use TRMV - auto trmv_func = [](matrix::Tile&& tile_t) { - for (SizeType j = 0; j < tile_t.size().cols(); ++j) { - const TileElementIndex t_start{0, j}; - const TileElementSize t_size{j, 1}; - - blas::trmv(blas::Layout::ColMajor, blas::Uplo::Upper, blas::Op::NoTrans, blas::Diag::NonUnit, - t_size.rows(), tile_t.ptr(), tile_t.ld(), tile_t.ptr(t_start), 1); - } - // TODO: Why return if the tile is unused? 
- return std::move(tile_t); - }; return dlaf::internal::transform(dlaf::internal::Policy( pika::execution::thread_priority::high), - std::move(trmv_func), std::forward(tile_t)); + trmv_func, std::forward(tile_t)); } }; From 002b29a61960f2a639d8c9661233a460c6b72e74 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Wed, 8 Feb 2023 16:04:35 +0100 Subject: [PATCH 02/13] basic bulkification --- include/dlaf/factorization/qr/t_factor_impl.h | 102 +++++++++++++----- 1 file changed, 75 insertions(+), 27 deletions(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index c366b720e2..d86f481dfe 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -10,9 +10,15 @@ #pragma once -#include +#include #include +#include +#include + +#include "dlaf/blas/tile_extensions.h" +#include "dlaf/matrix/tile.h" +#include "dlaf/util_matrix.h" #ifdef DLAF_WITH_GPU #include @@ -238,20 +244,19 @@ struct Helpers { #endif } -template -void QR_Tfactor::call(matrix::Panel& hh_panel, - pika::shared_future> taus, - pika::future> t) { +template +void QR_Tfactor::call(matrix::Panel& hh_panel, + pika::shared_future> taus, + pika::future> t) { namespace ex = pika::execution::experimental; - using Helpers = tfactor_l::Helpers; // Fast return in case of no reflectors if (hh_panel.getWidth() == 0) return; - const auto v_start = hh_panel.offsetElement(); - - ex::unique_any_sender> t_local = Helpers::set0(std::move(t)); + std::vector()))> panel_tiles; + for (const auto idx : hh_panel.iteratorLocal()) + panel_tiles.push_back(hh_panel.read_sender(idx)); // Note: // T factor is an upper triangular square matrix, built column by column @@ -268,24 +273,67 @@ void QR_Tfactor::call(matrix::Panel& // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j) // 2) T(0:j, j) = T(0:j, 0:j) . t - // 1st step: compute the column partial result `t` - // First we compute the matrix vector multiplication for each column - // -tau(j) . V(j:, 0:j)* . 
V(j:, j) - for (const auto& v_i : hh_panel.iteratorLocal()) { - const SizeType first_row_tile = - std::max(0, v_i.row() * hh_panel.parentDistribution().blockSize().rows() - v_start); - - // Note: - // Since we are writing always on the same t, the gemv are serialized - // A possible solution to this would be to have multiple places where to store partial - // results, and then locally reduce them just before the reduce over ranks - t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(v_i), taus, std::move(t_local)); - } - - // 2nd step: compute the T factor, by performing the last step on each column - // each column depends on the previous part (all reflectors that comes before) - // so it is performed sequentially - ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local))); + const SizeType v_start = hh_panel.offsetElement(); + const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows(); + const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row(); + + const SizeType nthreads = std::max(1, (pika::get_num_worker_threads() / 2)); + ex::start_detached( + ex::when_all(ex::just(std::make_shared>(nthreads)), + ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) | + ex::let_value([nthreads, v_start, panelRowBegin, + bsRows](std::shared_ptr>& barrier_ptr, auto&& panel, + const common::internal::vector& taus, matrix::Tile& t) { + matrix::Matrix t_all({t.size().rows() * (nthreads - 1), t.size().cols()}, t.size()); + return ex::when_all_vector( + select(t_all, common::iterate_range2d(t_all.distribution().localNrTiles()))) | + ex::transfer( + dlaf::internal::getBackendScheduler(pika::execution::thread_priority::high)) | + ex::bulk(nthreads, [&barrier_ptr, &t, &taus, &panel, nthreads, v_start, panelRowBegin, + bsRows](const SizeType index, + std::vector>& t_all) { + using Helpers = tfactor_l::Helpers; + + tile::internal::set0(index == 0 ? t : t_all[index - 1]); + + // 1st step + // compute the column partial result `t` (multi-threaded) + // First we compute the matrix vector multiplication for each column + // -tau(j) . V(j:, 0:j)* . 
V(j:, j) + const SizeType chunk_size = util::ceilDiv(panel.size(), nthreads); + const SizeType from = to_SizeType(index) * chunk_size; + const SizeType to = std::min(index * chunk_size + chunk_size, SizeType(panel.size())); + + for (auto i = from; i < to; ++i) { + const matrix::Tile& tile_v = panel[i].get(); + + // std::max(0, v_i.row() * hh_panel.parentDistribution().blockSize().rows() - v_start); + const SizeType first_row_tile = + std::max(0, (panelRowBegin + i) * bsRows - v_start); + + if (index == 0) + t = Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t)); + else + t_all[index - 1] = + Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t_all[index - 1])); + } + + barrier_ptr->arrive_and_wait(); + + // (single-threaded) + if (index == 0) { + // reduce + for (auto& partial_t : t_all) + tile::internal::add(T(1), partial_t, t); + + // 2nd step + // compute the T factor, by performing the last step on each column (single-threaded) + // each column depends on the previous part (all reflectors that comes before) so it + // is performed sequentially + t = Helpers::trmv_func(std::move(t)); + } + }); + })); } template From 1ac6bc2187ed9132bd7c7642a1844cd2a88d5183 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 14 Feb 2023 10:33:50 +0100 Subject: [PATCH 03/13] fix warnings --- include/dlaf/factorization/qr/t_factor_impl.h | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index d86f481dfe..af245b502a 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -98,7 +98,7 @@ struct Helpers { blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau, tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start), 1); } - return std::move(tile_t); + return tile_t; }; template @@ -277,21 +277,20 @@ void QR_Tfactor::call(matrix::Panel& hh_panel, const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows(); const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row(); - const SizeType nthreads = std::max(1, (pika::get_num_worker_threads() / 2)); + const size_t nthreads = std::max(1, (pika::get_num_worker_threads() / 2)); ex::start_detached( ex::when_all(ex::just(std::make_shared>(nthreads)), ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) | - ex::let_value([nthreads, v_start, panelRowBegin, - bsRows](std::shared_ptr>& barrier_ptr, auto&& panel, - const common::internal::vector& taus, matrix::Tile& t) { - matrix::Matrix t_all({t.size().rows() * (nthreads - 1), t.size().cols()}, t.size()); + ex::let_value([=](std::shared_ptr>& barrier_ptr, auto&& panel, + const common::internal::vector& taus, matrix::Tile& t) { + matrix::Matrix t_all({t.size().rows() * to_SizeType(nthreads - 1), t.size().cols()}, + t.size()); return ex::when_all_vector( select(t_all, common::iterate_range2d(t_all.distribution().localNrTiles()))) | ex::transfer( dlaf::internal::getBackendScheduler(pika::execution::thread_priority::high)) | - ex::bulk(nthreads, [&barrier_ptr, &t, &taus, &panel, nthreads, v_start, panelRowBegin, - bsRows](const SizeType index, - std::vector>& t_all) { + ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus, + &panel](const size_t index, std::vector>& t_all) { using Helpers = tfactor_l::Helpers; tile::internal::set0(index == 0 ? 
t : t_all[index - 1]); @@ -300,16 +299,15 @@ void QR_Tfactor::call(matrix::Panel& hh_panel, // compute the column partial result `t` (multi-threaded) // First we compute the matrix vector multiplication for each column // -tau(j) . V(j:, 0:j)* . V(j:, j) - const SizeType chunk_size = util::ceilDiv(panel.size(), nthreads); - const SizeType from = to_SizeType(index) * chunk_size; - const SizeType to = std::min(index * chunk_size + chunk_size, SizeType(panel.size())); + const size_t chunk_size = util::ceilDiv(panel.size(), nthreads); + const size_t begin = index * chunk_size; + const size_t end = std::min(index * chunk_size + chunk_size, panel.size()); - for (auto i = from; i < to; ++i) { + for (size_t i = begin; i < end; ++i) { const matrix::Tile& tile_v = panel[i].get(); - // std::max(0, v_i.row() * hh_panel.parentDistribution().blockSize().rows() - v_start); const SizeType first_row_tile = - std::max(0, (panelRowBegin + i) * bsRows - v_start); + std::max(0, (panelRowBegin + to_SizeType(i)) * bsRows - v_start); if (index == 0) t = Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t)); @@ -327,9 +325,9 @@ void QR_Tfactor::call(matrix::Panel& hh_panel, tile::internal::add(T(1), partial_t, t); // 2nd step - // compute the T factor, by performing the last step on each column (single-threaded) - // each column depends on the previous part (all reflectors that comes before) so it - // is performed sequentially + // compute the T factor, by performing the last step on each column + // (single-threaded) each column depends on the previous part (all reflectors + // that comes before) so it is performed sequentially t = Helpers::trmv_func(std::move(t)); } }); From b8cbd6248e8d6f8fc1308c09e99c0fd37765b092 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 14 Feb 2023 12:07:12 +0100 Subject: [PATCH 04/13] split algorithms variant for different backends --- include/dlaf/factorization/qr.h | 2 +- include/dlaf/factorization/qr/api.h | 73 +++++------------- include/dlaf/factorization/qr/t_factor_impl.h | 75 ++++++++++++++++--- 3 files changed, 87 insertions(+), 63 deletions(-) diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h index 189987cc6f..6b4216dbf0 100644 --- a/include/dlaf/factorization/qr.h +++ b/include/dlaf/factorization/qr.h @@ -46,7 +46,7 @@ void computeTFactor(matrix::Panel& hh_panel, pika::shared_future> taus, pika::future> t, common::Pipeline& mpi_col_task_chain) { - QR_Tfactor::call(hh_panel, taus, std::move(t), mpi_col_task_chain); + QR_TfactorDistributed::call(hh_panel, taus, std::move(t), mpi_col_task_chain); } } diff --git a/include/dlaf/factorization/qr/api.h b/include/dlaf/factorization/qr/api.h index e803219597..2c1f9a6dfe 100644 --- a/include/dlaf/factorization/qr/api.h +++ b/include/dlaf/factorization/qr/api.h @@ -24,60 +24,26 @@ template struct QR {}; template -struct QR_Tfactor { - /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k - /// elementary reflectors. - /// - /// It is similar to what xLARFT in LAPACK does. - /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start, - /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H - /// block of reflector, such that - /// - /// H = I - V . T . V* - /// - /// where H = H1 . H2 . ... . 
Hk - /// - /// in which Hi represents a single elementary reflector transformation - /// - /// @param k the number of elementary reflectors to use (from the beginning of the tile) - /// @param v where the elementary reflectors are stored - /// @param v_start tile in @p v where the column of reflectors starts - /// @param taus array of taus, associated with the related elementary reflector - /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size - /// TileElementSize(k, k) - /// - /// @pre k <= t.get().size().rows && k <= t.get().size().cols() - /// @pre k >= 0 - /// @pre v_start.isIn(v.nrTiles()) - static void call(matrix::Panel& panel_view, +struct QR_Tfactor; + +template +struct QR_Tfactor { + static void call(matrix::Panel& panel_view, + pika::shared_future> taus, + pika::future> t); +}; + +#ifdef DLAF_WITH_GPU +template +struct QR_Tfactor { + static void call(matrix::Panel& panel_view, pika::shared_future> taus, - pika::future> t); + pika::future> t); +}; +#endif - /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k - /// elementary reflectors. - /// - /// It is similar to what xLARFT in LAPACK does. - /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start, - /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H - /// block of reflector, such that - /// - /// H = I - V . T . V* - /// - /// where H = H1 . H2 . ... . Hk - /// - /// in which Hi represents a single elementary reflector transformation - /// - /// @param k the number of elementary reflectors to use (from the beginning of the tile) - /// @param v where the elementary reflectors are stored - /// @param v_start tile in @p v where the column of reflectors starts - /// @param taus array of taus, associated with the related elementary reflector - /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size - /// TileElementSize(k, k) - /// @param mpi_col_task_chain where internal communications are issued - /// - /// @pre k <= t.get().size().rows && k <= t.get().size().cols() - /// @pre k >= 0 - /// @pre v_start.isIn(v.nrTiles()) +template +struct QR_TfactorDistributed { static void call(matrix::Panel& hh_panel, pika::shared_future> taus, pika::future> t, @@ -86,7 +52,8 @@ struct QR_Tfactor { /// ---- ETI #define DLAF_FACTORIZATION_QR_TFACTOR_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \ - KWORD template struct QR_Tfactor; + KWORD template struct QR_Tfactor; \ + KWORD template struct QR_TfactorDistributed; DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, float) DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, double) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index af245b502a..330c80a20a 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -244,10 +244,13 @@ struct Helpers { #endif } -template -void QR_Tfactor::call(matrix::Panel& hh_panel, - pika::shared_future> taus, - pika::future> t) { +template +void QR_Tfactor::call(matrix::Panel& hh_panel, + pika::shared_future> taus, + pika::future> t) { + constexpr auto B = Backend::MC; + constexpr auto D = Device::CPU; + namespace ex = pika::execution::experimental; // Fast return in case of no reflectors @@ -334,11 +337,66 @@ void QR_Tfactor::call(matrix::Panel& hh_panel, })); } +#ifdef DLAF_WITH_GPU +template +void 
QR_Tfactor::call(matrix::Panel& hh_panel, + pika::shared_future> taus, + pika::future> t) { + constexpr auto B = Backend::GPU; + constexpr auto D = Device::GPU; + + namespace ex = pika::execution::experimental; + + using Helpers = tfactor_l::Helpers; + // Fast return in case of no reflectors + if (hh_panel.getWidth() == 0) + return; + + const auto v_start = hh_panel.offsetElement(); + + ex::unique_any_sender> t_local = Helpers::set0(std::move(t)); + + // Note: + // T factor is an upper triangular square matrix, built column by column + // with taus values on the diagonal + // + // T(j,j) = tau(j) + // + // and in the upper triangular part the following formula applies + // + // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j) + // + // + // The result is achieved in two main steps: + // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j) + // 2) T(0:j, j) = T(0:j, 0:j) . t + + // 1st step: compute the column partial result `t` + // First we compute the matrix vector multiplication for each column + // -tau(j) . V(j:, 0:j)* . V(j:, j) + for (const auto& v_i : hh_panel.iteratorLocal()) { + const SizeType first_row_tile = + std::max(0, v_i.row() * hh_panel.parentDistribution().blockSize().rows() - v_start); + + // Note: + // Since we are writing always on the same t, the gemv are serialized + // A possible solution to this would be to have multiple places where to store partial + // results, and then locally reduce them just before the reduce over ranks + t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(v_i), taus, std::move(t_local)); + } + + // 2nd step: compute the T factor, by performing the last step on each column + // each column depends on the previous part (all reflectors that comes before) + // so it is performed sequentially + ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local))); +} +#endif + template -void QR_Tfactor::call(matrix::Panel& hh_panel, - pika::shared_future> taus, - pika::future> t, - common::Pipeline& mpi_col_task_chain) { +void QR_TfactorDistributed::call( + matrix::Panel& hh_panel, + pika::shared_future> taus, pika::future> t, + common::Pipeline& mpi_col_task_chain) { namespace ex = pika::execution::experimental; using Helpers = tfactor_l::Helpers; @@ -392,5 +450,4 @@ void QR_Tfactor::call(matrix::Panel& // so it is performed sequentially ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local))); } - } From 247ab27b486ab2aa1438ddb98ef345b51dd536ab Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 14 Feb 2023 14:34:14 +0100 Subject: [PATCH 05/13] first try fixing doc --- include/dlaf/factorization/qr.h | 56 +++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h index 6b4216dbf0..ac8f8db9ea 100644 --- a/include/dlaf/factorization/qr.h +++ b/include/dlaf/factorization/qr.h @@ -27,13 +27,28 @@ namespace dlaf::factorization::internal { /// H0 H1 H2 ... HK-1 /// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1, /// it has to be set correctly in the panel (0s as well). - +/// +/// It is similar to what xLARFT in LAPACK does. +/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values +/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that +/// +/// H = I - V . T . V* +/// +/// where H = H1 . H2 . ... . Hk +/// +/// in which Hi represents a single elementary reflector transformation. 
+/// /// A Storage-Efficient WY Representation for Products of Householder Transformations. /// Schreiber, Robert & VanLoan, Charles. (1989) /// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005. /// -/// @pre taus contains a vector with k elements -/// @pre t contains a (k x k) tile +/// @param hh_panel where the elementary reflectors are stored +/// @param taus array of taus, associated with the related elementary reflector +/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size +/// TileElementSize(k, k) +/// @param mpi_col_task_chain where internal communications are issued +/// +/// @pre hh_pane.getWidth() <= t.get().size().rows && hh_panel.size().getWidth() <= t.get().size().cols() template void computeTFactor(matrix::Panel& hh_panel, pika::shared_future> taus, @@ -41,6 +56,41 @@ void computeTFactor(matrix::Panel& hh_panel, QR_Tfactor::call(hh_panel, taus, std::move(t)); } +/// Forms the triangular factor T of a block of reflectors H, which is defined as a product of +/// k := hh_panel.getWidth() elementary reflectors. +/// +/// hh_panel should have the following form +/// H0 0 0 ... 0 +/// . H1 0 ... 0 +/// . . H2 ... 0 +/// . . . ... 0 +/// . . . ... HK-1 +/// . . . ... . +/// H0 H1 H2 ... HK-1 +/// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1, +/// it has to be set correctly in the panel (0s as well). +/// +/// It is similar to what xLARFT in LAPACK does. +/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values +/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that +/// +/// H = I - V . T . V* +/// +/// where H = H1 . H2 . ... . Hk +/// +/// in which Hi represents a single elementary reflector transformation. +/// +/// A Storage-Efficient WY Representation for Products of Householder Transformations. +/// Schreiber, Robert & VanLoan, Charles. (1989) +/// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005. 
+/// +/// @param hh_panel where the elementary reflectors are stored +/// @param taus array of taus, associated with the related elementary reflector +/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size +/// TileElementSize(k, k) +/// @param mpi_col_task_chain where internal communications are issued +/// +/// @pre hh_pane.getWidth() <= t.get().size().rows && hh_panel.size().getWidth() <= t.get().size().cols() template void computeTFactor(matrix::Panel& hh_panel, pika::shared_future> taus, From d0e636dfaa2760a3489b3abdc838bd5f8dfe4a8d Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 14 Feb 2023 14:39:13 +0100 Subject: [PATCH 06/13] minor change --- include/dlaf/factorization/qr/t_factor_impl.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index 330c80a20a..5e5b00da36 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -392,14 +392,14 @@ void QR_Tfactor::call(matrix::Panel -void QR_TfactorDistributed::call( - matrix::Panel& hh_panel, - pika::shared_future> taus, pika::future> t, - common::Pipeline& mpi_col_task_chain) { +template +void QR_TfactorDistributed::call(matrix::Panel& hh_panel, + pika::shared_future> taus, + pika::future> t, + common::Pipeline& mpi_col_task_chain) { namespace ex = pika::execution::experimental; - using Helpers = tfactor_l::Helpers; + using Helpers = tfactor_l::Helpers; // Fast return in case of no reflectors if (hh_panel.getWidth() == 0) @@ -408,7 +408,7 @@ void QR_TfactorDistributed::call( const auto v_start = hh_panel.offsetElement(); auto dist = hh_panel.parentDistribution(); - ex::unique_any_sender> t_local = Helpers::set0(std::move(t)); + ex::unique_any_sender> t_local = Helpers::set0(std::move(t)); // Note: // T factor is an upper triangular square matrix, built column by column From bbdb50fed3ba45f8c90cc9bbd0a45bcf875046b0 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Fri, 17 Feb 2023 08:23:45 +0100 Subject: [PATCH 07/13] tfactor-bulk-dist --- include/dlaf/factorization/qr/t_factor_impl.h | 103 +++++++++++++----- 1 file changed, 74 insertions(+), 29 deletions(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index 5e5b00da36..7fbc0c2b6a 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -33,6 +33,7 @@ #include "dlaf/common/range2d.h" #include "dlaf/common/vector.h" #include "dlaf/communication/kernels/all_reduce.h" +#include "dlaf/communication/sync/all_reduce.h" #include "dlaf/lapack/tile.h" #include "dlaf/matrix/matrix.h" #include "dlaf/matrix/views.h" @@ -284,8 +285,8 @@ void QR_Tfactor::call(matrix::Panel>(nthreads)), ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) | - ex::let_value([=](std::shared_ptr>& barrier_ptr, auto&& panel, - const common::internal::vector& taus, matrix::Tile& t) { + ex::let_value([=](auto& barrier_ptr, auto&& panel, const common::internal::vector& taus, + matrix::Tile& t) { matrix::Matrix t_all({t.size().rows() * to_SizeType(nthreads - 1), t.size().cols()}, t.size()); return ex::when_all_vector( @@ -399,16 +400,13 @@ void QR_TfactorDistributed::call(matrix::Panel& hh_pa common::Pipeline& mpi_col_task_chain) { namespace ex = pika::execution::experimental; - using Helpers = tfactor_l::Helpers; - // Fast return in case of no 
reflectors if (hh_panel.getWidth() == 0) return; - const auto v_start = hh_panel.offsetElement(); - auto dist = hh_panel.parentDistribution(); - - ex::unique_any_sender> t_local = Helpers::set0(std::move(t)); + std::vector()))> panel_tiles; + for (const auto idx : hh_panel.iteratorLocal()) + panel_tiles.push_back(hh_panel.read_sender(idx)); // Note: // T factor is an upper triangular square matrix, built column by column @@ -425,29 +423,76 @@ void QR_TfactorDistributed::call(matrix::Panel& hh_pa // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j) // 2) T(0:j, j) = T(0:j, 0:j) . t - // 1st step: compute the column partial result `t` - // First we compute the matrix vector multiplication for each column - // -tau(j) . V(j:, 0:j)* . V(j:, j) - for (const auto& v_i_loc : hh_panel.iteratorLocal()) { - const SizeType v_i = dist.template globalTileFromLocalTile(v_i_loc.row()); - const SizeType first_row_tile = std::max(0, v_i * dist.blockSize().rows() - v_start); + const auto dist = hh_panel.parentDistribution(); - // TODO - // Note: - // Since we are writing always on the same t, the gemv are serialized - // A possible solution to this would be to have multiple places where to store partial - // results, and then locally reduce them just before the reduce over ranks - t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(v_i_loc), taus, std::move(t_local)); - } + const SizeType v_start = hh_panel.offsetElement(); + const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows(); + const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row(); + + const size_t nthreads = std::max(1, (pika::get_num_worker_threads() / 2)); + ex::start_detached( + ex::when_all(ex::just(std::make_shared>(nthreads)), + ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t), + mpi_col_task_chain()) | + ex::let_value([=](auto& barrier_ptr, auto&& panel, const common::internal::vector& taus, + matrix::Tile& t, auto&& pcomm) { + matrix::Matrix t_all({t.size().rows() * to_SizeType(nthreads - 1), t.size().cols()}, + t.size()); + return ex::when_all_vector( + select(t_all, common::iterate_range2d(t_all.distribution().localNrTiles()))) | + ex::transfer( + dlaf::internal::getBackendScheduler(pika::execution::thread_priority::high)) | + ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus, &panel, + &pcomm](const size_t index, std::vector>& t_all) { + using Helpers = tfactor_l::Helpers; - // at this point each rank has its partial result for each column - // so, let's reduce the results (on all ranks, so that everyone can independently compute T factor) - if (true) // TODO if the column communicator has more than 1 tile...but I just have the pipeline - t_local = dlaf::comm::scheduleAllReduceInPlace(mpi_col_task_chain(), MPI_SUM, std::move(t_local)); + tile::internal::set0(index == 0 ? t : t_all[index - 1]); - // 2nd step: compute the T factor, by performing the last step on each column - // each column depends on the previous part (all reflectors that comes before) - // so it is performed sequentially - ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local))); + // 1st step + // compute the column partial result `t` (multi-threaded) + // First we compute the matrix vector multiplication for each column + // -tau(j) . V(j:, 0:j)* . 
V(j:, j) + const size_t chunk_size = util::ceilDiv(panel.size(), nthreads); + const size_t begin = index * chunk_size; + const size_t end = std::min(index * chunk_size + chunk_size, panel.size()); + + for (size_t i = begin; i < end; ++i) { + const matrix::Tile& tile_v = panel[i].get(); + + const SizeType first_row_tile = + std::max(0, dist.template globalTileFromLocalTile( + panelRowBegin + to_SizeType(i)) * + bsRows - + v_start); + + if (index == 0) + t = Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t)); + else + t_all[index - 1] = + Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t_all[index - 1])); + } + + barrier_ptr->arrive_and_wait(); + + // (single-threaded) + if (index == 0) { + // reduce + for (auto& partial_t : t_all) + tile::internal::add(T(1), partial_t, t); + + // at this point each rank has its partial result for each column + // so, let's reduce the results (on all ranks, so that everyone can + // independently compute T factor) + if (pcomm.ref().size() > 1) + comm::sync::allReduceInPlace(pcomm.ref(), MPI_SUM, common::make_data(t)); + + // 2nd step + // compute the T factor, by performing the last step on each column + // (single-threaded) each column depends on the previous part (all reflectors + // that comes before) so it is performed sequentially + t = Helpers::trmv_func(std::move(t)); + } + }); + })); } } From 3ca979e13606edc9dfd5c242daee5ebc20dd3cfe Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Fri, 17 Feb 2023 09:03:47 +0100 Subject: [PATCH 08/13] add back separate implementation for gpu and refactor variant separation --- include/dlaf/factorization/qr.h | 2 +- include/dlaf/factorization/qr/api.h | 17 ++-- include/dlaf/factorization/qr/t_factor_impl.h | 77 +++++++++++++++++-- 3 files changed, 81 insertions(+), 15 deletions(-) diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h index ac8f8db9ea..26fdd5d16e 100644 --- a/include/dlaf/factorization/qr.h +++ b/include/dlaf/factorization/qr.h @@ -96,7 +96,7 @@ void computeTFactor(matrix::Panel& hh_panel, pika::shared_future> taus, pika::future> t, common::Pipeline& mpi_col_task_chain) { - QR_TfactorDistributed::call(hh_panel, taus, std::move(t), mpi_col_task_chain); + QR_Tfactor::call(hh_panel, taus, std::move(t), mpi_col_task_chain); } } diff --git a/include/dlaf/factorization/qr/api.h b/include/dlaf/factorization/qr/api.h index 2c1f9a6dfe..79add6fbce 100644 --- a/include/dlaf/factorization/qr/api.h +++ b/include/dlaf/factorization/qr/api.h @@ -31,6 +31,10 @@ struct QR_Tfactor { static void call(matrix::Panel& panel_view, pika::shared_future> taus, pika::future> t); + static void call(matrix::Panel& hh_panel, + pika::shared_future> taus, + pika::future> t, + common::Pipeline& mpi_col_task_chain); }; #ifdef DLAF_WITH_GPU @@ -39,21 +43,16 @@ struct QR_Tfactor { static void call(matrix::Panel& panel_view, pika::shared_future> taus, pika::future> t); -}; -#endif - -template -struct QR_TfactorDistributed { - static void call(matrix::Panel& hh_panel, + static void call(matrix::Panel& hh_panel, pika::shared_future> taus, - pika::future> t, + pika::future> t, common::Pipeline& mpi_col_task_chain); }; +#endif /// ---- ETI #define DLAF_FACTORIZATION_QR_TFACTOR_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \ - KWORD template struct QR_Tfactor; \ - KWORD template struct QR_TfactorDistributed; + KWORD template struct QR_Tfactor; DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, float) DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, double) diff 
--git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index 7fbc0c2b6a..4c1dd4aefe 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -393,11 +393,14 @@ void QR_Tfactor::call(matrix::Panel -void QR_TfactorDistributed::call(matrix::Panel& hh_panel, - pika::shared_future> taus, - pika::future> t, - common::Pipeline& mpi_col_task_chain) { +template +void QR_Tfactor::call( + matrix::Panel& hh_panel, + pika::shared_future> taus, pika::future> t, + common::Pipeline& mpi_col_task_chain) { + constexpr auto B = Backend::MC; + constexpr auto D = Device::CPU; + namespace ex = pika::execution::experimental; // Fast return in case of no reflectors @@ -495,4 +498,68 @@ void QR_TfactorDistributed::call(matrix::Panel& hh_pa }); })); } + +#ifdef DLAF_WITH_GPU +template +void QR_Tfactor::call( + matrix::Panel& hh_panel, + pika::shared_future> taus, pika::future> t, + common::Pipeline& mpi_col_task_chain) { + constexpr auto B = Backend::GPU; + constexpr auto D = Device::GPU; + + namespace ex = pika::execution::experimental; + + using Helpers = tfactor_l::Helpers; + + // Fast return in case of no reflectors + if (hh_panel.getWidth() == 0) + return; + + const auto v_start = hh_panel.offsetElement(); + auto dist = hh_panel.parentDistribution(); + + ex::unique_any_sender> t_local = Helpers::set0(std::move(t)); + + // Note: + // T factor is an upper triangular square matrix, built column by column + // with taus values on the diagonal + // + // T(j,j) = tau(j) + // + // and in the upper triangular part the following formula applies + // + // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j) + // + // + // The result is achieved in two main steps: + // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j) + // 2) T(0:j, j) = T(0:j, 0:j) . t + + // 1st step: compute the column partial result `t` + // First we compute the matrix vector multiplication for each column + // -tau(j) . V(j:, 0:j)* . 
V(j:, j) + for (const auto& v_i_loc : hh_panel.iteratorLocal()) { + const SizeType v_i = dist.template globalTileFromLocalTile(v_i_loc.row()); + const SizeType first_row_tile = std::max(0, v_i * dist.blockSize().rows() - v_start); + + // TODO + // Note: + // Since we are writing always on the same t, the gemv are serialized + // A possible solution to this would be to have multiple places where to store partial + // results, and then locally reduce them just before the reduce over ranks + t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(v_i_loc), taus, std::move(t_local)); + } + + // at this point each rank has its partial result for each column + // so, let's reduce the results (on all ranks, so that everyone can independently compute T factor) + if (true) // TODO if the column communicator has more than 1 tile...but I just have the pipeline + t_local = dlaf::comm::scheduleAllReduceInPlace(mpi_col_task_chain(), MPI_SUM, std::move(t_local)); + + // 2nd step: compute the T factor, by performing the last step on each column + // each column depends on the previous part (all reflectors that comes before) + // so it is performed sequentially + ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local))); +} +#endif } From 26d72aac8663cf72ddf0aa36150e3883f51e4c98 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 18 Apr 2023 12:13:59 +0200 Subject: [PATCH 09/13] add tunable nthreads parameter --- .../dlaf/eigensolver/get_tfactor_nworkers.h | 32 +++++++++++++++++++ include/dlaf/factorization/qr/t_factor_impl.h | 28 +++++++++------- include/dlaf/tune.h | 10 ++++-- 3 files changed, 56 insertions(+), 14 deletions(-) create mode 100644 include/dlaf/eigensolver/get_tfactor_nworkers.h diff --git a/include/dlaf/eigensolver/get_tfactor_nworkers.h b/include/dlaf/eigensolver/get_tfactor_nworkers.h new file mode 100644 index 0000000000..ca61e9d0d2 --- /dev/null +++ b/include/dlaf/eigensolver/get_tfactor_nworkers.h @@ -0,0 +1,32 @@ +// +// Distributed Linear Algebra with Future (DLAF) +// +// Copyright (c) 2018-2023, ETH Zurich +// All rights reserved. +// +// Please, refer to the LICENSE file in the root directory. 
+// SPDX-License-Identifier: BSD-3-Clause +// +#pragma once + +#include +#include + +#include + +#include "dlaf/common/assert.h" +#include "dlaf/tune.h" + +namespace dlaf::factorization::internal { + +inline size_t getTFactorNWorkers() noexcept { + const size_t nworkers = getTuneParameters().tfactor_nworkers; + + // Note: precautionarily we leave at least 1 thread "free" to do other stuff + const size_t max_workers = pika::resource::get_thread_pool("default").get_os_thread_count() - 1; + + // 1 <= number of workers < max_workers + return std::max(1, std::min(max_workers, nworkers)); +} + +} diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index 4c1dd4aefe..a3ed3c6b90 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -10,6 +10,7 @@ #pragma once +#include #include #include @@ -34,6 +35,7 @@ #include "dlaf/common/vector.h" #include "dlaf/communication/kernels/all_reduce.h" #include "dlaf/communication/sync/all_reduce.h" +#include "dlaf/eigensolver/get_tfactor_nworkers.h" #include "dlaf/lapack/tile.h" #include "dlaf/matrix/matrix.h" #include "dlaf/matrix/views.h" @@ -281,7 +283,7 @@ void QR_Tfactor::call(matrix::Panelrow(); - const size_t nthreads = std::max(1, (pika::get_num_worker_threads() / 2)); + const std::size_t nthreads = getTFactorNWorkers(); ex::start_detached( ex::when_all(ex::just(std::make_shared>(nthreads)), ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) | @@ -294,7 +296,8 @@ void QR_Tfactor::call(matrix::Panel(pika::execution::thread_priority::high)) | ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus, - &panel](const size_t index, std::vector>& t_all) { + &panel](const std::size_t index, + std::vector>& t_all) { using Helpers = tfactor_l::Helpers; tile::internal::set0(index == 0 ? t : t_all[index - 1]); @@ -303,11 +306,11 @@ void QR_Tfactor::call(matrix::Panel& tile_v = panel[i].get(); const SizeType first_row_tile = @@ -432,7 +435,7 @@ void QR_Tfactor::call( const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows(); const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row(); - const size_t nthreads = std::max(1, (pika::get_num_worker_threads() / 2)); + const std::size_t nthreads = getTFactorNWorkers(); ex::start_detached( ex::when_all(ex::just(std::make_shared>(nthreads)), ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t), @@ -446,7 +449,8 @@ void QR_Tfactor::call( ex::transfer( dlaf::internal::getBackendScheduler(pika::execution::thread_priority::high)) | ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus, &panel, - &pcomm](const size_t index, std::vector>& t_all) { + &pcomm](const std::size_t index, + std::vector>& t_all) { using Helpers = tfactor_l::Helpers; tile::internal::set0(index == 0 ? t : t_all[index - 1]); @@ -455,11 +459,11 @@ void QR_Tfactor::call( // compute the column partial result `t` (multi-threaded) // First we compute the matrix vector multiplication for each column // -tau(j) . V(j:, 0:j)* . 
V(j:, j) - const size_t chunk_size = util::ceilDiv(panel.size(), nthreads); - const size_t begin = index * chunk_size; - const size_t end = std::min(index * chunk_size + chunk_size, panel.size()); + const std::size_t chunk_size = util::ceilDiv(panel.size(), nthreads); + const std::size_t begin = index * chunk_size; + const std::size_t end = std::min(index * chunk_size + chunk_size, panel.size()); - for (size_t i = begin; i < end; ++i) { + for (std::size_t i = begin; i < end; ++i) { const matrix::Tile& tile_v = panel[i].get(); const SizeType first_row_tile = diff --git a/include/dlaf/tune.h b/include/dlaf/tune.h index 072a8cf0e6..548e659d2d 100644 --- a/include/dlaf/tune.h +++ b/include/dlaf/tune.h @@ -9,6 +9,8 @@ // #pragma once +#include + #include #include @@ -17,6 +19,7 @@ namespace dlaf { /// /// Holds the value of the parameters that can be used to tune DLA-Future. /// - red2band_panel_nworkers: number of threads to use for computing the panel in the reduction to band algorithm. +/// - tfactor_nworkers: number of threads to use for computing the T factor /// - eigensolver_min_band: The minimun value to start looking for a divisor of the block size. /// Set with --dlaf:eigensolver-min-band or env variable DLAF_EIGENSOLVER_MIN_BAND. /// - band_to_tridiag_1d_block_size_base: @@ -29,8 +32,11 @@ namespace dlaf { /// DLAF_BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE. /// Note to developers: Users can change these values, therefore consistency has to be ensured by algorithms. struct TuneParameters { - size_t red2band_panel_nworkers = - std::max(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2); + std::size_t red2band_panel_nworkers = + std::max(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2); + + std::size_t tfactor_nworkers = + std::max(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2); SizeType eigensolver_min_band = 100; SizeType band_to_tridiag_1d_block_size_base = 8192; From 33b204d62168b7215e2131781f859cf97e128a3f Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 18 Apr 2023 12:45:05 +0200 Subject: [PATCH 10/13] minor doc fix --- include/dlaf/factorization/qr.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h index 26fdd5d16e..f351217732 100644 --- a/include/dlaf/factorization/qr.h +++ b/include/dlaf/factorization/qr.h @@ -46,7 +46,6 @@ namespace dlaf::factorization::internal { /// @param taus array of taus, associated with the related elementary reflector /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size /// TileElementSize(k, k) -/// @param mpi_col_task_chain where internal communications are issued /// /// @pre hh_pane.getWidth() <= t.get().size().rows && hh_panel.size().getWidth() <= t.get().size().cols() template From dbbf122d074451017bea5dfe77e64987ce18528e Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Wed, 19 Apr 2023 08:46:43 +0200 Subject: [PATCH 11/13] fix todo about trmv on cpu --- include/dlaf/factorization/qr/t_factor_impl.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index a3ed3c6b90..a3cf0b95b8 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -121,7 +121,7 @@ struct Helpers { // Update each column (in order) t = T . 
t // remember that T is upper triangular, so it is possible to use TRMV - static auto trmv_func(matrix::Tile&& tile_t) { + static void trmv_func(matrix::Tile& tile_t) { for (SizeType j = 0; j < tile_t.size().cols(); ++j) { const TileElementIndex t_start{0, j}; const TileElementSize t_size{j, 1}; @@ -129,8 +129,6 @@ struct Helpers { blas::trmv(blas::Layout::ColMajor, blas::Uplo::Upper, blas::Op::NoTrans, blas::Diag::NonUnit, t_size.rows(), tile_t.ptr(), tile_t.ld(), tile_t.ptr(t_start), 1); } - // TODO: Why return if the tile is unused? - return std::move(tile_t); } template @@ -335,7 +333,7 @@ void QR_Tfactor::call(matrix::Panel::call( // compute the T factor, by performing the last step on each column // (single-threaded) each column depends on the previous part (all reflectors // that comes before) so it is performed sequentially - t = Helpers::trmv_func(std::move(t)); + Helpers::trmv_func(t); } }); })); From a0aa9c899bf0c18e93af2a83f240de00ad094cad Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Wed, 19 Apr 2023 09:30:17 +0200 Subject: [PATCH 12/13] minor fix --- include/dlaf/factorization/qr/t_factor_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index a3cf0b95b8..1dbe267a49 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -285,7 +285,7 @@ void QR_Tfactor::call(matrix::Panel>(nthreads)), ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) | - ex::let_value([=](auto& barrier_ptr, auto&& panel, const common::internal::vector& taus, + ex::let_value([=](auto& barrier_ptr, auto& panel, const common::internal::vector& taus, matrix::Tile& t) { matrix::Matrix t_all({t.size().rows() * to_SizeType(nthreads - 1), t.size().cols()}, t.size()); From 916dc32e32d0400c08e97b378d65694ff159c9b6 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Wed, 19 Apr 2023 18:07:35 +0200 Subject: [PATCH 13/13] WIP: workaround for deadlock? if it works, eventually use to investigate --- include/dlaf/eigensolver/eigensolver/impl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/dlaf/eigensolver/eigensolver/impl.h b/include/dlaf/eigensolver/eigensolver/impl.h index 91afb7da92..cbb03c6616 100644 --- a/include/dlaf/eigensolver/eigensolver/impl.h +++ b/include/dlaf/eigensolver/eigensolver/impl.h @@ -65,6 +65,7 @@ void Eigensolver::call(comm::CommunicatorGrid grid, blas::Uplo uplo, Ma eigensolver::tridiagSolver(grid, ret.tridiagonal, evals, mat_e); backTransformationBandToTridiag(grid, band_size, mat_e, ret.hh_reflectors); + pika::threads::get_thread_manager().wait(); backTransformationReductionToBand(grid, band_size, mat_e, mat_a, taus); } }
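---

Note on the T-factor recurrence used throughout the series: the two-step update documented in computeTFactor (step 1: t = -tau(j) . V(j:, 0:j)* . V(j:, j); step 2: T(0:j, j) = T(0:j, 0:j) . t) can be written out directly. Below is a minimal scalar sketch — real-valued, plain loops in place of blas::gemv/blas::trmv, and not DLA-Future code; form_t_factor and its layout assumptions (V is m-by-k column-major with the reflectors' unit diagonal stored explicitly, T is k-by-k column-major) are illustrative only.

#include <cstddef>
#include <vector>

// Forms the upper-triangular T (k x k) such that H = I - V . T . V*,
// following the same column-by-column recurrence as the library code.
void form_t_factor(std::size_t m, std::size_t k, const std::vector<double>& V,
                   const std::vector<double>& tau, std::vector<double>& T) {
  for (std::size_t j = 0; j < k; ++j) {
    // Step 1 (gemv in the patch): t = -tau(j) . V(j:, 0:j)^T . V(j:, j)
    for (std::size_t c = 0; c < j; ++c) {
      double acc = 0.0;
      for (std::size_t r = j; r < m; ++r)
        acc += V[r + c * m] * V[r + j * m];
      T[c + j * k] = -tau[j] * acc;
    }
    // Step 2 (trmv in the patch): T(0:j, j) = T(0:j, 0:j) . t
    // Safe in place with ascending r: row r only reads entries c >= r of
    // column j, none of which have been overwritten yet.
    for (std::size_t r = 0; r < j; ++r) {
      double acc = 0.0;
      for (std::size_t c = r; c < j; ++c)
        acc += T[r + c * k] * T[c + j * k];
      T[r + j * k] = acc;
    }
    T[j + j * k] = tau[j];  // taus sit on the diagonal (the NonUnit trmv relies on this)
  }
}

The in-place property of step 2 is the same one that lets the library call TRMV directly on the tile column.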
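The fork-join structure shared by the local (patch 02) and distributed (patch 07) variants is easier to see stripped of tiles, senders and MPI. Below is a minimal standalone sketch with std::barrier and std::jthread (C++20); all names are illustrative and the plain accumulation stands in for the per-tile GEMVs. Each of nthreads workers owns a private partial result, a barrier separates the parallel phase from the sequential tail, and worker 0 performs the reduction before the column-by-column TRMV pass.

#include <algorithm>
#include <barrier>
#include <cstddef>
#include <thread>
#include <vector>

int main() {
  const std::size_t nthreads =
      std::max<std::size_t>(1, std::thread::hardware_concurrency() / 2);
  const std::size_t nwork = 64;    // stands in for the panel tiles
  std::vector<double> work(nwork, 1.0);

  std::vector<double> t(16, 0.0);  // stands in for the T tile (worker 0 owns it)
  std::vector<std::vector<double>> t_all(nthreads > 1 ? nthreads - 1 : 0,
                                         std::vector<double>(t.size(), 0.0));

  std::barrier barrier(static_cast<std::ptrdiff_t>(nthreads));

  auto worker = [&](std::size_t index) {
    auto& partial = (index == 0) ? t : t_all[index - 1];

    // 1st step (parallel): each worker accumulates its chunk of the panel
    // into a private partial result, like Helpers::gemv_func per tile.
    const std::size_t chunk = (nwork + nthreads - 1) / nthreads;  // ceilDiv
    const std::size_t begin = index * chunk;
    const std::size_t end = std::min(begin + chunk, nwork);
    for (std::size_t i = begin; i < end; ++i)
      for (std::size_t j = 0; j < partial.size(); ++j)
        partial[j] += work[i];

    barrier.arrive_and_wait();

    // 2nd step (sequential): worker 0 reduces the partials into t -- in the
    // distributed variant the MPI all-reduce goes here -- and would then run
    // the column-by-column trmv pass.
    if (index == 0)
      for (const auto& p : t_all)
        for (std::size_t j = 0; j < t.size(); ++j)
          t[j] += p[j];
  };

  std::vector<std::jthread> threads;
  for (std::size_t i = 0; i < nthreads; ++i)
    threads.emplace_back(worker, i);
  return 0;  // jthread destructors join the workers
}

This mirrors the trade-off the series makes: per-worker partial results avoid serializing the GEMVs on a single t tile, at the cost of nthreads - 1 extra tile-sized buffers and one local reduction.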