Skip to content

Commit

Permalink
computeTFactor parallelise computation with bulk (MC Local and Distri…
Browse files Browse the repository at this point in the history
…buted) (#798)
  • Loading branch information
rasolca committed Apr 19, 2023
1 parent 6a8a3d7 commit 535d6c8
Show file tree
Hide file tree
Showing 5 changed files with 396 additions and 122 deletions.
32 changes: 32 additions & 0 deletions include/dlaf/eigensolver/get_tfactor_nworkers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2023, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//
#pragma once

#include <algorithm>
#include <cmath>

#include <pika/runtime.hpp>

#include "dlaf/common/assert.h"
#include "dlaf/tune.h"

namespace dlaf::factorization::internal {

/// Returns the number of worker threads to use for the T factor computation.
///
/// The requested value is read from the `tfactor_nworkers` tune parameter and then limited so that
/// one OS thread of the pika "default" thread pool is left free for other work.
///
/// @return a value in the closed range [1, max(1, default pool OS threads - 1)]
inline std::size_t getTFactorNWorkers() noexcept {
  const std::size_t requested = getTuneParameters().tfactor_nworkers;

  // Note: as a precaution we leave at least 1 thread "free" to do other stuff.
  // The count is unsigned, so the subtraction is performed only when it cannot wrap around;
  // with a single-thread pool the upper bound stays at 1.
  const std::size_t pool_threads =
      pika::resource::get_thread_pool("default").get_os_thread_count();
  const std::size_t max_workers = pool_threads > 1 ? pool_threads - 1 : 1;

  // 1 <= number of workers <= max_workers
  return std::clamp<std::size_t>(requested, 1, max_workers);
}

}
55 changes: 52 additions & 3 deletions include/dlaf/factorization/qr.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,69 @@ namespace dlaf::factorization::internal {
/// H0 H1 H2 ... HK-1
/// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1,
/// it has to be set correctly in the panel (0s as well).

///
/// It is similar to what xLARFT in LAPACK does.
/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values
/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that
///
/// H = I - V . T . V*
///
/// where H = H1 . H2 . ... . Hk
///
/// in which Hi represents a single elementary reflector transformation.
///
/// A Storage-Efficient WY Representation for Products of Householder Transformations.
/// Schreiber, Robert & VanLoan, Charles. (1989)
/// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005.
///
/// @pre taus contains a vector with k elements
/// @pre t contains a (k x k) tile
/// @param hh_panel where the elementary reflectors are stored
/// @param taus array of taus, associated with the related elementary reflector
/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
/// TileElementSize(k, k)
///
/// @pre hh_panel.getWidth() <= t.get().size().rows() && hh_panel.getWidth() <= t.get().size().cols()
template <Backend backend, Device device, class T>
void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
                    pika::shared_future<common::internal::vector<T>> taus,
                    pika::future<matrix::Tile<T, device>> t) {
  // Thin entry point: the work is done by the QR_Tfactor implementation selected by
  // <backend, device, T>.
  // Both by-value arguments are handed over with std::move because neither is used again here;
  // for taus this avoids an extra copy of the shared_future.
  QR_Tfactor<backend, device, T>::call(hh_panel, std::move(taus), std::move(t));
}

/// Forms the triangular factor T of a block of reflectors H, which is defined as a product of
/// k := hh_panel.getWidth() elementary reflectors.
///
/// hh_panel should have the following form
/// H0 0 0 ... 0
/// . H1 0 ... 0
/// . . H2 ... 0
/// . . . ... 0
/// . . . ... HK-1
/// . . . ... .
/// H0 H1 H2 ... HK-1
/// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1,
/// it has to be set correctly in the panel (0s as well).
///
/// It is similar to what xLARFT in LAPACK does.
/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values
/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that
///
/// H = I - V . T . V*
///
/// where H = H1 . H2 . ... . Hk
///
/// in which Hi represents a single elementary reflector transformation.
///
/// A Storage-Efficient WY Representation for Products of Householder Transformations.
/// Schreiber, Robert & VanLoan, Charles. (1989)
/// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005.
///
/// @param hh_panel where the elementary reflectors are stored
/// @param taus array of taus, associated with the related elementary reflector
/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
/// TileElementSize(k, k)
/// @param mpi_col_task_chain where internal communications are issued
///
/// @pre hh_panel.getWidth() <= t.get().size().rows() && hh_panel.getWidth() <= t.get().size().cols()
template <Backend backend, Device device, class T>
void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
pika::shared_future<common::internal::vector<T>> taus,
Expand Down
74 changes: 20 additions & 54 deletions include/dlaf/factorization/qr/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,65 +24,31 @@ template <Backend backend, Device device, class T>
struct QR {};

template <Backend backend, Device device, class T>
struct QR_Tfactor {
/// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k
/// elementary reflectors.
///
/// It is similar to what xLARFT in LAPACK does.
/// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start,
/// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H
/// block of reflector, such that
///
/// H = I - V . T . V*
///
/// where H = H1 . H2 . ... . Hk
///
/// in which Hi represents a single elementary reflector transformation
///
/// @param k the number of elementary reflectors to use (from the beginning of the tile)
/// @param v where the elementary reflectors are stored
/// @param v_start tile in @p v where the column of reflectors starts
/// @param taus array of taus, associated with the related elementary reflector
/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
/// TileElementSize(k, k)
///
/// @pre k <= t.get().size().rows && k <= t.get().size().cols()
/// @pre k >= 0
/// @pre v_start.isIn(v.nrTiles())
static void call(matrix::Panel<Coord::Col, T, device>& panel_view,
struct QR_Tfactor;

template <class T>
struct QR_Tfactor<Backend::MC, Device::CPU, T> {
static void call(matrix::Panel<Coord::Col, T, Device::CPU>& panel_view,
pika::shared_future<common::internal::vector<T>> taus,
pika::future<matrix::Tile<T, Device::CPU>> t);
static void call(matrix::Panel<Coord::Col, T, Device::CPU>& hh_panel,
pika::shared_future<common::internal::vector<T>> taus,
pika::future<matrix::Tile<T, device>> t);
pika::future<matrix::Tile<T, Device::CPU>> t,
common::Pipeline<comm::Communicator>& mpi_col_task_chain);
};

/// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k
/// elementary reflectors.
///
/// It is similar to what xLARFT in LAPACK does.
/// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start,
/// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H
/// block of reflector, such that
///
/// H = I - V . T . V*
///
/// where H = H1 . H2 . ... . Hk
///
/// in which Hi represents a single elementary reflector transformation
///
/// @param k the number of elementary reflectors to use (from the beginning of the tile)
/// @param v where the elementary reflectors are stored
/// @param v_start tile in @p v where the column of reflectors starts
/// @param taus array of taus, associated with the related elementary reflector
/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
/// TileElementSize(k, k)
/// @param mpi_col_task_chain where internal communications are issued
///
/// @pre k <= t.get().size().rows && k <= t.get().size().cols()
/// @pre k >= 0
/// @pre v_start.isIn(v.nrTiles())
static void call(matrix::Panel<Coord::Col, T, device>& hh_panel,
#ifdef DLAF_WITH_GPU
template <class T>
struct QR_Tfactor<Backend::GPU, Device::GPU, T> {
static void call(matrix::Panel<Coord::Col, T, Device::GPU>& panel_view,
pika::shared_future<common::internal::vector<T>> taus,
pika::future<matrix::Tile<T, Device::GPU>> t);
static void call(matrix::Panel<Coord::Col, T, Device::GPU>& hh_panel,
pika::shared_future<common::internal::vector<T>> taus,
pika::future<matrix::Tile<T, device>> t,
pika::future<matrix::Tile<T, Device::GPU>> t,
common::Pipeline<comm::Communicator>& mpi_col_task_chain);
};
#endif

/// ---- ETI
#define DLAF_FACTORIZATION_QR_TFACTOR_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \
Expand Down
Loading

0 comments on commit 535d6c8

Please sign in to comment.