
add back separate implementation for gpu and refactor variant separation
albestro committed Apr 18, 2023
1 parent bbdb50f commit 3ca979e
Showing 3 changed files with 81 additions and 15 deletions.
2 changes: 1 addition & 1 deletion include/dlaf/factorization/qr.h
@@ -96,7 +96,7 @@ void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
pika::shared_future<common::internal::vector<T>> taus,
pika::future<matrix::Tile<T, device>> t,
common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
QR_TfactorDistributed<backend, device, T>::call(hh_panel, taus, std::move(t), mpi_col_task_chain);
QR_Tfactor<backend, device, T>::call(hh_panel, taus, std::move(t), mpi_col_task_chain);
}

}
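For reference, the wrapper above now routes both variants through a single QR_Tfactor struct, with the distributed variant selected purely by the extra Pipeline argument. Below is a minimal, self-contained sketch of this overload-based variant separation (not part of this commit; Panel, Tile and CommPipeline are simplified stand-ins, not DLA-Future's real classes):

#include <iostream>
#include <utility>

struct Panel {};         // stands in for matrix::Panel<Coord::Col, T, device>
struct Tile {};          // stands in for matrix::Tile<T, device>
struct CommPipeline {};  // stands in for common::Pipeline<comm::Communicator>

template <class T>
struct QR_Tfactor {
  // Local variant: no communicator involved.
  static void call(Panel&, Tile&&) { std::cout << "local T factor\n"; }
  // Distributed variant: selected by the extra pipeline argument.
  static void call(Panel&, Tile&&, CommPipeline&) { std::cout << "distributed T factor\n"; }
};

// Free-function front end, mirroring computeTFactor: the overload set of
// QR_Tfactor<T>::call performs the variant separation.
template <class T>
void computeTFactor(Panel& hh_panel, Tile&& t) {
  QR_Tfactor<T>::call(hh_panel, std::move(t));
}

template <class T>
void computeTFactor(Panel& hh_panel, Tile&& t, CommPipeline& mpi_col_task_chain) {
  QR_Tfactor<T>::call(hh_panel, std::move(t), mpi_col_task_chain);
}

int main() {
  Panel p;
  CommPipeline chain;
  computeTFactor<double>(p, Tile{});         // picks the local overload
  computeTFactor<double>(p, Tile{}, chain);  // picks the distributed overload
}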
17 changes: 8 additions & 9 deletions include/dlaf/factorization/qr/api.h
@@ -31,6 +31,10 @@ struct QR_Tfactor<Backend::MC, Device::CPU, T> {
static void call(matrix::Panel<Coord::Col, T, Device::CPU>& panel_view,
pika::shared_future<common::internal::vector<T>> taus,
pika::future<matrix::Tile<T, Device::CPU>> t);
static void call(matrix::Panel<Coord::Col, T, Device::CPU>& hh_panel,
pika::shared_future<common::internal::vector<T>> taus,
pika::future<matrix::Tile<T, Device::CPU>> t,
common::Pipeline<comm::Communicator>& mpi_col_task_chain);
};

#ifdef DLAF_WITH_GPU
@@ -39,21 +43,16 @@ struct QR_Tfactor<Backend::GPU, Device::GPU, T> {
static void call(matrix::Panel<Coord::Col, T, Device::GPU>& panel_view,
pika::shared_future<common::internal::vector<T>> taus,
pika::future<matrix::Tile<T, Device::GPU>> t);
};
#endif

template <Backend backend, Device device, class T>
struct QR_TfactorDistributed {
static void call(matrix::Panel<Coord::Col, T, device>& hh_panel,
static void call(matrix::Panel<Coord::Col, T, Device::GPU>& hh_panel,
pika::shared_future<common::internal::vector<T>> taus,
pika::future<matrix::Tile<T, device>> t,
pika::future<matrix::Tile<T, Device::GPU>> t,
common::Pipeline<comm::Communicator>& mpi_col_task_chain);
};
#endif

/// ---- ETI
#define DLAF_FACTORIZATION_QR_TFACTOR_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \
KWORD template struct QR_Tfactor<BACKEND, DEVICE, DATATYPE>; \
KWORD template struct QR_TfactorDistributed<BACKEND, DEVICE, DATATYPE>;
KWORD template struct QR_Tfactor<BACKEND, DEVICE, DATATYPE>;

DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, float)
DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, double)
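The DLAF_FACTORIZATION_QR_TFACTOR_ETI macro above is the usual explicit-template-instantiation (ETI) pattern: invoked with KWORD = extern in the header it declares that the instantiation exists in another translation unit, and invoked once without extern in a source file it provides that instantiation. A stripped-down sketch of the same idea (Foo and FOO_ETI are hypothetical names, not DLA-Future code):

// foo.h
template <class T>
struct Foo {
  static void call(T value);
};

#define FOO_ETI(KWORD, DATATYPE) KWORD template struct Foo<DATATYPE>;

FOO_ETI(extern, float)   // only declares: "instantiated elsewhere"
FOO_ETI(extern, double)

// foo.cpp
// #include "foo.h"
// template <class T>
// void Foo<T>::call(T /*value*/) { /* implementation */ }
// FOO_ETI(, float)    // the one real instantiation of Foo<float>
// FOO_ETI(, double)

This keeps the template definition out of every including translation unit and concentrates the compilation cost and emitted symbols in one object file.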
77 changes: 72 additions & 5 deletions include/dlaf/factorization/qr/t_factor_impl.h
@@ -393,11 +393,14 @@ void QR_Tfactor<Backend::GPU, Device::GPU, T>::call(matrix::Panel<Coord::Col, T,
}
#endif

template <Backend B, Device D, class T>
void QR_TfactorDistributed<B, D, T>::call(matrix::Panel<Coord::Col, T, D>& hh_panel,
pika::shared_future<common::internal::vector<T>> taus,
pika::future<matrix::Tile<T, D>> t,
common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
template <class T>
void QR_Tfactor<Backend::MC, Device::CPU, T>::call(
matrix::Panel<Coord::Col, T, Device::CPU>& hh_panel,
pika::shared_future<common::internal::vector<T>> taus, pika::future<matrix::Tile<T, Device::CPU>> t,
common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
constexpr auto B = Backend::MC;
constexpr auto D = Device::CPU;

namespace ex = pika::execution::experimental;

// Fast return in case of no reflectors
@@ -495,4 +498,68 @@ void QR_TfactorDistributed<B, D, T>::call(matrix::Panel<Coord::Col, T, D>& hh_pa
});
}));
}

#ifdef DLAF_WITH_GPU
template <class T>
void QR_Tfactor<Backend::GPU, Device::GPU, T>::call(
matrix::Panel<Coord::Col, T, Device::GPU>& hh_panel,
pika::shared_future<common::internal::vector<T>> taus, pika::future<matrix::Tile<T, Device::GPU>> t,
common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
constexpr auto B = Backend::GPU;
constexpr auto D = Device::GPU;

namespace ex = pika::execution::experimental;

using Helpers = tfactor_l::Helpers<B, D, T>;

// Fast return in case of no reflectors
if (hh_panel.getWidth() == 0)
return;

const auto v_start = hh_panel.offsetElement();
auto dist = hh_panel.parentDistribution();

ex::unique_any_sender<matrix::Tile<T, D>> t_local = Helpers::set0(std::move(t));

// Note:
// T factor is an upper triangular square matrix, built column by column
// with taus values on the diagonal
//
// T(j,j) = tau(j)
//
// and in the upper triangular part the following formula applies
//
// T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j)
//
//
// The result is achieved in two main steps:
// 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j)
// 2) T(0:j, j) = T(0:j, 0:j) . t

// 1st step: compute the column partial result `t`
// First we compute the matrix vector multiplication for each column
// -tau(j) . V(j:, 0:j)* . V(j:, j)
for (const auto& v_i_loc : hh_panel.iteratorLocal()) {
const SizeType v_i = dist.template globalTileFromLocalTile<Coord::Row>(v_i_loc.row());
const SizeType first_row_tile = std::max<SizeType>(0, v_i * dist.blockSize().rows() - v_start);

// TODO
// Note:
// Since we are always writing on the same t, the gemvs are serialized
// A possible solution to this would be to have multiple places in which to store partial
// results, and then locally reduce them just before the reduce over ranks
t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(v_i_loc), taus, std::move(t_local));
}

// at this point each rank has its partial result for each column
// so, let's reduce the results (on all ranks, so that everyone can independently compute T factor)
if (true) // TODO if the column communicator has more than 1 tile...but I just have the pipeline
t_local = dlaf::comm::scheduleAllReduceInPlace(mpi_col_task_chain(), MPI_SUM, std::move(t_local));

// 2nd step: compute the T factor, by performing the last step on each column
// each column depends on the previous part (all reflectors that come before)
// so it is performed sequentially
ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local)));
}
#endif
}
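The comments in the new GPU path spell out the same two-step recurrence used throughout: for each column j, first t(0:j) = -tau(j) · V(:, 0:j)* · V(:, j) (one gemv per reflector tile), then T(0:j, j) = T(0:j, 0:j) · t (a trmv against the already-built leading block), with T(j, j) = tau(j). Below is a minimal single-rank reference sketch in plain loops, assuming real values and dense storage with the j-th Householder vector in column j of V (zeros above the diagonal, 1 on it); the names t_factor, V and tau are illustrative only, not DLA-Future's API. In the distributed code above, the step-1 partial results are additionally accumulated tile by tile and all-reduced over the column communicator (scheduleAllReduceInPlace) before the step-2 updates.

#include <cstddef>
#include <vector>

using Matrix = std::vector<std::vector<double>>;  // row-indexed: A[r][c]

Matrix t_factor(const Matrix& V, const std::vector<double>& tau) {
  const std::size_t m = V.size();    // rows of the reflector panel
  const std::size_t k = tau.size();  // number of reflectors
  Matrix T(k, std::vector<double>(k, 0.0));
  std::vector<double> t(k, 0.0);     // per-column partial result of step 1

  for (std::size_t j = 0; j < k; ++j) {
    // 1st step (gemv): t(0:j) = -tau(j) * V(:, 0:j)^T * V(:, j)
    for (std::size_t i = 0; i < j; ++i) {
      double dot = 0.0;
      for (std::size_t r = 0; r < m; ++r)
        dot += V[r][i] * V[r][j];
      t[i] = -tau[j] * dot;
    }
    // 2nd step (trmv): T(0:j, j) = T(0:j, 0:j) * t, performed column by column
    // because column j reads the columns of T built before it
    for (std::size_t i = 0; i < j; ++i) {
      double acc = 0.0;
      for (std::size_t l = i; l < j; ++l)
        acc += T[i][l] * t[l];
      T[i][j] = acc;
    }
    T[j][j] = tau[j];  // taus sit on the diagonal of T
  }
  return T;
}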
