From 0d944b5b9c0ea55b24a41c94ab757f881d869aea Mon Sep 17 00:00:00 2001
From: Alberto Invernizzi
Date: Fri, 17 Feb 2023 09:03:47 +0100
Subject: [PATCH] add back separate implementation for gpu and refactor variant separation

---
 include/dlaf/factorization/qr.h               |  2 +-
 include/dlaf/factorization/qr/api.h           | 17 ++--
 include/dlaf/factorization/qr/t_factor_impl.h | 77 +++++++++++++++++--
 3 files changed, 81 insertions(+), 15 deletions(-)

diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h
index ac8f8db9ea..26fdd5d16e 100644
--- a/include/dlaf/factorization/qr.h
+++ b/include/dlaf/factorization/qr.h
@@ -96,7 +96,7 @@ void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
                     pika::shared_future<common::internal::vector<T>> taus,
                     pika::future<matrix::Tile<T, device>> t,
                     common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
-  QR_TfactorDistributed<backend, device, T>::call(hh_panel, taus, std::move(t), mpi_col_task_chain);
+  QR_Tfactor<backend, device, T>::call(hh_panel, taus, std::move(t), mpi_col_task_chain);
 }
 
 }
diff --git a/include/dlaf/factorization/qr/api.h b/include/dlaf/factorization/qr/api.h
index 2c1f9a6dfe..79add6fbce 100644
--- a/include/dlaf/factorization/qr/api.h
+++ b/include/dlaf/factorization/qr/api.h
@@ -31,6 +31,10 @@ struct QR_Tfactor {
   static void call(matrix::Panel<Coord::Col, T, device>& panel_view,
                    pika::shared_future<common::internal::vector<T>> taus,
                    pika::future<matrix::Tile<T, device>> t);
+  static void call(matrix::Panel<Coord::Col, T, device>& hh_panel,
+                   pika::shared_future<common::internal::vector<T>> taus,
+                   pika::future<matrix::Tile<T, device>> t,
+                   common::Pipeline<comm::Communicator>& mpi_col_task_chain);
 };
 
 #ifdef DLAF_WITH_GPU
@@ -39,21 +43,16 @@ struct QR_Tfactor<Backend::GPU, Device::GPU, T> {
   static void call(matrix::Panel<Coord::Col, T, Device::GPU>& panel_view,
                    pika::shared_future<common::internal::vector<T>> taus,
                    pika::future<matrix::Tile<T, Device::GPU>> t);
-};
-#endif
-
-template <Backend backend, Device device, class T>
-struct QR_TfactorDistributed {
-  static void call(matrix::Panel<Coord::Col, T, device>& hh_panel,
+  static void call(matrix::Panel<Coord::Col, T, Device::GPU>& hh_panel,
                    pika::shared_future<common::internal::vector<T>> taus,
-                   pika::future<matrix::Tile<T, device>> t,
+                   pika::future<matrix::Tile<T, Device::GPU>> t,
                    common::Pipeline<comm::Communicator>& mpi_col_task_chain);
 };
+#endif
 
 /// ---- ETI
 #define DLAF_FACTORIZATION_QR_TFACTOR_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \
-  KWORD template struct QR_Tfactor<BACKEND, DEVICE, DATATYPE>; \
-  KWORD template struct QR_TfactorDistributed<BACKEND, DEVICE, DATATYPE>;
+  KWORD template struct QR_Tfactor<BACKEND, DEVICE, DATATYPE>;
 
 DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, float)
 DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, double)
diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h
index 7fbc0c2b6a..4c1dd4aefe 100644
--- a/include/dlaf/factorization/qr/t_factor_impl.h
+++ b/include/dlaf/factorization/qr/t_factor_impl.h
@@ -393,11 +393,14 @@ void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col,
-template <Backend backend, Device device, class T>
-void QR_TfactorDistributed<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>& hh_panel,
-                                                     pika::shared_future<common::internal::vector<T>> taus,
-                                                     pika::future<matrix::Tile<T, device>> t,
-                                                     common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
+template <class T>
+void QR_Tfactor<Backend::MC, Device::CPU, T>::call(
+    matrix::Panel<Coord::Col, T, Device::CPU>& hh_panel,
+    pika::shared_future<common::internal::vector<T>> taus, pika::future<matrix::Tile<T, Device::CPU>> t,
+    common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
+  constexpr auto B = Backend::MC;
+  constexpr auto D = Device::CPU;
+
   namespace ex = pika::execution::experimental;
 
   // Fast return in case of no reflectors
@@ -495,4 +498,68 @@ void QR_TfactorDistributed<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>& hh_pa
         });
       }));
 }
+
+#ifdef DLAF_WITH_GPU
+template <class T>
+void QR_Tfactor<Backend::GPU, Device::GPU, T>::call(
+    matrix::Panel<Coord::Col, T, Device::GPU>& hh_panel,
+    pika::shared_future<common::internal::vector<T>> taus, pika::future<matrix::Tile<T, Device::GPU>> t,
+    common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
+  constexpr auto B = Backend::GPU;
+  constexpr auto D = Device::GPU;
+
+  namespace ex = pika::execution::experimental;
+
+  using Helpers = tfactor_l::Helpers<B, D, T>;
+
+  // Fast return in case of no reflectors
+  if (hh_panel.getWidth() == 0)
+    return;
+
+  const auto v_start = hh_panel.offsetElement();
+  auto dist = hh_panel.parentDistribution();
+
+  ex::unique_any_sender<matrix::Tile<T, D>> t_local = Helpers::set0(std::move(t));
+
+  // Note:
+  // T factor is an upper triangular square matrix, built column by column
+  // with taus values on the diagonal
+  //
+  // T(j,j) = tau(j)
+  //
+  // and in the upper triangular part the following formula applies
+  //
+  // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j)
+  //
+  //
+  // The result is achieved in two main steps:
+  // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j)
+  // 2) T(0:j, j) = T(0:j, 0:j) . t
+
+  // 1st step: compute the column partial result `t`
+  // First we compute the matrix vector multiplication for each column
+  // -tau(j) . V(j:, 0:j)* . V(j:, j)
+  for (const auto& v_i_loc : hh_panel.iteratorLocal()) {
+    const SizeType v_i = dist.template globalTileFromLocalTile<Coord::Row>(v_i_loc.row());
+    const SizeType first_row_tile = std::max<SizeType>(0, v_i * dist.blockSize().rows() - v_start);
+
+    // TODO
+    // Note:
+    // Since we are writing always on the same t, the gemv are serialized
+    // A possible solution to this would be to have multiple places where to store partial
+    // results, and then locally reduce them just before the reduce over ranks
+    t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(v_i_loc), taus, std::move(t_local));
+  }
+
+  // at this point each rank has its partial result for each column
+  // so, let's reduce the results (on all ranks, so that everyone can independently compute T factor)
+  if (true)  // TODO if the column communicator has more than 1 tile...but I just have the pipeline
+    t_local = dlaf::comm::scheduleAllReduceInPlace(mpi_col_task_chain(), MPI_SUM, std::move(t_local));
+
+  // 2nd step: compute the T factor, by performing the last step on each column
+  // each column depends on the previous part (all reflectors that comes before)
+  // so it is performed sequentially
+  ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local)));
+}
+#endif
 }
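Note: the comment block in the implementations above describes the two-step T factor construction (1: gemv partial column, 2: trmv update). For reference only, the following standalone sketch shows the same column update on plain dense arrays. It is not part of the patch and not DLA-Future code; it assumes real-valued reflectors stored column-major with the unit diagonal element explicitly set to 1, and the name formTFactor is purely illustrative.

#include <cstddef>
#include <vector>

// V: m x k reflectors (column-major, V(j,j) = 1 stored), tau: k scaling factors,
// T: k x k output (column-major, zero-initialized); only the upper triangle is written.
void formTFactor(std::size_t m, std::size_t k, const std::vector<double>& V,
                 const std::vector<double>& tau, std::vector<double>& T) {
  auto v = [&](std::size_t i, std::size_t j) { return V[i + j * m]; };
  auto t = [&](std::size_t i, std::size_t j) -> double& { return T[i + j * k]; };

  for (std::size_t j = 0; j < k; ++j) {
    t(j, j) = tau[j];  // T(j,j) = tau(j)

    // 1st step (gemv): T(0:j, j) = -tau(j) . V(j:, 0:j)^T . V(j:, j)
    for (std::size_t c = 0; c < j; ++c) {
      double acc = 0.0;
      for (std::size_t r = j; r < m; ++r)
        acc += v(r, c) * v(r, j);
      t(c, j) = -tau[j] * acc;
    }

    // 2nd step (trmv, in place): T(0:j, j) = T(0:j, 0:j) . T(0:j, j)
    // ascending rows are safe because row r only reads entries at rows >= r of the column
    for (std::size_t r = 0; r < j; ++r) {
      double acc = 0.0;
      for (std::size_t c = r; c < j; ++c)
        acc += t(r, c) * t(c, j);
      t(r, j) = acc;
    }
  }
}

In the distributed path of the patch, the 1st step is what each rank accumulates locally (Helpers::gemvColumnT), the all-reduce sums those partial columns over the column communicator, and the 2nd step (Helpers::trmvUpdateColumn) then finalizes T identically on every rank.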