From 200a1bd1c30f2fbf3c4a090c853729a93d9dd121 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Wed, 8 Feb 2023 16:04:14 +0100 Subject: [PATCH 01/13] start factoring out computing functions --- include/dlaf/factorization/qr/t_factor_impl.h | 104 +++++++++--------- 1 file changed, 54 insertions(+), 50 deletions(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index eb1e9bdfcf..c366b720e2 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -59,74 +59,78 @@ struct Helpers { std::forward(t)); } + static auto gemv_func(const SizeType first_row_tile, const matrix::Tile& tile_v, + const common::internal::vector& taus, + matrix::Tile tile_t) noexcept { + const SizeType k = tile_t.size().cols(); + DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k); + DLAF_ASSERT(taus.size() == k, taus.size(), k); + + for (SizeType j = 0; j < k; ++j) { + const T tau = taus[j]; + + const TileElementIndex t_start{0, j}; + + // Position of the 1 in the diagonal in the current column. + SizeType i_diag = j - first_row_tile; + const SizeType first_element_in_col = std::max(0, i_diag); + + // Break if the reflector starts in the next tile. + if (i_diag >= tile_v.size().rows()) + break; + + // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j) + // [j x 1] = [(n-j) x j]* . [(n-j) x 1] + TileElementIndex va_start{first_element_in_col, 0}; + TileElementIndex vb_start{first_element_in_col, j}; + TileElementSize va_size{tile_v.size().rows() - first_element_in_col, j}; + + if (i_diag >= 0) { + tile_t({j, j}) = tau; + } + + blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau, + tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start), 1); + } + return std::move(tile_t); + }; + template static auto gemvColumnT(SizeType first_row_tile, pika::shared_future> tile_vi, pika::shared_future>& taus, TSender&& tile_t) { namespace ex = pika::execution::experimental; - auto gemv_func = [first_row_tile](const auto& tile_v, const auto& taus, auto&& tile_t) noexcept { - const SizeType k = tile_t.size().cols(); - DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k); - DLAF_ASSERT(taus.size() == k, taus.size(), k); - - for (SizeType j = 0; j < k; ++j) { - const T tau = taus[j]; - - const TileElementIndex t_start{0, j}; - - // Position of the 1 in the diagonal in the current column. - SizeType i_diag = j - first_row_tile; - const SizeType first_element_in_col = std::max(0, i_diag); - - // Break if the reflector starts in the next tile. - if (i_diag >= tile_v.size().rows()) - break; - - // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j) - // [j x 1] = [(n-j) x j]* . 
[(n-j) x 1] - TileElementIndex va_start{first_element_in_col, 0}; - TileElementIndex vb_start{first_element_in_col, j}; - TileElementSize va_size{tile_v.size().rows() - first_element_in_col, j}; - - if (i_diag >= 0) { - tile_t({j, j}) = tau; - } - - blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau, - tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start), - 1); - } - return std::move(tile_t); - }; return dlaf::internal::transform(dlaf::internal::Policy( pika::execution::thread_priority::high), - std::move(gemv_func), - ex::when_all(dlaf::internal::keepFuture(tile_vi), + gemv_func, + ex::when_all(ex::just(first_row_tile), + dlaf::internal::keepFuture(tile_vi), dlaf::internal::keepFuture(taus), std::forward(tile_t))); } + // Update each column (in order) t = T . t + // remember that T is upper triangular, so it is possible to use TRMV + static auto trmv_func(matrix::Tile&& tile_t) { + for (SizeType j = 0; j < tile_t.size().cols(); ++j) { + const TileElementIndex t_start{0, j}; + const TileElementSize t_size{j, 1}; + + blas::trmv(blas::Layout::ColMajor, blas::Uplo::Upper, blas::Op::NoTrans, blas::Diag::NonUnit, + t_size.rows(), tile_t.ptr(), tile_t.ld(), tile_t.ptr(t_start), 1); + } + // TODO: Why return if the tile is unused? + return std::move(tile_t); + } + template static auto trmvUpdateColumn(TSender&& tile_t) noexcept { namespace ex = pika::execution::experimental; - // Update each column (in order) t = T . t - // remember that T is upper triangular, so it is possible to use TRMV - auto trmv_func = [](matrix::Tile&& tile_t) { - for (SizeType j = 0; j < tile_t.size().cols(); ++j) { - const TileElementIndex t_start{0, j}; - const TileElementSize t_size{j, 1}; - - blas::trmv(blas::Layout::ColMajor, blas::Uplo::Upper, blas::Op::NoTrans, blas::Diag::NonUnit, - t_size.rows(), tile_t.ptr(), tile_t.ld(), tile_t.ptr(t_start), 1); - } - // TODO: Why return if the tile is unused? 
- return std::move(tile_t); - }; return dlaf::internal::transform(dlaf::internal::Policy( pika::execution::thread_priority::high), - std::move(trmv_func), std::forward(tile_t)); + trmv_func, std::forward(tile_t)); } }; From 002b29a61960f2a639d8c9661233a460c6b72e74 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Wed, 8 Feb 2023 16:04:35 +0100 Subject: [PATCH 02/13] basic bulkification --- include/dlaf/factorization/qr/t_factor_impl.h | 102 +++++++++++++----- 1 file changed, 75 insertions(+), 27 deletions(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index c366b720e2..d86f481dfe 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -10,9 +10,15 @@ #pragma once -#include +#include #include +#include +#include + +#include "dlaf/blas/tile_extensions.h" +#include "dlaf/matrix/tile.h" +#include "dlaf/util_matrix.h" #ifdef DLAF_WITH_GPU #include @@ -238,20 +244,19 @@ struct Helpers { #endif } -template -void QR_Tfactor::call(matrix::Panel& hh_panel, - pika::shared_future> taus, - pika::future> t) { +template +void QR_Tfactor::call(matrix::Panel& hh_panel, + pika::shared_future> taus, + pika::future> t) { namespace ex = pika::execution::experimental; - using Helpers = tfactor_l::Helpers; // Fast return in case of no reflectors if (hh_panel.getWidth() == 0) return; - const auto v_start = hh_panel.offsetElement(); - - ex::unique_any_sender> t_local = Helpers::set0(std::move(t)); + std::vector()))> panel_tiles; + for (const auto idx : hh_panel.iteratorLocal()) + panel_tiles.push_back(hh_panel.read_sender(idx)); // Note: // T factor is an upper triangular square matrix, built column by column @@ -268,24 +273,67 @@ void QR_Tfactor::call(matrix::Panel& // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j) // 2) T(0:j, j) = T(0:j, 0:j) . t - // 1st step: compute the column partial result `t` - // First we compute the matrix vector multiplication for each column - // -tau(j) . V(j:, 0:j)* . 
V(j:, j) - for (const auto& v_i : hh_panel.iteratorLocal()) { - const SizeType first_row_tile = - std::max(0, v_i.row() * hh_panel.parentDistribution().blockSize().rows() - v_start); - - // Note: - // Since we are writing always on the same t, the gemv are serialized - // A possible solution to this would be to have multiple places where to store partial - // results, and then locally reduce them just before the reduce over ranks - t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(v_i), taus, std::move(t_local)); - } - - // 2nd step: compute the T factor, by performing the last step on each column - // each column depends on the previous part (all reflectors that comes before) - // so it is performed sequentially - ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local))); + const SizeType v_start = hh_panel.offsetElement(); + const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows(); + const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row(); + + const SizeType nthreads = std::max(1, (pika::get_num_worker_threads() / 2)); + ex::start_detached( + ex::when_all(ex::just(std::make_shared>(nthreads)), + ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) | + ex::let_value([nthreads, v_start, panelRowBegin, + bsRows](std::shared_ptr>& barrier_ptr, auto&& panel, + const common::internal::vector& taus, matrix::Tile& t) { + matrix::Matrix t_all({t.size().rows() * (nthreads - 1), t.size().cols()}, t.size()); + return ex::when_all_vector( + select(t_all, common::iterate_range2d(t_all.distribution().localNrTiles()))) | + ex::transfer( + dlaf::internal::getBackendScheduler(pika::execution::thread_priority::high)) | + ex::bulk(nthreads, [&barrier_ptr, &t, &taus, &panel, nthreads, v_start, panelRowBegin, + bsRows](const SizeType index, + std::vector>& t_all) { + using Helpers = tfactor_l::Helpers; + + tile::internal::set0(index == 0 ? t : t_all[index - 1]); + + // 1st step + // compute the column partial result `t` (multi-threaded) + // First we compute the matrix vector multiplication for each column + // -tau(j) . V(j:, 0:j)* . 
V(j:, j) + const SizeType chunk_size = util::ceilDiv(panel.size(), nthreads); + const SizeType from = to_SizeType(index) * chunk_size; + const SizeType to = std::min(index * chunk_size + chunk_size, SizeType(panel.size())); + + for (auto i = from; i < to; ++i) { + const matrix::Tile& tile_v = panel[i].get(); + + // std::max(0, v_i.row() * hh_panel.parentDistribution().blockSize().rows() - v_start); + const SizeType first_row_tile = + std::max(0, (panelRowBegin + i) * bsRows - v_start); + + if (index == 0) + t = Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t)); + else + t_all[index - 1] = + Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t_all[index - 1])); + } + + barrier_ptr->arrive_and_wait(); + + // (single-threaded) + if (index == 0) { + // reduce + for (auto& partial_t : t_all) + tile::internal::add(T(1), partial_t, t); + + // 2nd step + // compute the T factor, by performing the last step on each column (single-threaded) + // each column depends on the previous part (all reflectors that comes before) so it + // is performed sequentially + t = Helpers::trmv_func(std::move(t)); + } + }); + })); } template From 1ac6bc2187ed9132bd7c7642a1844cd2a88d5183 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 14 Feb 2023 10:33:50 +0100 Subject: [PATCH 03/13] fix warnings --- include/dlaf/factorization/qr/t_factor_impl.h | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index d86f481dfe..af245b502a 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -98,7 +98,7 @@ struct Helpers { blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau, tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start), 1); } - return std::move(tile_t); + return tile_t; }; template @@ -277,21 +277,20 @@ void QR_Tfactor::call(matrix::Panel& hh_panel, const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows(); const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row(); - const SizeType nthreads = std::max(1, (pika::get_num_worker_threads() / 2)); + const size_t nthreads = std::max(1, (pika::get_num_worker_threads() / 2)); ex::start_detached( ex::when_all(ex::just(std::make_shared>(nthreads)), ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) | - ex::let_value([nthreads, v_start, panelRowBegin, - bsRows](std::shared_ptr>& barrier_ptr, auto&& panel, - const common::internal::vector& taus, matrix::Tile& t) { - matrix::Matrix t_all({t.size().rows() * (nthreads - 1), t.size().cols()}, t.size()); + ex::let_value([=](std::shared_ptr>& barrier_ptr, auto&& panel, + const common::internal::vector& taus, matrix::Tile& t) { + matrix::Matrix t_all({t.size().rows() * to_SizeType(nthreads - 1), t.size().cols()}, + t.size()); return ex::when_all_vector( select(t_all, common::iterate_range2d(t_all.distribution().localNrTiles()))) | ex::transfer( dlaf::internal::getBackendScheduler(pika::execution::thread_priority::high)) | - ex::bulk(nthreads, [&barrier_ptr, &t, &taus, &panel, nthreads, v_start, panelRowBegin, - bsRows](const SizeType index, - std::vector>& t_all) { + ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus, + &panel](const size_t index, std::vector>& t_all) { using Helpers = tfactor_l::Helpers; tile::internal::set0(index == 0 ? 
t : t_all[index - 1]); @@ -300,16 +299,15 @@ void QR_Tfactor::call(matrix::Panel& hh_panel, // compute the column partial result `t` (multi-threaded) // First we compute the matrix vector multiplication for each column // -tau(j) . V(j:, 0:j)* . V(j:, j) - const SizeType chunk_size = util::ceilDiv(panel.size(), nthreads); - const SizeType from = to_SizeType(index) * chunk_size; - const SizeType to = std::min(index * chunk_size + chunk_size, SizeType(panel.size())); + const size_t chunk_size = util::ceilDiv(panel.size(), nthreads); + const size_t begin = index * chunk_size; + const size_t end = std::min(index * chunk_size + chunk_size, panel.size()); - for (auto i = from; i < to; ++i) { + for (size_t i = begin; i < end; ++i) { const matrix::Tile& tile_v = panel[i].get(); - // std::max(0, v_i.row() * hh_panel.parentDistribution().blockSize().rows() - v_start); const SizeType first_row_tile = - std::max(0, (panelRowBegin + i) * bsRows - v_start); + std::max(0, (panelRowBegin + to_SizeType(i)) * bsRows - v_start); if (index == 0) t = Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t)); @@ -327,9 +325,9 @@ void QR_Tfactor::call(matrix::Panel& hh_panel, tile::internal::add(T(1), partial_t, t); // 2nd step - // compute the T factor, by performing the last step on each column (single-threaded) - // each column depends on the previous part (all reflectors that comes before) so it - // is performed sequentially + // compute the T factor, by performing the last step on each column + // (single-threaded) each column depends on the previous part (all reflectors + // that comes before) so it is performed sequentially t = Helpers::trmv_func(std::move(t)); } }); From b8cbd6248e8d6f8fc1308c09e99c0fd37765b092 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 14 Feb 2023 12:07:12 +0100 Subject: [PATCH 04/13] split algorithms variant for different backends --- include/dlaf/factorization/qr.h | 2 +- include/dlaf/factorization/qr/api.h | 73 +++++------------- include/dlaf/factorization/qr/t_factor_impl.h | 75 ++++++++++++++++--- 3 files changed, 87 insertions(+), 63 deletions(-) diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h index 189987cc6f..6b4216dbf0 100644 --- a/include/dlaf/factorization/qr.h +++ b/include/dlaf/factorization/qr.h @@ -46,7 +46,7 @@ void computeTFactor(matrix::Panel& hh_panel, pika::shared_future> taus, pika::future> t, common::Pipeline& mpi_col_task_chain) { - QR_Tfactor::call(hh_panel, taus, std::move(t), mpi_col_task_chain); + QR_TfactorDistributed::call(hh_panel, taus, std::move(t), mpi_col_task_chain); } } diff --git a/include/dlaf/factorization/qr/api.h b/include/dlaf/factorization/qr/api.h index e803219597..2c1f9a6dfe 100644 --- a/include/dlaf/factorization/qr/api.h +++ b/include/dlaf/factorization/qr/api.h @@ -24,60 +24,26 @@ template struct QR {}; template -struct QR_Tfactor { - /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k - /// elementary reflectors. - /// - /// It is similar to what xLARFT in LAPACK does. - /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start, - /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H - /// block of reflector, such that - /// - /// H = I - V . T . V* - /// - /// where H = H1 . H2 . ... . 
Hk - /// - /// in which Hi represents a single elementary reflector transformation - /// - /// @param k the number of elementary reflectors to use (from the beginning of the tile) - /// @param v where the elementary reflectors are stored - /// @param v_start tile in @p v where the column of reflectors starts - /// @param taus array of taus, associated with the related elementary reflector - /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size - /// TileElementSize(k, k) - /// - /// @pre k <= t.get().size().rows && k <= t.get().size().cols() - /// @pre k >= 0 - /// @pre v_start.isIn(v.nrTiles()) - static void call(matrix::Panel& panel_view, +struct QR_Tfactor; + +template +struct QR_Tfactor { + static void call(matrix::Panel& panel_view, + pika::shared_future> taus, + pika::future> t); +}; + +#ifdef DLAF_WITH_GPU +template +struct QR_Tfactor { + static void call(matrix::Panel& panel_view, pika::shared_future> taus, - pika::future> t); + pika::future> t); +}; +#endif - /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k - /// elementary reflectors. - /// - /// It is similar to what xLARFT in LAPACK does. - /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start, - /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H - /// block of reflector, such that - /// - /// H = I - V . T . V* - /// - /// where H = H1 . H2 . ... . Hk - /// - /// in which Hi represents a single elementary reflector transformation - /// - /// @param k the number of elementary reflectors to use (from the beginning of the tile) - /// @param v where the elementary reflectors are stored - /// @param v_start tile in @p v where the column of reflectors starts - /// @param taus array of taus, associated with the related elementary reflector - /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size - /// TileElementSize(k, k) - /// @param mpi_col_task_chain where internal communications are issued - /// - /// @pre k <= t.get().size().rows && k <= t.get().size().cols() - /// @pre k >= 0 - /// @pre v_start.isIn(v.nrTiles()) +template +struct QR_TfactorDistributed { static void call(matrix::Panel& hh_panel, pika::shared_future> taus, pika::future> t, @@ -86,7 +52,8 @@ struct QR_Tfactor { /// ---- ETI #define DLAF_FACTORIZATION_QR_TFACTOR_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \ - KWORD template struct QR_Tfactor; + KWORD template struct QR_Tfactor; \ + KWORD template struct QR_TfactorDistributed; DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, float) DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, double) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index af245b502a..330c80a20a 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -244,10 +244,13 @@ struct Helpers { #endif } -template -void QR_Tfactor::call(matrix::Panel& hh_panel, - pika::shared_future> taus, - pika::future> t) { +template +void QR_Tfactor::call(matrix::Panel& hh_panel, + pika::shared_future> taus, + pika::future> t) { + constexpr auto B = Backend::MC; + constexpr auto D = Device::CPU; + namespace ex = pika::execution::experimental; // Fast return in case of no reflectors @@ -334,11 +337,66 @@ void QR_Tfactor::call(matrix::Panel& hh_panel, })); } +#ifdef DLAF_WITH_GPU +template +void 
QR_Tfactor::call(matrix::Panel& hh_panel, + pika::shared_future> taus, + pika::future> t) { + constexpr auto B = Backend::GPU; + constexpr auto D = Device::GPU; + + namespace ex = pika::execution::experimental; + + using Helpers = tfactor_l::Helpers; + // Fast return in case of no reflectors + if (hh_panel.getWidth() == 0) + return; + + const auto v_start = hh_panel.offsetElement(); + + ex::unique_any_sender> t_local = Helpers::set0(std::move(t)); + + // Note: + // T factor is an upper triangular square matrix, built column by column + // with taus values on the diagonal + // + // T(j,j) = tau(j) + // + // and in the upper triangular part the following formula applies + // + // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j) + // + // + // The result is achieved in two main steps: + // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j) + // 2) T(0:j, j) = T(0:j, 0:j) . t + + // 1st step: compute the column partial result `t` + // First we compute the matrix vector multiplication for each column + // -tau(j) . V(j:, 0:j)* . V(j:, j) + for (const auto& v_i : hh_panel.iteratorLocal()) { + const SizeType first_row_tile = + std::max(0, v_i.row() * hh_panel.parentDistribution().blockSize().rows() - v_start); + + // Note: + // Since we are writing always on the same t, the gemv are serialized + // A possible solution to this would be to have multiple places where to store partial + // results, and then locally reduce them just before the reduce over ranks + t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(v_i), taus, std::move(t_local)); + } + + // 2nd step: compute the T factor, by performing the last step on each column + // each column depends on the previous part (all reflectors that comes before) + // so it is performed sequentially + ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local))); +} +#endif + template -void QR_Tfactor::call(matrix::Panel& hh_panel, - pika::shared_future> taus, - pika::future> t, - common::Pipeline& mpi_col_task_chain) { +void QR_TfactorDistributed::call( + matrix::Panel& hh_panel, + pika::shared_future> taus, pika::future> t, + common::Pipeline& mpi_col_task_chain) { namespace ex = pika::execution::experimental; using Helpers = tfactor_l::Helpers; @@ -392,5 +450,4 @@ void QR_Tfactor::call(matrix::Panel& // so it is performed sequentially ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local))); } - } From 247ab27b486ab2aa1438ddb98ef345b51dd536ab Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 14 Feb 2023 14:34:14 +0100 Subject: [PATCH 05/13] first try fixing doc --- include/dlaf/factorization/qr.h | 56 +++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h index 6b4216dbf0..ac8f8db9ea 100644 --- a/include/dlaf/factorization/qr.h +++ b/include/dlaf/factorization/qr.h @@ -27,13 +27,28 @@ namespace dlaf::factorization::internal { /// H0 H1 H2 ... HK-1 /// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1, /// it has to be set correctly in the panel (0s as well). - +/// +/// It is similar to what xLARFT in LAPACK does. +/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values +/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that +/// +/// H = I - V . T . V* +/// +/// where H = H1 . H2 . ... . Hk +/// +/// in which Hi represents a single elementary reflector transformation. 
+/// /// A Storage-Efficient WY Representation for Products of Householder Transformations. /// Schreiber, Robert & VanLoan, Charles. (1989) /// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005. /// -/// @pre taus contains a vector with k elements -/// @pre t contains a (k x k) tile +/// @param hh_panel where the elementary reflectors are stored +/// @param taus array of taus, associated with the related elementary reflector +/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size +/// TileElementSize(k, k) +/// @param mpi_col_task_chain where internal communications are issued +/// +/// @pre hh_pane.getWidth() <= t.get().size().rows && hh_panel.size().getWidth() <= t.get().size().cols() template void computeTFactor(matrix::Panel& hh_panel, pika::shared_future> taus, @@ -41,6 +56,41 @@ void computeTFactor(matrix::Panel& hh_panel, QR_Tfactor::call(hh_panel, taus, std::move(t)); } +/// Forms the triangular factor T of a block of reflectors H, which is defined as a product of +/// k := hh_panel.getWidth() elementary reflectors. +/// +/// hh_panel should have the following form +/// H0 0 0 ... 0 +/// . H1 0 ... 0 +/// . . H2 ... 0 +/// . . . ... 0 +/// . . . ... HK-1 +/// . . . ... . +/// H0 H1 H2 ... HK-1 +/// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1, +/// it has to be set correctly in the panel (0s as well). +/// +/// It is similar to what xLARFT in LAPACK does. +/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values +/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that +/// +/// H = I - V . T . V* +/// +/// where H = H1 . H2 . ... . Hk +/// +/// in which Hi represents a single elementary reflector transformation. +/// +/// A Storage-Efficient WY Representation for Products of Householder Transformations. +/// Schreiber, Robert & VanLoan, Charles. (1989) +/// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005. 
+/// +/// @param hh_panel where the elementary reflectors are stored +/// @param taus array of taus, associated with the related elementary reflector +/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size +/// TileElementSize(k, k) +/// @param mpi_col_task_chain where internal communications are issued +/// +/// @pre hh_pane.getWidth() <= t.get().size().rows && hh_panel.size().getWidth() <= t.get().size().cols() template void computeTFactor(matrix::Panel& hh_panel, pika::shared_future> taus, From d0e636dfaa2760a3489b3abdc838bd5f8dfe4a8d Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 14 Feb 2023 14:39:13 +0100 Subject: [PATCH 06/13] minor change --- include/dlaf/factorization/qr/t_factor_impl.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index 330c80a20a..5e5b00da36 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -392,14 +392,14 @@ void QR_Tfactor::call(matrix::Panel -void QR_TfactorDistributed::call( - matrix::Panel& hh_panel, - pika::shared_future> taus, pika::future> t, - common::Pipeline& mpi_col_task_chain) { +template +void QR_TfactorDistributed::call(matrix::Panel& hh_panel, + pika::shared_future> taus, + pika::future> t, + common::Pipeline& mpi_col_task_chain) { namespace ex = pika::execution::experimental; - using Helpers = tfactor_l::Helpers; + using Helpers = tfactor_l::Helpers; // Fast return in case of no reflectors if (hh_panel.getWidth() == 0) @@ -408,7 +408,7 @@ void QR_TfactorDistributed::call( const auto v_start = hh_panel.offsetElement(); auto dist = hh_panel.parentDistribution(); - ex::unique_any_sender> t_local = Helpers::set0(std::move(t)); + ex::unique_any_sender> t_local = Helpers::set0(std::move(t)); // Note: // T factor is an upper triangular square matrix, built column by column From bbdb50fed3ba45f8c90cc9bbd0a45bcf875046b0 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Fri, 17 Feb 2023 08:23:45 +0100 Subject: [PATCH 07/13] tfactor-bulk-dist --- include/dlaf/factorization/qr/t_factor_impl.h | 103 +++++++++++++----- 1 file changed, 74 insertions(+), 29 deletions(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index 5e5b00da36..7fbc0c2b6a 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -33,6 +33,7 @@ #include "dlaf/common/range2d.h" #include "dlaf/common/vector.h" #include "dlaf/communication/kernels/all_reduce.h" +#include "dlaf/communication/sync/all_reduce.h" #include "dlaf/lapack/tile.h" #include "dlaf/matrix/matrix.h" #include "dlaf/matrix/views.h" @@ -284,8 +285,8 @@ void QR_Tfactor::call(matrix::Panel>(nthreads)), ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) | - ex::let_value([=](std::shared_ptr>& barrier_ptr, auto&& panel, - const common::internal::vector& taus, matrix::Tile& t) { + ex::let_value([=](auto& barrier_ptr, auto&& panel, const common::internal::vector& taus, + matrix::Tile& t) { matrix::Matrix t_all({t.size().rows() * to_SizeType(nthreads - 1), t.size().cols()}, t.size()); return ex::when_all_vector( @@ -399,16 +400,13 @@ void QR_TfactorDistributed::call(matrix::Panel& hh_pa common::Pipeline& mpi_col_task_chain) { namespace ex = pika::execution::experimental; - using Helpers = tfactor_l::Helpers; - // Fast return in case of no 
reflectors if (hh_panel.getWidth() == 0) return; - const auto v_start = hh_panel.offsetElement(); - auto dist = hh_panel.parentDistribution(); - - ex::unique_any_sender> t_local = Helpers::set0(std::move(t)); + std::vector()))> panel_tiles; + for (const auto idx : hh_panel.iteratorLocal()) + panel_tiles.push_back(hh_panel.read_sender(idx)); // Note: // T factor is an upper triangular square matrix, built column by column @@ -425,29 +423,76 @@ void QR_TfactorDistributed::call(matrix::Panel& hh_pa // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j) // 2) T(0:j, j) = T(0:j, 0:j) . t - // 1st step: compute the column partial result `t` - // First we compute the matrix vector multiplication for each column - // -tau(j) . V(j:, 0:j)* . V(j:, j) - for (const auto& v_i_loc : hh_panel.iteratorLocal()) { - const SizeType v_i = dist.template globalTileFromLocalTile(v_i_loc.row()); - const SizeType first_row_tile = std::max(0, v_i * dist.blockSize().rows() - v_start); + const auto dist = hh_panel.parentDistribution(); - // TODO - // Note: - // Since we are writing always on the same t, the gemv are serialized - // A possible solution to this would be to have multiple places where to store partial - // results, and then locally reduce them just before the reduce over ranks - t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(v_i_loc), taus, std::move(t_local)); - } + const SizeType v_start = hh_panel.offsetElement(); + const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows(); + const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row(); + + const size_t nthreads = std::max(1, (pika::get_num_worker_threads() / 2)); + ex::start_detached( + ex::when_all(ex::just(std::make_shared>(nthreads)), + ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t), + mpi_col_task_chain()) | + ex::let_value([=](auto& barrier_ptr, auto&& panel, const common::internal::vector& taus, + matrix::Tile& t, auto&& pcomm) { + matrix::Matrix t_all({t.size().rows() * to_SizeType(nthreads - 1), t.size().cols()}, + t.size()); + return ex::when_all_vector( + select(t_all, common::iterate_range2d(t_all.distribution().localNrTiles()))) | + ex::transfer( + dlaf::internal::getBackendScheduler(pika::execution::thread_priority::high)) | + ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus, &panel, + &pcomm](const size_t index, std::vector>& t_all) { + using Helpers = tfactor_l::Helpers; - // at this point each rank has its partial result for each column - // so, let's reduce the results (on all ranks, so that everyone can independently compute T factor) - if (true) // TODO if the column communicator has more than 1 tile...but I just have the pipeline - t_local = dlaf::comm::scheduleAllReduceInPlace(mpi_col_task_chain(), MPI_SUM, std::move(t_local)); + tile::internal::set0(index == 0 ? t : t_all[index - 1]); - // 2nd step: compute the T factor, by performing the last step on each column - // each column depends on the previous part (all reflectors that comes before) - // so it is performed sequentially - ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local))); + // 1st step + // compute the column partial result `t` (multi-threaded) + // First we compute the matrix vector multiplication for each column + // -tau(j) . V(j:, 0:j)* . 
V(j:, j) + const size_t chunk_size = util::ceilDiv(panel.size(), nthreads); + const size_t begin = index * chunk_size; + const size_t end = std::min(index * chunk_size + chunk_size, panel.size()); + + for (size_t i = begin; i < end; ++i) { + const matrix::Tile& tile_v = panel[i].get(); + + const SizeType first_row_tile = + std::max(0, dist.template globalTileFromLocalTile( + panelRowBegin + to_SizeType(i)) * + bsRows - + v_start); + + if (index == 0) + t = Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t)); + else + t_all[index - 1] = + Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t_all[index - 1])); + } + + barrier_ptr->arrive_and_wait(); + + // (single-threaded) + if (index == 0) { + // reduce + for (auto& partial_t : t_all) + tile::internal::add(T(1), partial_t, t); + + // at this point each rank has its partial result for each column + // so, let's reduce the results (on all ranks, so that everyone can + // independently compute T factor) + if (pcomm.ref().size() > 1) + comm::sync::allReduceInPlace(pcomm.ref(), MPI_SUM, common::make_data(t)); + + // 2nd step + // compute the T factor, by performing the last step on each column + // (single-threaded) each column depends on the previous part (all reflectors + // that comes before) so it is performed sequentially + t = Helpers::trmv_func(std::move(t)); + } + }); + })); } } From 3ca979e13606edc9dfd5c242daee5ebc20dd3cfe Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Fri, 17 Feb 2023 09:03:47 +0100 Subject: [PATCH 08/13] add back separate implementation for gpu and refactor variant separation --- include/dlaf/factorization/qr.h | 2 +- include/dlaf/factorization/qr/api.h | 17 ++-- include/dlaf/factorization/qr/t_factor_impl.h | 77 +++++++++++++++++-- 3 files changed, 81 insertions(+), 15 deletions(-) diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h index ac8f8db9ea..26fdd5d16e 100644 --- a/include/dlaf/factorization/qr.h +++ b/include/dlaf/factorization/qr.h @@ -96,7 +96,7 @@ void computeTFactor(matrix::Panel& hh_panel, pika::shared_future> taus, pika::future> t, common::Pipeline& mpi_col_task_chain) { - QR_TfactorDistributed::call(hh_panel, taus, std::move(t), mpi_col_task_chain); + QR_Tfactor::call(hh_panel, taus, std::move(t), mpi_col_task_chain); } } diff --git a/include/dlaf/factorization/qr/api.h b/include/dlaf/factorization/qr/api.h index 2c1f9a6dfe..79add6fbce 100644 --- a/include/dlaf/factorization/qr/api.h +++ b/include/dlaf/factorization/qr/api.h @@ -31,6 +31,10 @@ struct QR_Tfactor { static void call(matrix::Panel& panel_view, pika::shared_future> taus, pika::future> t); + static void call(matrix::Panel& hh_panel, + pika::shared_future> taus, + pika::future> t, + common::Pipeline& mpi_col_task_chain); }; #ifdef DLAF_WITH_GPU @@ -39,21 +43,16 @@ struct QR_Tfactor { static void call(matrix::Panel& panel_view, pika::shared_future> taus, pika::future> t); -}; -#endif - -template -struct QR_TfactorDistributed { - static void call(matrix::Panel& hh_panel, + static void call(matrix::Panel& hh_panel, pika::shared_future> taus, - pika::future> t, + pika::future> t, common::Pipeline& mpi_col_task_chain); }; +#endif /// ---- ETI #define DLAF_FACTORIZATION_QR_TFACTOR_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \ - KWORD template struct QR_Tfactor; \ - KWORD template struct QR_TfactorDistributed; + KWORD template struct QR_Tfactor; DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, float) DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, double) diff 
--git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index 7fbc0c2b6a..4c1dd4aefe 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -393,11 +393,14 @@ void QR_Tfactor::call(matrix::Panel -void QR_TfactorDistributed::call(matrix::Panel& hh_panel, - pika::shared_future> taus, - pika::future> t, - common::Pipeline& mpi_col_task_chain) { +template +void QR_Tfactor::call( + matrix::Panel& hh_panel, + pika::shared_future> taus, pika::future> t, + common::Pipeline& mpi_col_task_chain) { + constexpr auto B = Backend::MC; + constexpr auto D = Device::CPU; + namespace ex = pika::execution::experimental; // Fast return in case of no reflectors @@ -495,4 +498,68 @@ void QR_TfactorDistributed::call(matrix::Panel& hh_pa }); })); } + +#ifdef DLAF_WITH_GPU +template +void QR_Tfactor::call( + matrix::Panel& hh_panel, + pika::shared_future> taus, pika::future> t, + common::Pipeline& mpi_col_task_chain) { + constexpr auto B = Backend::GPU; + constexpr auto D = Device::GPU; + + namespace ex = pika::execution::experimental; + + using Helpers = tfactor_l::Helpers; + + // Fast return in case of no reflectors + if (hh_panel.getWidth() == 0) + return; + + const auto v_start = hh_panel.offsetElement(); + auto dist = hh_panel.parentDistribution(); + + ex::unique_any_sender> t_local = Helpers::set0(std::move(t)); + + // Note: + // T factor is an upper triangular square matrix, built column by column + // with taus values on the diagonal + // + // T(j,j) = tau(j) + // + // and in the upper triangular part the following formula applies + // + // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j) + // + // + // The result is achieved in two main steps: + // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j) + // 2) T(0:j, j) = T(0:j, 0:j) . t + + // 1st step: compute the column partial result `t` + // First we compute the matrix vector multiplication for each column + // -tau(j) . V(j:, 0:j)* . 
V(j:, j) + for (const auto& v_i_loc : hh_panel.iteratorLocal()) { + const SizeType v_i = dist.template globalTileFromLocalTile(v_i_loc.row()); + const SizeType first_row_tile = std::max(0, v_i * dist.blockSize().rows() - v_start); + + // TODO + // Note: + // Since we are writing always on the same t, the gemv are serialized + // A possible solution to this would be to have multiple places where to store partial + // results, and then locally reduce them just before the reduce over ranks + t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(v_i_loc), taus, std::move(t_local)); + } + + // at this point each rank has its partial result for each column + // so, let's reduce the results (on all ranks, so that everyone can independently compute T factor) + if (true) // TODO if the column communicator has more than 1 tile...but I just have the pipeline + t_local = dlaf::comm::scheduleAllReduceInPlace(mpi_col_task_chain(), MPI_SUM, std::move(t_local)); + + // 2nd step: compute the T factor, by performing the last step on each column + // each column depends on the previous part (all reflectors that comes before) + // so it is performed sequentially + ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local))); +} +#endif } From 26d72aac8663cf72ddf0aa36150e3883f51e4c98 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 18 Apr 2023 12:13:59 +0200 Subject: [PATCH 09/13] add tunable nthreads parameter --- .../dlaf/eigensolver/get_tfactor_nworkers.h | 32 +++++++++++++++++++ include/dlaf/factorization/qr/t_factor_impl.h | 28 +++++++++------- include/dlaf/tune.h | 10 ++++-- 3 files changed, 56 insertions(+), 14 deletions(-) create mode 100644 include/dlaf/eigensolver/get_tfactor_nworkers.h diff --git a/include/dlaf/eigensolver/get_tfactor_nworkers.h b/include/dlaf/eigensolver/get_tfactor_nworkers.h new file mode 100644 index 0000000000..ca61e9d0d2 --- /dev/null +++ b/include/dlaf/eigensolver/get_tfactor_nworkers.h @@ -0,0 +1,32 @@ +// +// Distributed Linear Algebra with Future (DLAF) +// +// Copyright (c) 2018-2023, ETH Zurich +// All rights reserved. +// +// Please, refer to the LICENSE file in the root directory. 
+// SPDX-License-Identifier: BSD-3-Clause +// +#pragma once + +#include +#include + +#include + +#include "dlaf/common/assert.h" +#include "dlaf/tune.h" + +namespace dlaf::factorization::internal { + +inline size_t getTFactorNWorkers() noexcept { + const size_t nworkers = getTuneParameters().tfactor_nworkers; + + // Note: precautionarily we leave at least 1 thread "free" to do other stuff + const size_t max_workers = pika::resource::get_thread_pool("default").get_os_thread_count() - 1; + + // 1 <= number of workers < max_workers + return std::max(1, std::min(max_workers, nworkers)); +} + +} diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index 4c1dd4aefe..a3ed3c6b90 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -10,6 +10,7 @@ #pragma once +#include #include #include @@ -34,6 +35,7 @@ #include "dlaf/common/vector.h" #include "dlaf/communication/kernels/all_reduce.h" #include "dlaf/communication/sync/all_reduce.h" +#include "dlaf/eigensolver/get_tfactor_nworkers.h" #include "dlaf/lapack/tile.h" #include "dlaf/matrix/matrix.h" #include "dlaf/matrix/views.h" @@ -281,7 +283,7 @@ void QR_Tfactor::call(matrix::Panelrow(); - const size_t nthreads = std::max(1, (pika::get_num_worker_threads() / 2)); + const std::size_t nthreads = getTFactorNWorkers(); ex::start_detached( ex::when_all(ex::just(std::make_shared>(nthreads)), ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) | @@ -294,7 +296,8 @@ void QR_Tfactor::call(matrix::Panel(pika::execution::thread_priority::high)) | ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus, - &panel](const size_t index, std::vector>& t_all) { + &panel](const std::size_t index, + std::vector>& t_all) { using Helpers = tfactor_l::Helpers; tile::internal::set0(index == 0 ? t : t_all[index - 1]); @@ -303,11 +306,11 @@ void QR_Tfactor::call(matrix::Panel& tile_v = panel[i].get(); const SizeType first_row_tile = @@ -432,7 +435,7 @@ void QR_Tfactor::call( const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows(); const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row(); - const size_t nthreads = std::max(1, (pika::get_num_worker_threads() / 2)); + const std::size_t nthreads = getTFactorNWorkers(); ex::start_detached( ex::when_all(ex::just(std::make_shared>(nthreads)), ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t), @@ -446,7 +449,8 @@ void QR_Tfactor::call( ex::transfer( dlaf::internal::getBackendScheduler(pika::execution::thread_priority::high)) | ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus, &panel, - &pcomm](const size_t index, std::vector>& t_all) { + &pcomm](const std::size_t index, + std::vector>& t_all) { using Helpers = tfactor_l::Helpers; tile::internal::set0(index == 0 ? t : t_all[index - 1]); @@ -455,11 +459,11 @@ void QR_Tfactor::call( // compute the column partial result `t` (multi-threaded) // First we compute the matrix vector multiplication for each column // -tau(j) . V(j:, 0:j)* . 
V(j:, j) - const size_t chunk_size = util::ceilDiv(panel.size(), nthreads); - const size_t begin = index * chunk_size; - const size_t end = std::min(index * chunk_size + chunk_size, panel.size()); + const std::size_t chunk_size = util::ceilDiv(panel.size(), nthreads); + const std::size_t begin = index * chunk_size; + const std::size_t end = std::min(index * chunk_size + chunk_size, panel.size()); - for (size_t i = begin; i < end; ++i) { + for (std::size_t i = begin; i < end; ++i) { const matrix::Tile& tile_v = panel[i].get(); const SizeType first_row_tile = diff --git a/include/dlaf/tune.h b/include/dlaf/tune.h index 072a8cf0e6..548e659d2d 100644 --- a/include/dlaf/tune.h +++ b/include/dlaf/tune.h @@ -9,6 +9,8 @@ // #pragma once +#include + #include #include @@ -17,6 +19,7 @@ namespace dlaf { /// /// Holds the value of the parameters that can be used to tune DLA-Future. /// - red2band_panel_nworkers: number of threads to use for computing the panel in the reduction to band algorithm. +/// - tfactor_nworkers: number of threads to use for computing the T factor /// - eigensolver_min_band: The minimun value to start looking for a divisor of the block size. /// Set with --dlaf:eigensolver-min-band or env variable DLAF_EIGENSOLVER_MIN_BAND. /// - band_to_tridiag_1d_block_size_base: @@ -29,8 +32,11 @@ namespace dlaf { /// DLAF_BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE. /// Note to developers: Users can change these values, therefore consistency has to be ensured by algorithms. struct TuneParameters { - size_t red2band_panel_nworkers = - std::max(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2); + std::size_t red2band_panel_nworkers = + std::max(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2); + + std::size_t tfactor_nworkers = + std::max(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2); SizeType eigensolver_min_band = 100; SizeType band_to_tridiag_1d_block_size_base = 8192; From 33b204d62168b7215e2131781f859cf97e128a3f Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Tue, 18 Apr 2023 12:45:05 +0200 Subject: [PATCH 10/13] minor doc fix --- include/dlaf/factorization/qr.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h index 26fdd5d16e..f351217732 100644 --- a/include/dlaf/factorization/qr.h +++ b/include/dlaf/factorization/qr.h @@ -46,7 +46,6 @@ namespace dlaf::factorization::internal { /// @param taus array of taus, associated with the related elementary reflector /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size /// TileElementSize(k, k) -/// @param mpi_col_task_chain where internal communications are issued /// /// @pre hh_pane.getWidth() <= t.get().size().rows && hh_panel.size().getWidth() <= t.get().size().cols() template From dbbf122d074451017bea5dfe77e64987ce18528e Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Wed, 19 Apr 2023 08:46:43 +0200 Subject: [PATCH 11/13] fix todo about trmv on cpu --- include/dlaf/factorization/qr/t_factor_impl.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index a3ed3c6b90..a3cf0b95b8 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -121,7 +121,7 @@ struct Helpers { // Update each column (in order) t = T . 
t // remember that T is upper triangular, so it is possible to use TRMV - static auto trmv_func(matrix::Tile&& tile_t) { + static void trmv_func(matrix::Tile& tile_t) { for (SizeType j = 0; j < tile_t.size().cols(); ++j) { const TileElementIndex t_start{0, j}; const TileElementSize t_size{j, 1}; @@ -129,8 +129,6 @@ struct Helpers { blas::trmv(blas::Layout::ColMajor, blas::Uplo::Upper, blas::Op::NoTrans, blas::Diag::NonUnit, t_size.rows(), tile_t.ptr(), tile_t.ld(), tile_t.ptr(t_start), 1); } - // TODO: Why return if the tile is unused? - return std::move(tile_t); } template @@ -335,7 +333,7 @@ void QR_Tfactor::call(matrix::Panel::call( // compute the T factor, by performing the last step on each column // (single-threaded) each column depends on the previous part (all reflectors // that comes before) so it is performed sequentially - t = Helpers::trmv_func(std::move(t)); + Helpers::trmv_func(t); } }); })); From a0aa9c899bf0c18e93af2a83f240de00ad094cad Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Wed, 19 Apr 2023 09:30:17 +0200 Subject: [PATCH 12/13] minor fix --- include/dlaf/factorization/qr/t_factor_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index a3cf0b95b8..1dbe267a49 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -285,7 +285,7 @@ void QR_Tfactor::call(matrix::Panel>(nthreads)), ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) | - ex::let_value([=](auto& barrier_ptr, auto&& panel, const common::internal::vector& taus, + ex::let_value([=](auto& barrier_ptr, auto& panel, const common::internal::vector& taus, matrix::Tile& t) { matrix::Matrix t_all({t.size().rows() * to_SizeType(nthreads - 1), t.size().cols()}, t.size()); From 916dc32e32d0400c08e97b378d65694ff159c9b6 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi Date: Wed, 19 Apr 2023 18:07:35 +0200 Subject: [PATCH 13/13] WIP: workaround for deadlock? if it works, eventually use to investigate --- include/dlaf/eigensolver/eigensolver/impl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/dlaf/eigensolver/eigensolver/impl.h b/include/dlaf/eigensolver/eigensolver/impl.h index 91afb7da92..cbb03c6616 100644 --- a/include/dlaf/eigensolver/eigensolver/impl.h +++ b/include/dlaf/eigensolver/eigensolver/impl.h @@ -65,6 +65,7 @@ void Eigensolver::call(comm::CommunicatorGrid grid, blas::Uplo uplo, Ma eigensolver::tridiagSolver(grid, ret.tridiagonal, evals, mat_e); backTransformationBandToTridiag(grid, band_size, mat_e, ret.hh_reflectors); + pika::threads::get_thread_manager().wait(); backTransformationReductionToBand(grid, band_size, mat_e, mat_a, taus); } }
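---

Note on the T-factor recurrence used throughout the series: the two-step update documented in computeTFactor (step 1: t = -tau(j) . V(j:, 0:j)* . V(j:, j); step 2: T(0:j, j) = T(0:j, 0:j) . t) can be written out directly. Below is a minimal scalar sketch — real-valued, plain loops in place of blas::gemv/blas::trmv, and not DLA-Future code; form_t_factor and its layout assumptions (V is m-by-k column-major with the reflectors' unit diagonal stored explicitly, T is k-by-k column-major) are illustrative only.

#include <cstddef>
#include <vector>

// Forms the upper-triangular T (k x k) such that H = I - V . T . V*,
// following the same column-by-column recurrence as the library code.
void form_t_factor(std::size_t m, std::size_t k, const std::vector<double>& V,
                   const std::vector<double>& tau, std::vector<double>& T) {
  for (std::size_t j = 0; j < k; ++j) {
    // Step 1 (gemv in the patch): t = -tau(j) . V(j:, 0:j)^T . V(j:, j)
    for (std::size_t c = 0; c < j; ++c) {
      double acc = 0.0;
      for (std::size_t r = j; r < m; ++r)
        acc += V[r + c * m] * V[r + j * m];
      T[c + j * k] = -tau[j] * acc;
    }
    // Step 2 (trmv in the patch): T(0:j, j) = T(0:j, 0:j) . t
    // Safe in place with ascending r: row r only reads entries c >= r of
    // column j, none of which have been overwritten yet.
    for (std::size_t r = 0; r < j; ++r) {
      double acc = 0.0;
      for (std::size_t c = r; c < j; ++c)
        acc += T[r + c * k] * T[c + j * k];
      T[r + j * k] = acc;
    }
    T[j + j * k] = tau[j];  // taus sit on the diagonal (the NonUnit trmv relies on this)
  }
}

The in-place property of step 2 is the same one that lets the library call TRMV directly on the tile column.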
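The fork-join structure shared by the local (patch 02) and distributed (patch 07) variants is easier to see stripped of tiles, senders and MPI. Below is a minimal standalone sketch with std::barrier and std::jthread (C++20); all names are illustrative and the plain accumulation stands in for the per-tile GEMVs. Each of nthreads workers owns a private partial result, a barrier separates the parallel phase from the sequential tail, and worker 0 performs the reduction before the column-by-column TRMV pass.

#include <algorithm>
#include <barrier>
#include <cstddef>
#include <thread>
#include <vector>

int main() {
  const std::size_t nthreads =
      std::max<std::size_t>(1, std::thread::hardware_concurrency() / 2);
  const std::size_t nwork = 64;    // stands in for the panel tiles
  std::vector<double> work(nwork, 1.0);

  std::vector<double> t(16, 0.0);  // stands in for the T tile (worker 0 owns it)
  std::vector<std::vector<double>> t_all(nthreads > 1 ? nthreads - 1 : 0,
                                         std::vector<double>(t.size(), 0.0));

  std::barrier barrier(static_cast<std::ptrdiff_t>(nthreads));

  auto worker = [&](std::size_t index) {
    auto& partial = (index == 0) ? t : t_all[index - 1];

    // 1st step (parallel): each worker accumulates its chunk of the panel
    // into a private partial result, like Helpers::gemv_func per tile.
    const std::size_t chunk = (nwork + nthreads - 1) / nthreads;  // ceilDiv
    const std::size_t begin = index * chunk;
    const std::size_t end = std::min(begin + chunk, nwork);
    for (std::size_t i = begin; i < end; ++i)
      for (std::size_t j = 0; j < partial.size(); ++j)
        partial[j] += work[i];

    barrier.arrive_and_wait();

    // 2nd step (sequential): worker 0 reduces the partials into t -- in the
    // distributed variant the MPI all-reduce goes here -- and would then run
    // the column-by-column trmv pass.
    if (index == 0)
      for (const auto& p : t_all)
        for (std::size_t j = 0; j < t.size(); ++j)
          t[j] += p[j];
  };

  std::vector<std::jthread> threads;
  for (std::size_t i = 0; i < nthreads; ++i)
    threads.emplace_back(worker, i);
  return 0;  // jthread destructors join the workers
}

This mirrors the trade-off the series makes: per-worker partial results avoid serializing the GEMVs on a single t tile, at the cost of nthreads - 1 extra tile-sized buffers and one local reduction.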