diff --git a/include/dlaf/eigensolver/get_tfactor_nworkers.h b/include/dlaf/eigensolver/get_tfactor_nworkers.h
new file mode 100644
index 0000000000..ca61e9d0d2
--- /dev/null
+++ b/include/dlaf/eigensolver/get_tfactor_nworkers.h
@@ -0,0 +1,32 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2023, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+
+#include <pika/runtime.hpp>
+
+#include "dlaf/common/assert.h"
+#include "dlaf/tune.h"
+
+namespace dlaf::factorization::internal {
+
+/// Number of workers to use for the T factor: the `tfactor_nworkers` tune parameter, capped so that
+/// 1 thread of the "default" pool is left free for other work, but always at least 1.
+inline std::size_t getTFactorNWorkers() noexcept {
+  const std::size_t nworkers = getTuneParameters().tfactor_nworkers;
+  const std::size_t nthreads = pika::resource::get_thread_pool("default").get_os_thread_count();
+  // Note: unsigned arithmetic, so 1 is subtracted only if it cannot wrap around (result is >= 1).
+  const std::size_t max_workers = nthreads > 1 ? nthreads - 1 : 1;
+  return std::max<std::size_t>(1, std::min<std::size_t>(max_workers, nworkers));
+}
+
+}
diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h
index 189987cc6f..f351217732 100644
--- a/include/dlaf/factorization/qr.h
+++ b/include/dlaf/factorization/qr.h
@@ -27,13 +27,27 @@ namespace dlaf::factorization::internal {
 /// H0 H1 H2 ... HK-1
 /// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1,
 ///       it has to be set correctly in the panel (0s as well).
-
+///
+/// It is similar to what xLARFT in LAPACK does.
+/// Given k elementary reflectors stored in the columns of @p hh_panel, together with the related tau
+/// values in @p taus, the triangular factor T of the block of reflectors H is formed in @p t, such that
+///
+/// H = I - V . T . V*
+///
+/// where H = H1 . H2 . ... . Hk
+///
+/// in which Hi represents a single elementary reflector transformation.
+///
 /// A Storage-Efficient WY Representation for Products of Householder Transformations.
 /// Schreiber, Robert & VanLoan, Charles. (1989)
 /// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005.
 ///
-/// @pre taus contains a vector with k elements
-/// @pre t contains a (k x k) tile
+/// @param hh_panel where the elementary reflectors are stored
+/// @param taus array of taus, associated with the related elementary reflector
+/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
+/// TileElementSize(k, k)
+///
+/// @pre hh_panel.getWidth() <= t.get().size().rows() && hh_panel.getWidth() <= t.get().size().cols()
 template <Backend backend, Device device, class T>
 void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
                     pika::shared_future<common::internal::vector<T>> taus,
@@ -41,6 +55,41 @@ void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
   QR_Tfactor<backend, device, T>::call(hh_panel, taus, std::move(t));
 }
 
+/// Forms the triangular factor T of a block of reflectors H, which is defined as a product of
+/// k := hh_panel.getWidth() elementary reflectors.
+///
+/// hh_panel should have the following form
+/// H0  0  0 ...    0
+///  . H1  0 ...    0
+///  .  . H2 ...    0
+///  .  .  . ...    0
+///  .  .  . ... HK-1
+///  .  .  . ...    .
+/// H0 H1 H2 ... HK-1
+/// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1,
+///       it has to be set correctly in the panel (0s as well).
+///
+/// It is similar to what xLARFT in LAPACK does.
+/// Given k elementary reflectors stored in the columns of @p hh_panel, together with the related tau
+/// values in @p taus, the triangular factor T of the block of reflectors H is formed in @p t, such that
+///
+/// H = I - V . T . V*
+///
+/// where H = H1 . H2 . ... . Hk
+///
+/// in which Hi represents a single elementary reflector transformation.
+///
+/// A Storage-Efficient WY Representation for Products of Householder Transformations.
+/// Schreiber, Robert & VanLoan, Charles. (1989)
+/// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005.
+///
+/// @param hh_panel where the elementary reflectors are stored
+/// @param taus array of taus, associated with the related elementary reflector
+/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
+/// TileElementSize(k, k)
+/// @param mpi_col_task_chain where internal communications are issued
+///
+/// @pre hh_panel.getWidth() <= t.get().size().rows() && hh_panel.getWidth() <= t.get().size().cols()
 template <Backend backend, Device device, class T>
 void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
                     pika::shared_future<common::internal::vector<T>> taus,
diff --git a/include/dlaf/factorization/qr/api.h b/include/dlaf/factorization/qr/api.h
index e803219597..79add6fbce 100644
--- a/include/dlaf/factorization/qr/api.h
+++ b/include/dlaf/factorization/qr/api.h
@@ -24,65 +24,31 @@ template <Backend backend, Device device, class T>
 struct QR {};
 
 template <Backend backend, Device device, class T>
-struct QR_Tfactor {
-  /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k
-  /// elementary reflectors.
-  ///
-  /// It is similar to what xLARFT in LAPACK does.
-  /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start,
-  /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H
-  /// block of reflector, such that
-  ///
-  /// H = I - V . T . V*
-  ///
-  /// where H = H1 . H2 . ... . Hk
-  ///
-  /// in which Hi represents a single elementary reflector transformation
-  ///
-  /// @param k the number of elementary reflectors to use (from the beginning of the tile)
-  /// @param v where the elementary reflectors are stored
-  /// @param v_start tile in @p v where the column of reflectors starts
-  /// @param taus array of taus, associated with the related elementary reflector
-  /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
-  /// TileElementSize(k, k)
-  ///
-  /// @pre k <= t.get().size().rows && k <= t.get().size().cols()
-  /// @pre k >= 0
-  /// @pre v_start.isIn(v.nrTiles())
-  static void call(matrix::Panel<Coord::Col, T, device>& panel_view,
+struct QR_Tfactor;
+
+template <class T>
+struct QR_Tfactor<Backend::MC, Device::CPU, T> {  // docs: computeTFactor() in dlaf/factorization/qr.h
+  static void call(matrix::Panel<Coord::Col, T, Device::CPU>& hh_panel,
+                   pika::shared_future<common::internal::vector<T>> taus,
+                   pika::future<matrix::Tile<T, Device::CPU>> t);
+  static void call(matrix::Panel<Coord::Col, T, Device::CPU>& hh_panel,
                    pika::shared_future<common::internal::vector<T>> taus,
-                   pika::future<matrix::Tile<T, device>> t);
+                   pika::future<matrix::Tile<T, Device::CPU>> t,
+                   common::Pipeline<comm::Communicator>& mpi_col_task_chain);
+};
 
-  /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k
-  /// elementary reflectors.
-  ///
-  /// It is similar to what xLARFT in LAPACK does.
-  /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start,
-  /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H
-  /// block of reflector, such that
-  ///
-  /// H = I - V . T . V*
-  ///
-  /// where H = H1 . H2 . ... . Hk
-  ///
-  /// in which Hi represents a single elementary reflector transformation
-  ///
-  /// @param k the number of elementary reflectors to use (from the beginning of the tile)
-  /// @param v where the elementary reflectors are stored
-  /// @param v_start tile in @p v where the column of reflectors starts
-  /// @param taus array of taus, associated with the related elementary reflector
-  /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
-  /// TileElementSize(k, k)
-  /// @param mpi_col_task_chain where internal communications are issued
-  ///
-  /// @pre k <= t.get().size().rows && k <= t.get().size().cols()
-  /// @pre k >= 0
-  /// @pre v_start.isIn(v.nrTiles())
-  static void call(matrix::Panel<Coord::Col, T, device>& hh_panel,
+#ifdef DLAF_WITH_GPU
+template <class T>
+struct QR_Tfactor<Backend::GPU, Device::GPU, T> {  // docs: computeTFactor() in dlaf/factorization/qr.h
+  static void call(matrix::Panel<Coord::Col, T, Device::GPU>& hh_panel,
+                   pika::shared_future<common::internal::vector<T>> taus,
+                   pika::future<matrix::Tile<T, Device::GPU>> t);
+  static void call(matrix::Panel<Coord::Col, T, Device::GPU>& hh_panel,
                    pika::shared_future<common::internal::vector<T>> taus,
-                   pika::future<matrix::Tile<T, device>> t,
+                   pika::future<matrix::Tile<T, Device::GPU>> t,
                    common::Pipeline<comm::Communicator>& mpi_col_task_chain);
 };
+#endif
 
 /// ---- ETI
 #define DLAF_FACTORIZATION_QR_TFACTOR_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \
diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h
index eb1e9bdfcf..1dbe267a49 100644
--- a/include/dlaf/factorization/qr/t_factor_impl.h
+++ b/include/dlaf/factorization/qr/t_factor_impl.h
@@ -10,9 +10,16 @@
 
 #pragma once
 
-#include <pika/future.hpp>
+#include <cstddef>
+#include <memory>
 
 #include <blas.hh>
+#include <pika/barrier.hpp>
+#include <pika/future.hpp>
+
+#include "dlaf/blas/tile_extensions.h"
+#include "dlaf/matrix/tile.h"
+#include "dlaf/util_matrix.h"
 
 #ifdef DLAF_WITH_GPU
 #include <whip.hpp>
@@ -27,6 +34,8 @@
 #include "dlaf/common/range2d.h"
 #include "dlaf/common/vector.h"
 #include "dlaf/communication/kernels/all_reduce.h"
+#include "dlaf/communication/sync/all_reduce.h"
+#include "dlaf/eigensolver/get_tfactor_nworkers.h"
 #include "dlaf/lapack/tile.h"
 #include "dlaf/matrix/matrix.h"
 #include "dlaf/matrix/views.h"
@@ -59,74 +68,76 @@ struct Helpers<Backend::MC, Device::CPU, T> {
         std::forward<TSender>(t));
   }
 
-  template <class TSender>
-  static auto gemvColumnT(SizeType first_row_tile,
-                          pika::shared_future<matrix::Tile<const T, Device::CPU>> tile_vi,
-                          pika::shared_future<common::internal::vector<T>>& taus, TSender&& tile_t) {
-    namespace ex = pika::execution::experimental;
+  /// For each column j, adds to @p tile_t the part of -tau(j) . V(j:, 0:j)* . V(j:, j) due to @p tile_v.
+  /// @p first_row_tile locates the diagonal (i_diag = j - first_row_tile). Returns the updated tile.
+  static auto gemv_func(const SizeType first_row_tile, const matrix::Tile<const T, Device::CPU>& tile_v,
+                        const common::internal::vector<T>& taus,
+                        matrix::Tile<T, Device::CPU> tile_t) noexcept {
+    const SizeType k = tile_t.size().cols();
+    DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k);
+    DLAF_ASSERT(taus.size() == k, taus.size(), k);
 
-    auto gemv_func = [first_row_tile](const auto& tile_v, const auto& taus, auto&& tile_t) noexcept {
-      const SizeType k = tile_t.size().cols();
-      DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k);
-      DLAF_ASSERT(taus.size() == k, taus.size(), k);
+    for (SizeType j = 0; j < k; ++j) {
+      const T tau = taus[j];
 
-      for (SizeType j = 0; j < k; ++j) {
-        const T tau = taus[j];
+      const TileElementIndex t_start{0, j};
 
-        const TileElementIndex t_start{0, j};
+      // Position of the 1 in the diagonal in the current column.
+      const SizeType i_diag = j - first_row_tile;
+      const SizeType first_element_in_col = std::max<SizeType>(0, i_diag);
 
-        // Position of the 1 in the diagonal in the current column.
-        SizeType i_diag = j - first_row_tile;
-        const SizeType first_element_in_col = std::max<SizeType>(0, i_diag);
+      // Break if the reflector starts in the next tile.
+      if (i_diag >= tile_v.size().rows())
+        break;
 
-        // Break if the reflector starts in the next tile.
-        if (i_diag >= tile_v.size().rows())
-          break;
+      // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j), i.e. [j x 1] = [(n-j) x j]* . [(n-j) x 1]
+      TileElementIndex va_start{first_element_in_col, 0};
+      TileElementIndex vb_start{first_element_in_col, j};
+      TileElementSize va_size{tile_v.size().rows() - first_element_in_col, j};
 
-        // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j)
-        // [j x 1] = [(n-j) x j]* . [(n-j) x 1]
-        TileElementIndex va_start{first_element_in_col, 0};
-        TileElementIndex vb_start{first_element_in_col, j};
-        TileElementSize va_size{tile_v.size().rows() - first_element_in_col, j};
+      if (i_diag >= 0)  // the diagonal element of this column belongs to this tile
+        tile_t({j, j}) = tau;
 
-        if (i_diag >= 0) {
-          tile_t({j, j}) = tau;
-        }
+      blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau,
+                 tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start), 1);
+    }
+    return tile_t;
+  }
+
+  template <class TSender>
+  static auto gemvColumnT(SizeType first_row_tile,
+                          pika::shared_future<matrix::Tile<const T, Device::CPU>> tile_vi,
+                          pika::shared_future<common::internal::vector<T>>& taus, TSender&& tile_t) {
+    namespace ex = pika::execution::experimental;
 
-        blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau,
-                   tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start),
-                   1);
-      }
-      return std::move(tile_t);
-    };
     return dlaf::internal::transform(dlaf::internal::Policy<Backend::MC>(
                                          pika::execution::thread_priority::high),
-                                     std::move(gemv_func),
-                                     ex::when_all(dlaf::internal::keepFuture(tile_vi),
+                                     gemv_func,
+                                     ex::when_all(ex::just(first_row_tile),
+                                                  dlaf::internal::keepFuture(tile_vi),
                                                   dlaf::internal::keepFuture(taus),
                                                   std::forward<TSender>(tile_t)));
   }
 
+  // Applies, column after column (in order), the update t = T . t to the T factor stored in tile_t.
+  // T is upper triangular, hence TRMV can be used with the leading block preceding each column.
+  static void trmv_func(matrix::Tile<T, Device::CPU>& tile_t) {
+    const SizeType ncols = tile_t.size().cols();
+    for (SizeType col = 0; col < ncols; ++col) {
+      // top `col` entries of column `col`, multiplied in place by the leading (col x col) block
+      auto* column = tile_t.ptr(TileElementIndex{0, col});
+      blas::trmv(blas::Layout::ColMajor, blas::Uplo::Upper, blas::Op::NoTrans, blas::Diag::NonUnit, col,
+                 tile_t.ptr(), tile_t.ld(), column, 1);
+    }
+  }
+
   template <typename TSender>
   static auto trmvUpdateColumn(TSender&& tile_t) noexcept {
     namespace ex = pika::execution::experimental;
 
-    // Update each column (in order) t = T . t
-    // remember that T is upper triangular, so it is possible to use TRMV
-    auto trmv_func = [](matrix::Tile<T, Device::CPU>&& tile_t) {
-      for (SizeType j = 0; j < tile_t.size().cols(); ++j) {
-        const TileElementIndex t_start{0, j};
-        const TileElementSize t_size{j, 1};
-
-        blas::trmv(blas::Layout::ColMajor, blas::Uplo::Upper, blas::Op::NoTrans, blas::Diag::NonUnit,
-                   t_size.rows(), tile_t.ptr(), tile_t.ld(), tile_t.ptr(t_start), 1);
-      }
-      // TODO: Why return if the tile is unused?
-      return std::move(tile_t);
-    };
     return dlaf::internal::transform(dlaf::internal::Policy<Backend::MC>(
                                          pika::execution::thread_priority::high),
-                                     std::move(trmv_func), std::forward<TSender>(tile_t));
+                                     trmv_func, std::forward<TSender>(tile_t));
   }
 };
 
@@ -234,20 +245,118 @@ struct Helpers<Backend::GPU, Device::GPU, T> {
 #endif
 }
 
-template <Backend backend, Device device, class T>
-void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>& hh_panel,
-                                          pika::shared_future<common::internal::vector<T>> taus,
-                                          pika::future<matrix::Tile<T, device>> t) {
+template <class T>
+void QR_Tfactor<Backend::MC, Device::CPU, T>::call(matrix::Panel<Coord::Col, T, Device::CPU>& hh_panel,
+                                                   pika::shared_future<common::internal::vector<T>> taus,
+                                                   pika::future<matrix::Tile<T, Device::CPU>> t) {
+  constexpr auto B = Backend::MC;
+  constexpr auto D = Device::CPU;
+
+  namespace ex = pika::execution::experimental;
+
+  // Fast return in case of no reflectors
+  if (hh_panel.getWidth() == 0)
+    return;
+
+  std::vector<decltype(hh_panel.read_sender(std::declval<LocalTileIndex>()))> panel_tiles;
+  for (const auto idx : hh_panel.iteratorLocal())
+    panel_tiles.push_back(hh_panel.read_sender(idx));
+
+  // Computes in t the T factor of the reflectors in hh_panel (multi-threaded, no communication).
+  // T factor is an upper triangular square matrix, built column by column
+  // with taus values on the diagonal
+  //
+  // T(j,j) = tau(j)
+  //
+  // and in the upper triangular part the following formula applies
+  //
+  // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j)
+  //
+  // The result is achieved in two main steps:
+  // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j)   (multi-threaded, partial results are then summed)
+  // 2) T(0:j, j) = T(0:j, 0:j) . t            (single-threaded, by the worker with index 0)
+  // A barrier shared among the workers separates the two steps.
+
+  const SizeType v_start = hh_panel.offsetElement();
+  const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows();
+  const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row();
+
+  const std::size_t nthreads = getTFactorNWorkers();
+  ex::start_detached(
+      ex::when_all(ex::just(std::make_shared<pika::barrier<>>(nthreads)),
+                   ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t)) |
+      ex::let_value([=](auto& barrier_ptr, auto& panel, const common::internal::vector<T>& taus,
+                        matrix::Tile<T, D>& t) {
+        matrix::Matrix<T, D> t_all({t.size().rows() * to_SizeType(nthreads - 1), t.size().cols()},
+                                   t.size());
+        return ex::when_all_vector(
+                   select(t_all, common::iterate_range2d(t_all.distribution().localNrTiles()))) |
+               ex::transfer(
+                   dlaf::internal::getBackendScheduler<B>(pika::execution::thread_priority::high)) |
+               ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus,
+                                   &panel](const std::size_t index,
+                                           std::vector<matrix::Tile<T, D>>& t_all) {
+                 using Helpers = tfactor_l::Helpers<B, D, T>;
+                 // zero the accumulator of this worker (t for index 0, t_all[index - 1] otherwise)
+                 tile::internal::set0<T>(index == 0 ? t : t_all[index - 1]);
+
+                 // 1st step
+                 // compute the column partial result `t` (multi-threaded): each worker processes
+                 // a chunk [begin, end) of the panel tiles, accumulating for each column
+                 // -tau(j) . V(j:, 0:j)* . V(j:, j)
+                 const std::size_t chunk_size = util::ceilDiv(panel.size(), nthreads);
+                 const std::size_t begin = index * chunk_size;
+                 const std::size_t end = std::min(index * chunk_size + chunk_size, panel.size());
+
+                 for (std::size_t i = begin; i < end; ++i) {
+                   const matrix::Tile<const T, D>& tile_v = panel[i].get();
+                   // first row of the tile (tile row * block rows) relative to v_start, clamped to 0
+                   const SizeType first_row_tile =
+                       std::max<SizeType>(0, (panelRowBegin + to_SizeType(i)) * bsRows - v_start);
+
+                   if (index == 0)
+                     t = Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t));
+                   else
+                     t_all[index - 1] =
+                         Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t_all[index - 1]));
+                 }
+                 // wait for all the workers to complete their partial results
+                 barrier_ptr->arrive_and_wait();
+
+                 // (single-threaded) just the worker with index 0 goes on
+                 if (index == 0) {
+                   // reduce the partial results of the other workers into t
+                   for (auto& partial_t : t_all)
+                     tile::internal::add(T(1), partial_t, t);
+
+                   // 2nd step
+                   // compute the T factor, by performing the last step on each column
+                   // (single-threaded) each column depends on the previous part (all reflectors
+                   // that come before) so it is performed sequentially
+                   Helpers::trmv_func(t);
+                 }
+               });
+      }));
+}
+
+#ifdef DLAF_WITH_GPU
+template <class T>
+void QR_Tfactor<Backend::GPU, Device::GPU, T>::call(matrix::Panel<Coord::Col, T, Device::GPU>& hh_panel,
+                                                    pika::shared_future<common::internal::vector<T>> taus,
+                                                    pika::future<matrix::Tile<T, Device::GPU>> t) {
+  constexpr auto B = Backend::GPU;
+  constexpr auto D = Device::GPU;
+
   namespace ex = pika::execution::experimental;
 
-  using Helpers = tfactor_l::Helpers<backend, device, T>;
+  using Helpers = tfactor_l::Helpers<B, D, T>;
   // Fast return in case of no reflectors
   if (hh_panel.getWidth() == 0)
     return;
 
   const auto v_start = hh_panel.offsetElement();
 
-  ex::unique_any_sender<matrix::Tile<T, device>> t_local = Helpers::set0(std::move(t));
+  ex::unique_any_sender<matrix::Tile<T, D>> t_local = Helpers::set0(std::move(t));
 
   // Note:
   // T factor is an upper triangular square matrix, built column by column
@@ -283,15 +392,127 @@ void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>&
   // so it is performed sequentially
   ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local)));
 }
+#endif
+
+template <class T>
+void QR_Tfactor<Backend::MC, Device::CPU, T>::call(
+    matrix::Panel<Coord::Col, T, Device::CPU>& hh_panel,
+    pika::shared_future<common::internal::vector<T>> taus, pika::future<matrix::Tile<T, Device::CPU>> t,
+    common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
+  constexpr auto B = Backend::MC;
+  constexpr auto D = Device::CPU;
 
-template <Backend backend, Device device, class T>
-void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>& hh_panel,
-                                          pika::shared_future<common::internal::vector<T>> taus,
-                                          pika::future<matrix::Tile<T, device>> t,
-                                          common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
   namespace ex = pika::execution::experimental;
 
-  using Helpers = tfactor_l::Helpers<backend, device, T>;
+  // Fast return in case of no reflectors
+  if (hh_panel.getWidth() == 0)
+    return;
+
+  std::vector<decltype(hh_panel.read_sender(std::declval<LocalTileIndex>()))> panel_tiles;
+  for (const auto idx : hh_panel.iteratorLocal())
+    panel_tiles.push_back(hh_panel.read_sender(idx));
+
+  // Computes in t the T factor of the reflectors in hh_panel (multi-threaded + MPI all-reduce).
+  // T factor is an upper triangular square matrix, built column by column
+  // with taus values on the diagonal
+  //
+  // T(j,j) = tau(j)
+  //
+  // and in the upper triangular part the following formula applies
+  //
+  // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j)
+  //
+  // The result is achieved in two main steps:
+  // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j)   (multi-threaded, then summed also among the ranks)
+  // 2) T(0:j, j) = T(0:j, 0:j) . t            (single-threaded, by the worker with index 0)
+  // A barrier shared among the workers separates the two steps.
+
+  const auto dist = hh_panel.parentDistribution();
+
+  const SizeType v_start = hh_panel.offsetElement();
+  const SizeType bsRows = hh_panel.parentDistribution().blockSize().rows();
+  const SizeType panelRowBegin = hh_panel.iteratorLocal().begin()->row();
+
+  const std::size_t nthreads = getTFactorNWorkers();
+  ex::start_detached(
+      ex::when_all(ex::just(std::make_shared<pika::barrier<>>(nthreads)),
+                   ex::when_all_vector(std::move(panel_tiles)), std::move(taus), std::move(t),
+                   mpi_col_task_chain()) |
+      ex::let_value([=](auto& barrier_ptr, auto&& panel, const common::internal::vector<T>& taus,
+                        matrix::Tile<T, D>& t, auto&& pcomm) {
+        matrix::Matrix<T, D> t_all({t.size().rows() * to_SizeType(nthreads - 1), t.size().cols()},
+                                   t.size());
+        return ex::when_all_vector(
+                   select(t_all, common::iterate_range2d(t_all.distribution().localNrTiles()))) |
+               ex::transfer(
+                   dlaf::internal::getBackendScheduler<B>(pika::execution::thread_priority::high)) |
+               ex::bulk(nthreads, [=, &barrier_ptr, &t, &taus, &panel,
+                                   &pcomm](const std::size_t index,
+                                           std::vector<matrix::Tile<T, D>>& t_all) {
+                 using Helpers = tfactor_l::Helpers<B, D, T>;
+                 // zero the accumulator of this worker (t for index 0, t_all[index - 1] otherwise)
+                 tile::internal::set0<T>(index == 0 ? t : t_all[index - 1]);
+
+                 // 1st step
+                 // compute the column partial result `t` (multi-threaded): each worker processes
+                 // a chunk [begin, end) of the local panel tiles, accumulating for each column
+                 // -tau(j) . V(j:, 0:j)* . V(j:, j)
+                 const std::size_t chunk_size = util::ceilDiv(panel.size(), nthreads);
+                 const std::size_t begin = index * chunk_size;
+                 const std::size_t end = std::min(index * chunk_size + chunk_size, panel.size());
+
+                 for (std::size_t i = begin; i < end; ++i) {
+                   const matrix::Tile<const T, D>& tile_v = panel[i].get();
+                   // first row of the (global) tile relative to v_start, clamped to 0
+                   const SizeType first_row_tile =
+                       std::max<SizeType>(0, dist.template globalTileFromLocalTile<Coord::Row>(
+                                                 panelRowBegin + to_SizeType(i)) *
+                                                     bsRows -
+                                                 v_start);
+
+                   if (index == 0)
+                     t = Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t));
+                   else
+                     t_all[index - 1] =
+                         Helpers::gemv_func(first_row_tile, tile_v, taus, std::move(t_all[index - 1]));
+                 }
+                 // wait for all the workers to complete their partial results
+                 barrier_ptr->arrive_and_wait();
+
+                 // (single-threaded) just the worker with index 0 goes on
+                 if (index == 0) {
+                   // reduce the partial results of the other workers into t
+                   for (auto& partial_t : t_all)
+                     tile::internal::add(T(1), partial_t, t);
+
+                   // at this point each rank has its partial result for each column
+                   // so, let's reduce the results (on all ranks, so that everyone can
+                   // independently compute T factor)
+                   if (pcomm.ref().size() > 1)
+                     comm::sync::allReduceInPlace(pcomm.ref(), MPI_SUM, common::make_data(t));
+
+                   // 2nd step
+                   // compute the T factor, by performing the last step on each column
+                   // (single-threaded) each column depends on the previous part (all reflectors
+                   // that come before) so it is performed sequentially
+                   Helpers::trmv_func(t);
+                 }
+               });
+      }));
+}
+
+#ifdef DLAF_WITH_GPU
+template <class T>
+void QR_Tfactor<Backend::GPU, Device::GPU, T>::call(
+    matrix::Panel<Coord::Col, T, Device::GPU>& hh_panel,
+    pika::shared_future<common::internal::vector<T>> taus, pika::future<matrix::Tile<T, Device::GPU>> t,
+    common::Pipeline<comm::Communicator>& mpi_col_task_chain) {
+  constexpr auto B = Backend::GPU;
+  constexpr auto D = Device::GPU;
+
+  namespace ex = pika::execution::experimental;
+
+  using Helpers = tfactor_l::Helpers<B, D, T>;
 
   // Fast return in case of no reflectors
   if (hh_panel.getWidth() == 0)
@@ -300,7 +521,7 @@ void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>&
   const auto v_start = hh_panel.offsetElement();
   auto dist = hh_panel.parentDistribution();
 
-  ex::unique_any_sender<matrix::Tile<T, device>> t_local = Helpers::set0(std::move(t));
+  ex::unique_any_sender<matrix::Tile<T, D>> t_local = Helpers::set0(std::move(t));
 
   // Note:
   // T factor is an upper triangular square matrix, built column by column
@@ -342,5 +563,5 @@ void QR_Tfactor<backend, device, T>::call(matrix::Panel<Coord::Col, T, device>&
   // so it is performed sequentially
   ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local)));
 }
-
+#endif
 }
diff --git a/include/dlaf/tune.h b/include/dlaf/tune.h
index 072a8cf0e6..548e659d2d 100644
--- a/include/dlaf/tune.h
+++ b/include/dlaf/tune.h
@@ -9,6 +9,8 @@
 //
 #pragma once
 
+#include <cstddef>
+
 #include <pika/runtime.hpp>
 #include <dlaf/types.h>
 
@@ -17,6 +19,7 @@ namespace dlaf {
 ///
 /// Holds the value of the parameters that can be used to tune DLA-Future.
 /// - red2band_panel_nworkers: number of threads to use for computing the panel in the reduction to band algorithm.
+/// - tfactor_nworkers: number of threads to use for computing the T factor.
 /// - eigensolver_min_band: The minimun value to start looking for a divisor of the block size.
 ///                         Set with --dlaf:eigensolver-min-band or env variable DLAF_EIGENSOLVER_MIN_BAND.
 /// - band_to_tridiag_1d_block_size_base:
@@ -29,8 +32,11 @@ namespace dlaf {
 ///     DLAF_BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE.
 /// Note to developers: Users can change these values, therefore consistency has to be ensured by algorithms.
 struct TuneParameters {
-  size_t red2band_panel_nworkers =
-      std::max<size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);
+  std::size_t red2band_panel_nworkers =
+      std::max<std::size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);
+
+  std::size_t tfactor_nworkers =
+      std::max<std::size_t>(1, pika::resource::get_thread_pool("default").get_os_thread_count() / 2);
 
   SizeType eigensolver_min_band = 100;
   SizeType band_to_tridiag_1d_block_size_base = 8192;