From ed4193eeeaceb0a53ed98b25042f16dac09fca6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Raffaele=20Solc=C3=A0?= Date: Wed, 19 Apr 2023 13:17:16 +0200 Subject: [PATCH] Permute Distributed: migrate from AllToAll to Send/Recv (#835) --- .../dlaf/eigensolver/tridiag_solver/merge.h | 2 - include/dlaf/permutations/general.h | 10 +- include/dlaf/permutations/general/api.h | 4 +- include/dlaf/permutations/general/impl.h | 404 ++++++++++++------ test/src/gtest_mpi_listener.cpp | 3 +- .../test_permutations_distributed.cpp | 137 ++++-- .../permutations/test_permutations_local.cpp | 38 +- 7 files changed, 392 insertions(+), 206 deletions(-) diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h index ceb6fb52f4..4f7940a051 100644 --- a/include/dlaf/eigensolver/tridiag_solver/merge.h +++ b/include/dlaf/eigensolver/tridiag_solver/merge.h @@ -35,8 +35,6 @@ #include "dlaf/types.h" #include "dlaf/util_matrix.h" -#include "dlaf/matrix/print_csv.h" - namespace dlaf::eigensolver::internal { // Auxiliary matrix and vectors used for the D&C algorithm diff --git a/include/dlaf/permutations/general.h b/include/dlaf/permutations/general.h index 968d04f228..3690011a5e 100644 --- a/include/dlaf/permutations/general.h +++ b/include/dlaf/permutations/general.h @@ -34,8 +34,8 @@ namespace dlaf::permutations { /// the closed range [i_begin,i_end] are accessed in write-only mode. /// template -void permute(SizeType i_begin, SizeType i_end, Matrix& perms, Matrix& mat_in, - Matrix& mat_out) { +void permute(SizeType i_begin, SizeType i_end, Matrix& perms, + Matrix& mat_in, Matrix& mat_out) { const matrix::Distribution& distr_perms = perms.distribution(); const matrix::Distribution& distr_in = mat_in.distribution(); const matrix::Distribution& distr_out = mat_out.distribution(); @@ -82,8 +82,8 @@ void permute(SizeType i_begin, SizeType i_end, Matrix& perms, /// template void permute(comm::CommunicatorGrid grid, common::Pipeline& sub_task_chain, - SizeType i_begin, SizeType i_end, Matrix& perms, Matrix& mat_in, - Matrix& mat_out) { + SizeType i_begin, SizeType i_end, Matrix& perms, + Matrix& mat_in, Matrix& mat_out) { const matrix::Distribution& distr_perms = perms.distribution(); const matrix::Distribution& distr_in = mat_in.distribution(); @@ -110,7 +110,7 @@ void permute(comm::CommunicatorGrid grid, common::Pipeline& /// template void permute(comm::CommunicatorGrid grid, SizeType i_begin, SizeType i_end, - Matrix& perms, Matrix& mat_in, Matrix& mat_out) { + Matrix& perms, Matrix& mat_in, Matrix& mat_out) { common::Pipeline sub_task_chain(grid.subCommunicator(orthogonal(coord)).clone()); permute(grid, sub_task_chain, i_begin, i_end, perms, mat_in, mat_out); } diff --git a/include/dlaf/permutations/general/api.h b/include/dlaf/permutations/general/api.h index 3b08233d86..3a75a29e89 100644 --- a/include/dlaf/permutations/general/api.h +++ b/include/dlaf/permutations/general/api.h @@ -21,9 +21,9 @@ namespace dlaf::permutations::internal { template struct Permutations { static void call(SizeType i_begin, SizeType i_end, Matrix& perms, - Matrix& mat_in, Matrix& mat_out); + Matrix& mat_in, Matrix& mat_out); static void call(common::Pipeline& sub_task_chain, SizeType i_begin, - SizeType i_end, Matrix& perms, Matrix& mat_in, + SizeType i_end, Matrix& perms, Matrix& mat_in, Matrix& mat_out); }; diff --git a/include/dlaf/permutations/general/impl.h b/include/dlaf/permutations/general/impl.h index 54d7f77bca..6c6aa309c8 100644 --- a/include/dlaf/permutations/general/impl.h +++ 
b/include/dlaf/permutations/general/impl.h @@ -10,6 +10,7 @@ #pragma once +#include "dlaf/matrix/index.h" #include "dlaf/permutations/general/api.h" #include "dlaf/permutations/general/perms.h" @@ -24,6 +25,7 @@ #include "dlaf/matrix/copy_tile.h" #include "dlaf/matrix/matrix.h" #include "dlaf/schedulers.h" +#include "dlaf/sender/policy.h" #include "dlaf/sender/transform.h" #include "dlaf/sender/transform_mpi.h" #include "dlaf/sender/when_all_lift.h" @@ -31,6 +33,7 @@ #include "dlaf/util_matrix.h" #include +#include #include #include "pika/algorithm.hpp" @@ -126,9 +129,52 @@ void applyPermutations(GlobalElementIndex out_begin, GlobalElementSize sz, SizeT } } +// FilterFunc is a function with signature bool(*)(SizeType) +template +void applyPermutationsFiltered( + GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, + const matrix::Distribution& subm_dist, const SizeType* perm_arr, + const std::vector>>& in_tiles_fut, + const std::vector>& out_tiles, FilterFunc&& filter) { + constexpr auto OC = orthogonal(C); + std::vector splits = + dlaf::util::interleaveSplits(sz.get(), subm_dist.blockSize().get(), + subm_dist.distanceToAdjacentTile(in_offset), + subm_dist.distanceToAdjacentTile(out_begin.get())); + + const SizeType nperms = subm_dist.size().get(); + + // Parallelized over the number of permutations + pika::for_loop(pika::execution::par, to_sizet(0), to_sizet(nperms), [&](SizeType i_perm) { + if (!filter(perm_arr[i_perm])) + return; + + for (std::size_t i_split = 0; i_split < splits.size() - 1; ++i_split) { + const SizeType split = splits[i_split]; + + GlobalElementIndex i_split_gl_in(split + in_offset, perm_arr[i_perm]); + GlobalElementIndex i_split_gl_out(split + out_begin.get(), out_begin.get() + i_perm); + TileElementSize region(splits[i_split + 1] - split, 1); + + if constexpr (C == Coord::Row) { + region.transpose(); + i_split_gl_in.transpose(); + i_split_gl_out.transpose(); + } + + const TileElementIndex i_subtile_in = subm_dist.tileElementIndex(i_split_gl_in); + const auto& tile_in = in_tiles_fut[to_sizet(subm_dist.globalTileLinearIndex(i_split_gl_in))].get(); + const TileElementIndex i_subtile_out = subm_dist.tileElementIndex(i_split_gl_out); + auto& tile_out = out_tiles[to_sizet(subm_dist.globalTileLinearIndex(i_split_gl_out))]; + + dlaf::tile::lacpy(region, i_subtile_in, tile_in, i_subtile_out, tile_out); + } + }); +} + template void Permutations::call(SizeType i_begin, SizeType i_end, Matrix& perms, - Matrix& mat_in, Matrix& mat_out) { + Matrix& mat_in, Matrix& mat_out) { namespace ut = matrix::util; namespace ex = pika::execution::experimental; @@ -191,119 +237,135 @@ auto whenAllReadOnlyTilesArray(Matrix& matrix) { ut::collectReadTiles(LocalTileIndex(0, 0), matrix.distribution().localNrTiles(), matrix)); } -// No data is sent or received but the processes participates in the collective call -template -void all2allEmptyData(common::Pipeline& sub_task_chain, int nranks) { - namespace ex = pika::execution::experimental; - - auto all2all_f = [](const comm::Communicator& comm, const std::vector& arr, MPI_Request* req) { - MPI_Datatype dtype = dlaf::comm::mpi_datatype>::type; - // Avoid buffer aliasing error reported when send_ptr and recv_ptr are the same. 
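The hunks around this point drop the MPI_Ialltoallv-based path entirely and exchange the packed data with one Isend/Irecv pair per partner rank, skipping the local rank and partners with zero counts. A minimal standalone sketch of that pattern, assuming plain contiguous double buffers already packed by rank (the actual code counts whole rows/columns, builds DataDescriptor messages for strided blocks and schedules them through pika senders):

#include <mpi.h>

#include <numeric>
#include <vector>

// Sketch only: exchange per-rank packed data with point-to-point messages.
// Counts and displacements are in elements here, unlike the real code.
void exchange(MPI_Comm comm, const std::vector<int>& send_counts, const double* send_buf,
              const std::vector<int>& recv_counts, double* recv_buf) {
  int rank, nranks;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &nranks);

  std::vector<int> send_displs(send_counts.size()), recv_displs(recv_counts.size());
  std::exclusive_scan(send_counts.begin(), send_counts.end(), send_displs.begin(), 0);
  std::exclusive_scan(recv_counts.begin(), recv_counts.end(), recv_displs.begin(), 0);

  std::vector<MPI_Request> reqs;
  for (int partner = 0; partner < nranks; ++partner) {
    if (partner == rank)
      continue;  // the local part never goes through MPI
    if (send_counts[partner]) {
      reqs.emplace_back();
      MPI_Isend(send_buf + send_displs[partner], send_counts[partner], MPI_DOUBLE, partner, 0,
                comm, &reqs.back());
    }
    if (recv_counts[partner]) {
      reqs.emplace_back();
      MPI_Irecv(recv_buf + recv_displs[partner], recv_counts[partner], MPI_DOUBLE, partner, 0,
                comm, &reqs.back());
    }
  }
  MPI_Waitall(static_cast<int>(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE);
}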
- int irrelevant = 42; - DLAF_MPI_CHECK_ERROR(MPI_Ialltoallv(nullptr, arr.data(), arr.data(), dtype, &irrelevant, arr.data(), - arr.data(), dtype, comm, req)); - }; - dlaf::comm::internal::transformMPIDetach(std::move(all2all_f), - ex::when_all(sub_task_chain(), - ex::just(std::vector(to_sizet(nranks), 0)))); -} - -// Note: Matrices used for communication @p send_mat and @p recv_mat are assumed to be in column-major layout! -// template void all2allData(common::Pipeline& sub_task_chain, int nranks, - LocalElementSize sz_loc, SendCountsSender&& send_counts_sender, Matrix& send_mat, - RecvCountsSender&& recv_counts_sender, Matrix& recv_mat) { + LocalElementSize sz_loc, SendCountsSender&& send_counts_sender, + Matrix& send_mat, RecvCountsSender&& recv_counts_sender, + Matrix& recv_mat) { namespace ex = pika::execution::experimental; - auto all2all_f = - [len = sz_loc.get()](const comm::Communicator& comm, std::vector& send_counts, - std::vector& send_displs, - const std::vector>>& send_tiles_fut, - std::vector& recv_counts, std::vector& recv_displs, - const std::vector>& recv_tiles, MPI_Request* req) { - // datatype to be sent to each rank - MPI_Datatype dtype = dlaf::comm::mpi_datatype>::type; - - // scale by the length of the row or column vectors to get the number of elements sent to each process - auto mul_const = [len](int num) { return to_int(len) * num; }; - std::transform(send_counts.cbegin(), send_counts.cend(), send_counts.begin(), mul_const); - std::transform(recv_counts.cbegin(), recv_counts.cend(), recv_counts.begin(), mul_const); - - // Note: that was guaranteed to be contiguous on allocation + using dlaf::common::DataDescriptor; + + const SizeType vec_size = sz_loc.get(); + auto sendrecv_f = + [vec_size](comm::Communicator& comm, std::vector send_counts, std::vector send_displs, + const std::vector>>& send_tiles_fut, + std::vector recv_counts, std::vector recv_displs, + const std::vector>& recv_tiles) { + // Note: both guaranteed to be column-major on allocation const T* send_ptr = send_tiles_fut[0].get().ptr(); T* recv_ptr = recv_tiles[0].ptr(); - // send displacements - std::exclusive_scan(send_counts.begin(), send_counts.end(), send_displs.begin(), 0); + const SizeType send_ld = send_tiles_fut[0].get().ld(); + const SizeType recv_ld = recv_tiles[0].ld(); + + const SizeType send_perm_stride = (C == Coord::Col) ? send_ld : 1; + const SizeType recv_perm_stride = (C == Coord::Col) ? 
recv_ld : 1; - // recv displacements + // cumulative sum for computing rank data displacements in packed vectors + std::exclusive_scan(send_counts.begin(), send_counts.end(), send_displs.begin(), 0); std::exclusive_scan(recv_counts.begin(), recv_counts.end(), recv_displs.begin(), 0); - // All-to-All communication - DLAF_MPI_CHECK_ERROR(MPI_Ialltoallv(send_ptr, send_counts.data(), send_displs.data(), dtype, - recv_ptr, recv_counts.data(), recv_displs.data(), dtype, - comm, req)); + std::vector> all_comms; + all_comms.reserve(to_sizet(comm.size() - 1) * 2); + const comm::IndexT_MPI rank = comm.rank(); + for (comm::IndexT_MPI rank_partner = 0; rank_partner < comm.size(); ++rank_partner) { + if (rank == rank_partner) + continue; + + const auto rank_partner_index = to_sizet(rank_partner); + + if (send_counts[rank_partner_index]) + all_comms.push_back( + ex::just() | dlaf::comm::internal::transformMPI([=](MPI_Request* req) { + const SizeType nperms = send_counts[rank_partner_index]; + auto message = dlaf::comm::make_message( + DataDescriptor(send_ptr + + send_displs[rank_partner_index] * send_perm_stride, + C == Coord::Col ? nperms : vec_size, + C == Coord::Col ? vec_size : nperms, send_ld)); + + DLAF_MPI_CHECK_ERROR(MPI_Isend(message.data(), message.count(), message.mpi_type(), + rank_partner, 0, comm, req)); + })); + if (recv_counts[rank_partner_index]) + all_comms.push_back( + ex::just() | dlaf::comm::internal::transformMPI([=](MPI_Request* req) { + const SizeType nperms = recv_counts[rank_partner_index]; + auto message = dlaf::comm::make_message( + DataDescriptor(recv_ptr + recv_displs[rank_partner_index] * recv_perm_stride, + C == Coord::Col ? nperms : vec_size, + C == Coord::Col ? vec_size : nperms, recv_ld)); + + DLAF_MPI_CHECK_ERROR(MPI_Irecv(message.data(), message.count(), message.mpi_type(), + rank_partner, 0, comm, req)); + })); + } + + pika::this_thread::experimental::sync_wait(ex::when_all_vector(std::move(all_comms))); }; - auto sender = - ex::when_all(sub_task_chain(), std::forward(send_counts_sender), - ex::just(std::vector(to_sizet(nranks))), whenAllReadOnlyTilesArray(send_mat), - std::forward(recv_counts_sender), - ex::just(std::vector(to_sizet(nranks))), whenAllReadWriteTilesArray(recv_mat)); - dlaf::comm::internal::transformMPIDetach(std::move(all2all_f), std::move(sender)); + ex::when_all(sub_task_chain(), std::forward(send_counts_sender), + ex::just(std::vector(to_sizet(nranks))), whenAllReadOnlyTilesArray(send_mat), + std::forward(recv_counts_sender), + ex::just(std::vector(to_sizet(nranks))), whenAllReadWriteTilesArray(recv_mat)) | + dlaf::internal::transformDetach(dlaf::internal::Policy(), sendrecv_f); } // @param nranks number of ranks -// @param loc2gl_index a column matrix that represents a map from local to global indices +// @param offset_sub where the sub-matrix to permute starts in the global matrix +// @param loc2sub_index a column matrix that represents a map from local to sub indices // @param packing_index a column matrix that represents a map from packed indices to local indices -// that is used for packing columns or rows on each rank // // Note: the order of the packed rows or columns on the send side must match the expected order at // unpacking on the receive side -template -auto initPackingIndex(int nranks, SizeType i_el_begin, const matrix::Distribution& dist, - Matrix& loc2gl_index, +template +auto initPackingIndex(comm::IndexT_MPI nranks, SizeType offset_sub, const matrix::Distribution& dist, + Matrix& loc2sub_index, Matrix& packing_index) { 
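initPackingIndex, whose body follows, walks the local-to-sub index map once per rank and produces (a) the number of rows/columns destined for each rank and (b) a packing index that places entries going to the same rank contiguously. A self-contained sketch of the forward-mapping case (BackwardMapping = false), assuming a hypothetical 1-D block-cyclic ownership rule instead of the DLAF Distribution object:

#include <cstdint>
#include <vector>

using SizeType = std::int64_t;

struct PackingPlan {
  std::vector<int> counts;       // number of rows/columns destined for each rank
  std::vector<SizeType> packed;  // packed[i]: packed position of local entry i
};

// Hypothetical helper, not the DLAF implementation: ownership follows a 1-D
// block-cyclic rule with block size nb instead of querying a Distribution.
PackingPlan makePackingPlan(const std::vector<SizeType>& loc2sub, SizeType offset_sub, SizeType nb,
                            int nranks) {
  PackingPlan plan{std::vector<int>(static_cast<std::size_t>(nranks), 0),
                   std::vector<SizeType>(loc2sub.size())};
  SizeType rank_displacement = 0;
  for (int rank = 0; rank < nranks; ++rank) {
    int& count = plan.counts[static_cast<std::size_t>(rank)];
    for (std::size_t i = 0; i < loc2sub.size(); ++i) {
      const SizeType global = offset_sub + loc2sub[i];
      const int owner = static_cast<int>((global / nb) % nranks);  // assumed ownership rule
      if (owner == rank)
        plan.packed[i] = rank_displacement + count++;  // forward mapping: local -> packed
    }
    rank_displacement += count;
  }
  return plan;
}

With BackwardMapping = true the roles of the two indices are swapped and each rank's block is additionally sorted by sub index, which is what makes the send-side packing order match the unpacking order expected on the receive side.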
namespace ex = pika::execution::experimental; namespace di = dlaf::internal; - auto counts_fn = [nranks, i_el_begin, dist, - len = packing_index.size().rows()](const auto& loc2gl_index_tiles, - const auto& packing_index_tiles) { - const SizeType* in = loc2gl_index_tiles[0].get().ptr(); + auto counts_fn = [nranks, offset_sub, dist, + nperms = packing_index.size().rows()](const auto& loc2gl_index_tiles, + const auto& packing_index_tiles) { + const SizeType* loc2sub = loc2gl_index_tiles[0].get().ptr(); SizeType* out = packing_index_tiles[0].ptr(); std::vector counts(to_sizet(nranks)); - int offset = 0; - for (int rank = 0; rank < nranks; ++rank) { - int& count = counts[to_sizet(rank)]; - count = 0; - for (SizeType i = 0; i < len; ++i) { - if (dist.rankGlobalElement(i_el_begin + in[i]) == rank) { - if constexpr (PackBasedOnGlobalIndex) { - out[offset + count] = i; - } - else { - out[i] = offset + count; - } - ++count; + + for (int rank = 0, rank_displacement = 0; rank < nranks; ++rank) { + int& nperms_local = counts[to_sizet(rank)] = 0; + + for (SizeType perm_index_local = 0; perm_index_local < nperms; ++perm_index_local) { + const SizeType perm_index_global = offset_sub + loc2sub[perm_index_local]; + if (dist.rankGlobalElement(perm_index_global) == rank) { + const SizeType perm_index_packed = rank_displacement + nperms_local; + + if constexpr (BackwardMapping) + out[perm_index_packed] = perm_index_local; + else + out[perm_index_local] = perm_index_packed; + + ++nperms_local; } } - if constexpr (PackBasedOnGlobalIndex) { - std::sort(out + offset, out + offset + count, - [in](SizeType i1, SizeType i2) { return in[i1] < in[i2]; }); - } - offset += count; + + // Note: + // This sorting actually "inverts" the mapping. + if constexpr (BackwardMapping) + std::sort(out + rank_displacement, out + rank_displacement + nperms_local, + [loc2sub](SizeType i1, SizeType i2) { return loc2sub[i1] < loc2sub[i2]; }); + + rank_displacement += nperms_local; } + return counts; }; - auto sender = - ex::when_all(whenAllReadOnlyTilesArray(loc2gl_index), whenAllReadWriteTilesArray(packing_index)); return ex::ensure_started( - di::transform(di::Policy{}, std::move(counts_fn), std::move(sender))); + ex::when_all(whenAllReadOnlyTilesArray(loc2sub_index), whenAllReadWriteTilesArray(packing_index)) | + di::transform(di::Policy{}, std::move(counts_fn))); } // Copies index tiles belonging to the current process from the complete index @p global_index into the @@ -314,11 +376,11 @@ void copyLocalPartsFromGlobalIndex(SizeType i_loc_begin, const matrix::Distribut Matrix& local_index) { namespace ex = pika::execution::experimental; - for (auto tile_wrt_local : common::iterate_range2d(local_index.distribution().localNrTiles())) { - SizeType i_gl_tile = dist.globalTileFromLocalTile(i_loc_begin + tile_wrt_local.row()); - ex::start_detached(ex::when_all(global_index.read_sender(GlobalTileIndex(i_gl_tile, 0)), - local_index.readwrite_sender(tile_wrt_local)) | - dlaf::matrix::copy(dlaf::internal::Policy>{})); + for (const LocalTileIndex i : common::iterate_range2d(local_index.distribution().localNrTiles())) { + const GlobalTileIndex i_global(dist.globalTileFromLocalTile(i_loc_begin + i.row()), 0); + ex::start_detached( + ex::when_all(global_index.read_sender(i_global), local_index.readwrite_sender(i)) | + dlaf::matrix::copy(dlaf::internal::Policy>{})); } } @@ -411,97 +473,160 @@ inline void invertIndex(SizeType i_begin, SizeType i_end, Matrix -void permuteOnCPU(common::Pipeline& sub_task_chain, SizeType i_begin, SizeType i_end, - 
Matrix& perms, Matrix& mat_in, - Matrix& mat_out) { +void permuteOnCPU(common::Pipeline& sub_task_chain, SizeType i_begin, + SizeType i_last, Matrix& perms, + Matrix& mat_in, Matrix& mat_out) { constexpr Device D = Device::CPU; - const matrix::Distribution& dist = mat_in.distribution(); - int nranks = to_int(dist.commGridSize().get()); + using namespace dlaf::matrix; + + namespace ex = pika::execution::experimental; + namespace di = dlaf::internal; + + const Distribution& dist = mat_in.distribution(); + const comm::IndexT_MPI nranks = to_int(dist.commGridSize().get()); - // Local size and index of subproblem [i_begin, i_end] - SizeType i_el_begin = dist.globalElementFromGlobalTileAndTileElement(i_begin, 0); - TileElementSize blk = dist.blockSize(); - LocalTileIndex i_loc_begin{dist.nextLocalTileFromGlobalTile(i_begin), - dist.nextLocalTileFromGlobalTile(i_begin)}; - LocalTileIndex i_loc_end{dist.nextLocalTileFromGlobalTile(i_end + 1) - 1, - dist.nextLocalTileFromGlobalTile(i_end + 1) - 1}; + const SizeType i_end = i_last + 1; + + // Local size and index of subproblem [i_begin, i_last] + const SizeType offset_sub = dist.globalElementFromGlobalTileAndTileElement(i_begin, 0); + const TileElementSize blk = dist.blockSize(); + + const LocalTileIndex i_loc_begin{dist.nextLocalTileFromGlobalTile(i_begin), + dist.nextLocalTileFromGlobalTile(i_begin)}; + const LocalTileIndex i_loc_end{dist.nextLocalTileFromGlobalTile(i_end) - 1, + dist.nextLocalTileFromGlobalTile(i_end) - 1}; // Note: the local shape of the permutation region may not be square if the process grid is not square - LocalElementSize sz_loc{dist.localElementDistanceFromGlobalTile(i_begin, i_end + 1), - dist.localElementDistanceFromGlobalTile(i_begin, i_end + 1)}; + const LocalElementSize sz_loc{dist.localElementDistanceFromGlobalTile(i_begin, i_end), + dist.localElementDistanceFromGlobalTile(i_begin, i_end)}; - // If there are no tiles in this rank, participate in the all2all call and return - if (sz_loc.isEmpty()) { - all2allEmptyData(sub_task_chain, nranks); + // If there are no tiles in this rank, nothing to do here + if (sz_loc.isEmpty()) return; - } // Create a map from send indices to receive indices (inverse of perms) Matrix inverse_perms(perms.distribution()); - invertIndex(i_begin, i_end, perms, inverse_perms); + invertIndex(i_begin, i_last, perms, inverse_perms); // Local distribution used for packing and unpacking - matrix::Distribution subm_dist(sz_loc, blk); + const Distribution subm_dist(sz_loc, blk); // Local single tile column matrices representing index maps used for packing and unpacking of // communication data - matrix::Distribution index_dist(LocalElementSize(sz_loc.get(), 1), TileElementSize(blk.rows(), 1)); - Matrix ws_index(index_dist); + const SizeType nvecs = sz_loc.get(); + const Distribution index_dist(LocalElementSize(nvecs, 1), TileElementSize(blk.rows(), 1)); + Matrix local2global_index(index_dist); Matrix packing_index(index_dist); Matrix unpacking_index(index_dist); // Local matrices used for packing data for communication. Both matrices are in column-major order. // The particular constructor is used on purpose to guarantee that columns are stored contiguosly, // such that there is no padding and gaps between them. - LocalElementSize comm_sz = (C == Coord::Col) ? 
sz_loc : transposed(sz_loc); - matrix::Distribution comm_dist(comm_sz, blk); - matrix::LayoutInfo comm_layout = matrix::colMajorLayout(comm_sz, blk, comm_sz.rows()); + const LocalElementSize comm_sz = sz_loc; + const Distribution comm_dist(comm_sz, blk); + const LayoutInfo comm_layout = matrix::colMajorLayout(comm_sz, blk, comm_sz.rows()); + Matrix mat_send(comm_dist, comm_layout); Matrix mat_recv(comm_dist, comm_layout); // Initialize the unpacking index - copyLocalPartsFromGlobalIndex(i_loc_begin.get(), dist, perms, ws_index); + copyLocalPartsFromGlobalIndex(i_loc_begin.get(), dist, perms, local2global_index); auto recv_counts_sender = - initPackingIndex(nranks, i_el_begin, dist, ws_index, unpacking_index); + initPackingIndex(nranks, offset_sub, dist, local2global_index, unpacking_index) | ex::split(); // Initialize the packing index - // Here `true` is specified so that the send side matches the order of columns/rows on the receive side - copyLocalPartsFromGlobalIndex(i_loc_begin.get(), dist, inverse_perms, ws_index); - auto send_counts_sender = initPackingIndex(nranks, i_el_begin, dist, ws_index, packing_index); + // Here `true` is specified so that the send side matches the order of columns on the receive side + copyLocalPartsFromGlobalIndex(i_loc_begin.get(), dist, inverse_perms, local2global_index); + auto send_counts_sender = + initPackingIndex(nranks, offset_sub, dist, local2global_index, packing_index) | + ex::split(); // Pack local rows or columns to be sent from this rank applyPackingIndex(subm_dist, whenAllReadOnlyTilesArray(packing_index), whenAllReadOnlyTilesArray(i_loc_begin, i_loc_end, mat_in), - (C == Coord::Col) - ? whenAllReadWriteTilesArray(mat_send) - : whenAllReadWriteTilesArray(i_loc_begin, i_loc_end, mat_out)); - - // Pack data into the contiguous column-major send buffer by transposing - if constexpr (C == Coord::Row) { - transposeFromDistributedToLocalMatrix(i_loc_begin, mat_out, mat_send); - } - - // Communicate data - all2allData(sub_task_chain, nranks, sz_loc, std::move(send_counts_sender), mat_send, - std::move(recv_counts_sender), mat_recv); + whenAllReadWriteTilesArray(mat_send)); + + // Unpacking + // separate unpacking: + // - locals + // - communicated + // and then start two different tasks: + // - the first depends on mat_send instead of mat_recv (no dependency on comm) + // - the last is the same, but it has to skip the part already done for local + + // LOCAL + auto unpack_local_f = [subm_dist, rank = dist.rankIndex().get()](const auto& send_counts, + const auto& recv_counts, + const auto& index_tile_futs, + const auto& mat_in_tiles, + const auto& mat_out_tiles) { + const size_t rank_index = to_sizet(rank); + + const SizeType* perm_arr = index_tile_futs[0].get().ptr(); + const GlobalElementSize sz = subm_dist.size(); + + const int a = std::accumulate(send_counts.cbegin(), send_counts.cbegin() + rank, 0); + const int b = a + send_counts[rank_index]; + + // Note: + // These are copied directly from mat_send, while unpacking permutation applies to indices on + // the receiver side. So, we have to "align" the unpacking permutation, by applying the offset + // existing between the send and recv side. + // This is due to the fact that send and recv buffers might be "unbalanced", e.g. rank1 sends 2 + // and receive 1 with rank0, so resulting in a shift in indices between the two buffer sides, + // following previous example the local part would start at index (0-based) 2 in mat_send and + // at index 1 in mat_recv. 
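Illustrative numbers only (a hypothetical 3-rank column communicator), showing how the a/b bounds and the send/recv offset described in the note above come out:

#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  const int rank = 1;                     // middle rank of a 3-rank column communicator
  std::vector<int> send_counts{2, 1, 2};  // rank 1 keeps 1 locally, sends 2 to rank 0 and 2 to rank 2
  std::vector<int> recv_counts{1, 1, 3};  // and receives 1 from rank 0 and 3 from rank 2

  const int a = std::accumulate(send_counts.begin(), send_counts.begin() + rank, 0);    // 2
  const int b = a + send_counts[rank];                                                  // 3
  const int a_r = std::accumulate(recv_counts.begin(), recv_counts.begin() + rank, 0);  // 1
  const std::int64_t offset = a - a_r;                                                  // 1

  // The local block starts at position 2 in mat_send but at position 1 in mat_recv, so
  // recv-side indices shifted by `offset` that land in [a, b) = [2, 3) are unpacked
  // straight from mat_send; everything else waits for the Isend/Irecv exchange.
  return (a == 2 && b == 3 && a_r == 1 && offset == 1) ? 0 : 1;
}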
+ const int a_r = std::accumulate(recv_counts.cbegin(), recv_counts.cbegin() + rank, 0); + const SizeType offset = to_SizeType(a - a_r); + std::vector perm_offseted(perm_arr, perm_arr + subm_dist.size().get()); + std::transform(perm_offseted.begin(), perm_offseted.end(), perm_offseted.begin(), + [offset](const SizeType perm) { return perm + offset; }); + + // [a, b) + applyPermutationsFiltered({0, 0}, sz, 0, subm_dist, perm_offseted.data(), mat_in_tiles, + mat_out_tiles, + [a, b](SizeType i_perm) { return i_perm >= a && i_perm < b; }); + }; - // Unpack data from the contiguous column-major receive buffer by transposing - if constexpr (C == Coord::Row) { - transposeFromLocalToDistributedMatrix(i_loc_begin, mat_recv, mat_in); - } + ex::when_all(send_counts_sender, recv_counts_sender, whenAllReadOnlyTilesArray(unpacking_index), + whenAllReadOnlyTilesArray(mat_send), + whenAllReadWriteTilesArray(i_loc_begin, i_loc_end, mat_out)) | + di::transformDetach(di::Policy>(), std::move(unpack_local_f)); + + // COMMUNICATION-dependent + all2allData(sub_task_chain, nranks, sz_loc, send_counts_sender, mat_send, recv_counts_sender, + mat_recv); + + auto unpack_others_f = [subm_dist, rank = dist.rankIndex().get()](const auto& recv_counts, + const auto& index_tile_futs, + const auto& mat_in_tiles, + const auto& mat_out_tiles) { + const size_t rank_index = to_sizet(rank); + const int a = std::accumulate(recv_counts.cbegin(), recv_counts.cbegin() + rank, 0); + const int b = a + recv_counts[rank_index]; + + const SizeType* perm_arr = index_tile_futs[0].get().ptr(); + const GlobalElementSize sz = subm_dist.size(); + + // [0, a) + applyPermutationsFiltered({0, 0}, sz, 0, subm_dist, perm_arr, mat_in_tiles, mat_out_tiles, + [a](SizeType i_perm) { return i_perm < a; }); + + // [b, end) + applyPermutationsFiltered({0, 0}, sz, 0, subm_dist, perm_arr, mat_in_tiles, mat_out_tiles, + [b](SizeType i_perm) { return i_perm >= b; }); + }; - // Unpack local rows or columns received on this rank - applyPackingIndex(subm_dist, whenAllReadOnlyTilesArray(unpacking_index), - (C == Coord::Col) - ? 
whenAllReadOnlyTilesArray(mat_recv) - : whenAllReadOnlyTilesArray(i_loc_begin, i_loc_end, mat_in), - whenAllReadWriteTilesArray(i_loc_begin, i_loc_end, mat_out)); + ex::when_all(recv_counts_sender, whenAllReadOnlyTilesArray(unpacking_index), + whenAllReadOnlyTilesArray(mat_recv), + whenAllReadWriteTilesArray(i_loc_begin, i_loc_end, mat_out)) | + di::transformDetach(di::Policy>(), std::move(unpack_others_f)); } template void Permutations::call(common::Pipeline& sub_task_chain, SizeType i_begin, SizeType i_end, Matrix& perms, - Matrix& mat_in, Matrix& mat_out) { + Matrix& mat_in, Matrix& mat_out) { if constexpr (D == Device::GPU) { // This is a temporary placeholder which avoids diverging GPU API: DLAF_UNIMPLEMENTED("GPU implementation not available yet"); @@ -512,5 +637,4 @@ void Permutations::call(common::Pipeline& sub_ta permuteOnCPU(sub_task_chain, i_begin, i_end, perms, mat_in, mat_out); } } - } diff --git a/test/src/gtest_mpi_listener.cpp b/test/src/gtest_mpi_listener.cpp index a342b8582f..24754dce54 100644 --- a/test/src/gtest_mpi_listener.cpp +++ b/test/src/gtest_mpi_listener.cpp @@ -160,7 +160,8 @@ void MPIListener::OnTestEndAllRanks(const ::testing::TestInfo& test_info) const namespace internal { void mpi_send_string(const std::string& message, int to_rank) { - MPI_Send(message.c_str(), static_cast(message.size()) + 1, MPI_CHAR, to_rank, 0, MPI_COMM_WORLD); + MPI_Send(const_cast(message.c_str()), static_cast(message.size()) + 1, MPI_CHAR, to_rank, + 0, MPI_COMM_WORLD); } std::string mpi_receive_string(int from_rank) { diff --git a/test/unit/permutations/test_permutations_distributed.cpp b/test/unit/permutations/test_permutations_distributed.cpp index 9516bc9aa5..a401153927 100644 --- a/test/unit/permutations/test_permutations_distributed.cpp +++ b/test/unit/permutations/test_permutations_distributed.cpp @@ -7,15 +7,23 @@ // Please, refer to the LICENSE file in the root directory. // SPDX-License-Identifier: BSD-3-Clause // + +#include "dlaf/permutations/general.h" + +#include + #include #include +#include "dlaf/common/assert.h" #include "dlaf/matrix/matrix_mirror.h" -#include "dlaf/permutations/general.h" +#include "dlaf/types.h" #include "dlaf_test/comm_grids/grids_6_ranks.h" +#include "dlaf_test/matrix/matrix_local.h" #include "dlaf_test/matrix/util_generic_lapack.h" #include "dlaf_test/matrix/util_matrix.h" +#include "dlaf_test/matrix/util_matrix_local.h" #include "dlaf_test/util_types.h" using namespace dlaf; @@ -33,68 +41,115 @@ struct PermutationsDistTestMC : public TestWithCommGrids {}; TYPED_TEST_SUITE(PermutationsDistTestMC, RealMatrixElementTypes); -const std::vector> params = { - // n, nb, i_begin, i_end +// n, nb, i_begin, i_last, permutation +// permutation has to be defined using element indices wrt to the range described by [i_begin, i_last] +// tiles and not global indices. +// permutation[i]: mat_out[i] = mat_in[perm[i]] +using testcase_t = std::tuple>; + +// Given matrix size and blocksize, this helper converts a range defined with tile indices +// [i_begin_tile, i_end_tile) into a range defined with element indices [i_begin, i_end) +auto tileToElementRange(SizeType m, SizeType mb, SizeType i_begin_tile, SizeType i_end_tile) { + const SizeType i_begin = std::max(0, std::min(m - 1, i_begin_tile * mb)); + const SizeType i_end = std::max(0, std::min(m, i_end_tile * mb)); + return std::make_tuple(i_begin, i_end); +} + +// Helper that given a geometry generates a "mirror" permutation. 
+// A mirror permutation means that first becomes last, second becomes before-last, ... +testcase_t mirrorPermutation(SizeType m, SizeType mb, SizeType i_begin_tile, SizeType i_last_tile) { + const auto [i_begin, i_end] = tileToElementRange(m, mb, i_begin_tile, i_last_tile + 1); + + std::vector perms(to_sizet(i_end - i_begin)); + std::generate(perms.rbegin(), perms.rend(), [n = 0]() mutable { return n++; }); + + return {m, mb, i_begin_tile, i_last_tile, perms}; +} + +// Helper that just checks that given geometry and permutations are compatible. +testcase_t customPermutation(SizeType m, SizeType mb, SizeType i_begin_tile, SizeType i_last_tile, + std::vector perms) { + const auto [i_begin, i_end] = tileToElementRange(m, mb, i_begin_tile, i_last_tile + 1); + + const std::size_t nperms = to_sizet(i_end - i_begin); + DLAF_ASSERT(perms.size() == nperms, perms.size(), nperms); + return {m, mb, i_begin_tile, i_last_tile, std::move(perms)}; +} + +const std::vector params = { // simple setup for a (3, 2) process grid, - {6, 2, 0, 2}, + mirrorPermutation(6, 2, 0, 2), + customPermutation(6, 2, 0, 2, {2, 0, 1, 4, 5, 3}), // entire range of tiles is inculded - {10, 3, 0, 3}, - {17, 5, 0, 3}, + mirrorPermutation(10, 3, 0, 3), + customPermutation(10, 3, 0, 3, {0, 2, 3, 4, 6, 8, 1, 5, 7, 9}), + customPermutation(10, 3, 0, 3, {8, 9, 3, 5, 2, 7, 1, 4, 0, 6}), + mirrorPermutation(17, 5, 0, 3), // only a subset of processes participate - {10, 3, 1, 2}, + mirrorPermutation(10, 3, 1, 2), // a single tile matrix - {10, 10, 0, 0}, + mirrorPermutation(10, 10, 0, 0), // each process has multiple tiles - {31, 6, 1, 3}, - {50, 4, 1, 8}, + mirrorPermutation(31, 6, 1, 3), + mirrorPermutation(50, 4, 1, 8), }; template void testDistPermutations(comm::CommunicatorGrid grid, SizeType n, SizeType nb, SizeType i_begin, - SizeType i_end) { + SizeType i_last, std::vector perms) { const GlobalElementSize size(n, n); const TileElementSize block_size(nb, nb); - Index2D src_rank_index(std::max(0, grid.size().rows() - 1), std::min(1, grid.size().cols() - 1)); + const Index2D src_rank_index(std::max(0, grid.size().rows() - 1), std::min(1, grid.size().cols() - 1)); - Distribution dist(size, block_size, grid.size(), grid.rank(), src_rank_index); - Matrix perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1)); - Matrix mat_in_h(dist); - Matrix mat_out_h(dist); + const Distribution dist(size, block_size, grid.size(), grid.rank(), src_rank_index); - SizeType index_start = dist.globalElementFromGlobalTileAndTileElement(i_begin, 0); - SizeType index_finish = dist.globalElementFromGlobalTileAndTileElement(i_end, 0) + - dist.tileSize(GlobalTileIndex(i_end, i_end)).get(); - dlaf::matrix::util::set(perms_h, [index_start, index_finish](GlobalElementIndex i) { - if (index_start > i.row() || i.row() >= index_finish) - return SizeType(0); + const auto [index_start, index_end] = tileToElementRange(n, nb, i_begin, i_last + 1); - return index_finish - 1 - i.row(); - }); - dlaf::matrix::util::set(mat_in_h, [](GlobalElementIndex i) { - return T(i.get()) - T(i.get()) / T(8); - }); - dlaf::matrix::util::set0(pika::execution::thread_priority::normal, mat_out_h); + Matrix perms_h = [=, index_start = index_start, index_end = index_end] { + Matrix perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1)); + dlaf::matrix::util::set(perms_h, [=](GlobalElementIndex i) { + if (index_start > i.row() || i.row() >= index_end) + return SizeType(0); + + const SizeType i_window = i.row() - index_start; + return perms[to_sizet(i_window)]; + }); + return perms_h; + }(); 
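For reference, a tiny standalone check of the perms vector that mirrorPermutation(6, 2, 0, 2) (the first params entry) feeds into this window; the std::generate call over reverse iterators is the same one used in the helper:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Tiles [0, 2] with nb = 2 cover elements [0, 6), so the window holds 6 entries.
  std::vector<std::int64_t> perms(6);
  std::generate(perms.rbegin(), perms.rend(), [n = 0]() mutable { return n++; });
  for (auto p : perms)
    std::cout << p << ' ';  // prints: 5 4 3 2 1 0  (first column becomes last)
  std::cout << '\n';
}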
+ + auto value_in = [](GlobalElementIndex i) { return T(i.get()) - T(i.get()) / T(8); }; + Matrix mat_in_h = [dist, value_in]() { + Matrix mat_in_h(dist); + dlaf::matrix::util::set(mat_in_h, value_in); + return mat_in_h; + }(); + + auto value_out = [](GlobalElementIndex i) { return T(i.get()) - T(i.get()) / T(8); }; + Matrix mat_out_h(dist); + dlaf::matrix::util::set(mat_out_h, value_out); { matrix::MatrixMirror perms(perms_h); - matrix::MatrixMirror mat_in(mat_in_h); + matrix::MatrixMirror mat_in(mat_in_h); matrix::MatrixMirror mat_out(mat_out_h); - permutations::permute, D, T, C>(grid, i_begin, i_end, perms.get(), mat_in.get(), + permutations::permute, D, T, C>(grid, i_begin, i_last, perms.get(), mat_in.get(), mat_out.get()); } - auto expected_out = [i_begin, i_end, index_start, index_finish, &dist](const GlobalElementIndex i) { - GlobalTileIndex i_tile = dist.globalTileIndex(i); - if (i_begin <= i_tile.row() && i_tile.row() <= i_end && i_begin <= i_tile.col() && - i_tile.col() <= i_end) { - GlobalElementIndex i_in(i.get(), index_finish + index_start - 1 - i.get()); + auto expected_out = [=, index_start = index_start](const GlobalElementIndex& i) { + const GlobalTileIndex i_tile = dist.globalTileIndex(i); + if (i_begin <= i_tile.row() && i_tile.row() <= i_last && i_begin <= i_tile.col() && + i_tile.col() <= i_last) { + const std::size_t i_window = to_sizet(i.get() - index_start); + GlobalElementIndex i_in(i.get(), index_start + perms[i_window]); if constexpr (C == Coord::Row) i_in.transpose(); - return T(i_in.get()) - T(i_in.get()) / T(8); + + return value_in(i_in); } - return T(0); + return value_out(i); }; CHECK_MATRIX_EQ(expected_out, mat_out_h); @@ -102,8 +157,8 @@ void testDistPermutations(comm::CommunicatorGrid grid, SizeType n, SizeType nb, TYPED_TEST(PermutationsDistTestMC, Columns) { for (const auto& comm_grid : this->commGrids()) { - for (const auto& [n, nb, i_begin, i_end] : params) { - testDistPermutations(comm_grid, n, nb, i_begin, i_end); + for (const auto& [n, nb, i_begin, i_end, perms] : params) { + testDistPermutations(comm_grid, n, nb, i_begin, i_end, perms); pika::threads::get_thread_manager().wait(); } } @@ -111,8 +166,8 @@ TYPED_TEST(PermutationsDistTestMC, Columns) { TYPED_TEST(PermutationsDistTestMC, Rows) { for (const auto& comm_grid : this->commGrids()) { - for (const auto& [n, nb, i_begin, i_end] : params) { - testDistPermutations(comm_grid, n, nb, i_begin, i_end); + for (const auto& [n, nb, i_begin, i_end, perms] : params) { + testDistPermutations(comm_grid, n, nb, i_begin, i_end, perms); pika::threads::get_thread_manager().wait(); } } diff --git a/test/unit/permutations/test_permutations_local.cpp b/test/unit/permutations/test_permutations_local.cpp index a6fdc24e5f..3bfca591d5 100644 --- a/test/unit/permutations/test_permutations_local.cpp +++ b/test/unit/permutations/test_permutations_local.cpp @@ -36,29 +36,37 @@ TYPED_TEST_SUITE(PermutationsTestGPU, RealMatrixElementTypes); // reverse order into the output matrix. 
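The expected_out predicate above reduces, for Coord::Col, to: inside the window each output column equals the input column at index_start + perms[j - index_start], and columns outside the window keep their pre-set value. A 1-D toy model of that check, with plain vectors standing in for whole columns:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  using SizeType = std::int64_t;
  const SizeType index_start = 2;
  const std::vector<SizeType> perms{2, 0, 1};         // window permutation
  const std::vector<int> in{10, 11, 12, 13, 14, 15};  // one value per column
  std::vector<int> out{-1, -1, -1, -1, -1, -1};       // pre-set "value_out" pattern

  for (std::size_t j = 0; j < perms.size(); ++j)
    out[static_cast<std::size_t>(index_start) + j] =
        in[static_cast<std::size_t>(index_start + perms[j])];

  assert((out == std::vector<int>{-1, -1, 14, 12, 13, -1}));
  return 0;
}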
template void testPermutations(SizeType n, SizeType nb, SizeType i_begin, SizeType i_end) { - Matrix perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1)); - Matrix mat_in_h(LocalElementSize(n, n), TileElementSize(nb, nb)); - Matrix mat_out_h(LocalElementSize(n, n), TileElementSize(nb, nb)); - - const matrix::Distribution& distr = mat_out_h.distribution(); + const matrix::Distribution distr({n, n}, {nb, nb}); SizeType index_start = distr.globalElementFromGlobalTileAndTileElement(i_begin, 0); SizeType index_finish = distr.globalElementFromGlobalTileAndTileElement(i_end, 0) + distr.tileSize(GlobalTileIndex(i_end, i_end)).get(); - dlaf::matrix::util::set(perms_h, [index_start, index_finish](GlobalElementIndex i) { - if (index_start > i.row() || i.row() >= index_finish) - return SizeType(0); - - return index_finish - 1 - i.row(); - }); - dlaf::matrix::util::set(mat_in_h, [](GlobalElementIndex i) { - return T(i.get()) - T(i.get()) / T(8); - }); + + Matrix perms_h = [n, nb, index_start, index_finish]() { + Matrix perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1)); + dlaf::matrix::util::set(perms_h, [index_start, index_finish](GlobalElementIndex i) { + if (index_start > i.row() || i.row() >= index_finish) + return SizeType(0); + + return index_finish - 1 - i.row(); + }); + return perms_h; + }(); + + Matrix mat_in_h = [distr]() { + Matrix mat_in_h(distr); + dlaf::matrix::util::set(mat_in_h, [](GlobalElementIndex i) { + return T(i.get()) - T(i.get()) / T(8); + }); + return mat_in_h; + }(); + + Matrix mat_out_h(distr); dlaf::matrix::util::set0(pika::execution::thread_priority::normal, mat_out_h); { matrix::MatrixMirror perms(perms_h); - matrix::MatrixMirror mat_in(mat_in_h); + matrix::MatrixMirror mat_in(mat_in_h); matrix::MatrixMirror mat_out(mat_out_h); permutations::permute(i_begin, i_end, perms.get(), mat_in.get(), mat_out.get());
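The local test above converts the closed tile range [i_begin, i_end] into the element range [index_start, index_finish) by adding the size of the last tile, which may be shorter than nb. The same arithmetic as a standalone sketch, assuming a plain blocked layout and the (n = 10, nb = 3, tiles [1, 2]) geometry that also appears in the distributed parameter list:

#include <algorithm>
#include <cstdint>

using SizeType = std::int64_t;

// Size of tile i_tile in a blocked layout where only the last tile may be short.
SizeType tile_size(SizeType n, SizeType nb, SizeType i_tile) {
  return std::min(nb, n - i_tile * nb);
}

int main() {
  const SizeType n = 10, nb = 3, i_begin = 1, i_end = 2;               // closed tile range [1, 2]
  const SizeType index_start = i_begin * nb;                           // 3
  const SizeType index_finish = i_end * nb + tile_size(n, nb, i_end);  // 6 + 3 = 9
  return (index_start == 3 && index_finish == 9) ? 0 : 1;
}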