Permute Distributed: migrate from AllToAll to Send/Recv (#835)
rasolca committed Apr 19, 2023
1 parent 1cf78cc commit ed4193e
Showing 7 changed files with 392 additions and 206 deletions.
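For context, the sketch below is a minimal, hypothetical MPI program; it is not code from this commit or from DLA-Future, and the sizes, tags and mirror permutation are illustrative assumptions. It shows the communication pattern the commit migrates to: instead of packing every exchanged block into a single collective MPI_Alltoallv, each rank posts one MPI_Irecv per output element it owns and one MPI_Isend per input element that some rank needs.

// Hypothetical standalone sketch (not DLA-Future code): permute a block-distributed
// vector with point-to-point MPI_Isend/MPI_Irecv instead of a single MPI_Alltoallv.
#include <cstdio>
#include <vector>
#include <mpi.h>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0, nranks = 1;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  const int nb = 2;           // elements owned by each rank
  const int n = nb * nranks;  // global problem size

  // Mirror permutation: out[i] = in[perm[i]] with perm[i] = n - 1 - i.
  std::vector<int> perm(n);
  for (int i = 0; i < n; ++i)
    perm[i] = n - 1 - i;

  // Local input holds a recognizable value derived from the global element index.
  std::vector<double> in(nb), out(nb, -1.0);
  for (int j = 0; j < nb; ++j)
    in[j] = 100.0 + rank * nb + j;

  std::vector<MPI_Request> reqs;
  reqs.reserve(2 * nb);

  // One receive per local output element: out[j] is global element perm[rank * nb + j],
  // owned by rank perm[rank * nb + j] / nb; the global output index serves as the tag.
  for (int j = 0; j < nb; ++j) {
    const int i_out = rank * nb + j;
    MPI_Request r;
    MPI_Irecv(&out[j], 1, MPI_DOUBLE, perm[i_out] / nb, i_out, MPI_COMM_WORLD, &r);
    reqs.push_back(r);
  }

  // One send per locally owned input element that some output position needs.
  for (int i = 0; i < n; ++i) {
    if (perm[i] / nb == rank) {
      MPI_Request r;
      MPI_Isend(&in[perm[i] % nb], 1, MPI_DOUBLE, i / nb, i, MPI_COMM_WORLD, &r);
      reqs.push_back(r);
    }
  }

  MPI_Waitall(static_cast<int>(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE);

  for (int j = 0; j < nb; ++j)
    std::printf("rank %d: out[%d] = %.0f\n", rank, j, out[j]);

  MPI_Finalize();
  return 0;
}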
2 changes: 0 additions & 2 deletions include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -35,8 +35,6 @@
#include "dlaf/types.h"
#include "dlaf/util_matrix.h"

#include "dlaf/matrix/print_csv.h"

namespace dlaf::eigensolver::internal {

// Auxiliary matrix and vectors used for the D&C algorithm
10 changes: 5 additions & 5 deletions include/dlaf/permutations/general.h
@@ -34,8 +34,8 @@ namespace dlaf::permutations {
/// the closed range [i_begin,i_end] are accessed in write-only mode.
///
template <Backend B, Device D, class T, Coord coord>
void permute(SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms, Matrix<T, D>& mat_in,
Matrix<T, D>& mat_out) {
void permute(SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms,
Matrix<const T, D>& mat_in, Matrix<T, D>& mat_out) {
const matrix::Distribution& distr_perms = perms.distribution();
const matrix::Distribution& distr_in = mat_in.distribution();
const matrix::Distribution& distr_out = mat_out.distribution();
@@ -82,8 +82,8 @@ void permute(SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms,
///
template <Backend B, Device D, class T, Coord coord>
void permute(comm::CommunicatorGrid grid, common::Pipeline<comm::Communicator>& sub_task_chain,
SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms, Matrix<T, D>& mat_in,
Matrix<T, D>& mat_out) {
SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms,
Matrix<const T, D>& mat_in, Matrix<T, D>& mat_out) {
const matrix::Distribution& distr_perms = perms.distribution();
const matrix::Distribution& distr_in = mat_in.distribution();

@@ -110,7 +110,7 @@ void permute(comm::CommunicatorGrid grid, common::Pipeline<comm::Communicator>&
///
template <Backend B, Device D, class T, Coord coord>
void permute(comm::CommunicatorGrid grid, SizeType i_begin, SizeType i_end,
Matrix<const SizeType, D>& perms, Matrix<T, D>& mat_in, Matrix<T, D>& mat_out) {
Matrix<const SizeType, D>& perms, Matrix<const T, D>& mat_in, Matrix<T, D>& mat_out) {
common::Pipeline<comm::Communicator> sub_task_chain(grid.subCommunicator(orthogonal(coord)).clone());
permute<B, D, T, coord>(grid, sub_task_chain, i_begin, i_end, perms, mat_in, mat_out);
}
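For orientation, a hedged, hypothetical call-site sketch of the updated distributed overload follows; it is not taken from the repository's examples, uses only names visible in this commit's diffs, and assumes that MPI/pika initialization, the CommunicatorGrid and the matrices are set up elsewhere (e.g. as in the tests below).

#include "dlaf/permutations/general.h"

// Within the inclusive tile range [i_begin, i_last], output column i receives input column
// perms[i] (indices relative to the window start); tiles outside the range are not modified.
template <dlaf::Backend B, dlaf::Device D, class T>
void apply_column_permutation(dlaf::comm::CommunicatorGrid grid, dlaf::SizeType i_begin,
                              dlaf::SizeType i_last, dlaf::Matrix<const dlaf::SizeType, D>& perms,
                              dlaf::Matrix<const T, D>& mat_in, dlaf::Matrix<T, D>& mat_out) {
  dlaf::permutations::permute<B, D, T, dlaf::Coord::Col>(grid, i_begin, i_last, perms, mat_in,
                                                         mat_out);
}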
4 changes: 2 additions & 2 deletions include/dlaf/permutations/general/api.h
@@ -21,9 +21,9 @@ namespace dlaf::permutations::internal {
template <Backend B, Device D, class T, Coord coord>
struct Permutations {
static void call(SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms,
Matrix<T, D>& mat_in, Matrix<T, D>& mat_out);
Matrix<const T, D>& mat_in, Matrix<T, D>& mat_out);
static void call(common::Pipeline<comm::Communicator>& sub_task_chain, SizeType i_begin,
SizeType i_end, Matrix<const SizeType, D>& perms, Matrix<T, D>& mat_in,
SizeType i_end, Matrix<const SizeType, D>& perms, Matrix<const T, D>& mat_in,
Matrix<T, D>& mat_out);
};

404 changes: 264 additions & 140 deletions include/dlaf/permutations/general/impl.h

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion test/src/gtest_mpi_listener.cpp
@@ -160,7 +160,8 @@ void MPIListener::OnTestEndAllRanks(const ::testing::TestInfo& test_info) const

namespace internal {
void mpi_send_string(const std::string& message, int to_rank) {
MPI_Send(message.c_str(), static_cast<int>(message.size()) + 1, MPI_CHAR, to_rank, 0, MPI_COMM_WORLD);
MPI_Send(const_cast<char*>(message.c_str()), static_cast<int>(message.size()) + 1, MPI_CHAR, to_rank,
0, MPI_COMM_WORLD);
}

std::string mpi_receive_string(int from_rank) {
137 changes: 96 additions & 41 deletions test/unit/permutations/test_permutations_distributed.cpp
@@ -7,15 +7,23 @@
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//

#include "dlaf/permutations/general.h"

#include <algorithm>

#include <gtest/gtest.h>
#include <pika/runtime.hpp>

#include "dlaf/common/assert.h"
#include "dlaf/matrix/matrix_mirror.h"
#include "dlaf/permutations/general.h"
#include "dlaf/types.h"

#include "dlaf_test/comm_grids/grids_6_ranks.h"
#include "dlaf_test/matrix/matrix_local.h"
#include "dlaf_test/matrix/util_generic_lapack.h"
#include "dlaf_test/matrix/util_matrix.h"
#include "dlaf_test/matrix/util_matrix_local.h"
#include "dlaf_test/util_types.h"

using namespace dlaf;
@@ -33,86 +41,133 @@ struct PermutationsDistTestMC : public TestWithCommGrids {};

TYPED_TEST_SUITE(PermutationsDistTestMC, RealMatrixElementTypes);

const std::vector<std::tuple<SizeType, SizeType, SizeType, SizeType>> params = {
// n, nb, i_begin, i_end
// n, nb, i_begin, i_last, permutation
// permutation has to be defined using element indices relative to the range of tiles
// described by [i_begin, i_last], not using global indices.
// permutation[i]: mat_out[i] = mat_in[perm[i]]
using testcase_t = std::tuple<SizeType, SizeType, SizeType, SizeType, std::vector<SizeType>>;

// Given matrix size and blocksize, this helper converts a range defined with tile indices
// [i_begin_tile, i_end_tile) into a range defined with element indices [i_begin, i_end)
auto tileToElementRange(SizeType m, SizeType mb, SizeType i_begin_tile, SizeType i_end_tile) {
const SizeType i_begin = std::max<SizeType>(0, std::min<SizeType>(m - 1, i_begin_tile * mb));
const SizeType i_end = std::max<SizeType>(0, std::min<SizeType>(m, i_end_tile * mb));
return std::make_tuple(i_begin, i_end);
}

// Helper that, given a geometry, generates a "mirror" permutation.
// A mirror permutation means that the first element becomes the last, the second becomes the
// second-to-last, ...
testcase_t mirrorPermutation(SizeType m, SizeType mb, SizeType i_begin_tile, SizeType i_last_tile) {
const auto [i_begin, i_end] = tileToElementRange(m, mb, i_begin_tile, i_last_tile + 1);

std::vector<SizeType> perms(to_sizet(i_end - i_begin));
std::generate(perms.rbegin(), perms.rend(), [n = 0]() mutable { return n++; });

return {m, mb, i_begin_tile, i_last_tile, perms};
}
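As a quick, hypothetical standalone check (not part of this commit): for the first test case below, mirrorPermutation(6, 2, 0, 2), the window spans elements [0, 6) and the reversed std::generate fill above yields the mirror index map.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Same fill as in mirrorPermutation for m = 6, mb = 2, tiles [0, 2]: a 6-element window.
  std::vector<long> perms(6);
  std::generate(perms.rbegin(), perms.rend(), [n = 0L]() mutable { return n++; });
  // Prints "5 4 3 2 1 0": within the window, mat_out[i] = mat_in[5 - i].
  for (long p : perms)
    std::printf("%ld ", p);
  std::printf("\n");
  return 0;
}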

// Helper that checks that the given geometry and permutation are compatible.
testcase_t customPermutation(SizeType m, SizeType mb, SizeType i_begin_tile, SizeType i_last_tile,
std::vector<SizeType> perms) {
const auto [i_begin, i_end] = tileToElementRange(m, mb, i_begin_tile, i_last_tile + 1);

const std::size_t nperms = to_sizet(i_end - i_begin);
DLAF_ASSERT(perms.size() == nperms, perms.size(), nperms);

return {m, mb, i_begin_tile, i_last_tile, std::move(perms)};
}

const std::vector<testcase_t> params = {
// simple setup for a (3, 2) process grid,
{6, 2, 0, 2},
mirrorPermutation(6, 2, 0, 2),
customPermutation(6, 2, 0, 2, {2, 0, 1, 4, 5, 3}),
// entire range of tiles is included
{10, 3, 0, 3},
{17, 5, 0, 3},
mirrorPermutation(10, 3, 0, 3),
customPermutation(10, 3, 0, 3, {0, 2, 3, 4, 6, 8, 1, 5, 7, 9}),
customPermutation(10, 3, 0, 3, {8, 9, 3, 5, 2, 7, 1, 4, 0, 6}),
mirrorPermutation(17, 5, 0, 3),
// only a subset of processes participate
{10, 3, 1, 2},
mirrorPermutation(10, 3, 1, 2),
// a single tile matrix
{10, 10, 0, 0},
mirrorPermutation(10, 10, 0, 0),
// each process has multiple tiles
{31, 6, 1, 3},
{50, 4, 1, 8},
mirrorPermutation(31, 6, 1, 3),
mirrorPermutation(50, 4, 1, 8),
};

template <class T, Device D, Coord C>
void testDistPermutations(comm::CommunicatorGrid grid, SizeType n, SizeType nb, SizeType i_begin,
SizeType i_end) {
SizeType i_last, std::vector<SizeType> perms) {
const GlobalElementSize size(n, n);
const TileElementSize block_size(nb, nb);
Index2D src_rank_index(std::max(0, grid.size().rows() - 1), std::min(1, grid.size().cols() - 1));
const Index2D src_rank_index(std::max(0, grid.size().rows() - 1), std::min(1, grid.size().cols() - 1));

Distribution dist(size, block_size, grid.size(), grid.rank(), src_rank_index);
Matrix<SizeType, Device::CPU> perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1));
Matrix<T, Device::CPU> mat_in_h(dist);
Matrix<T, Device::CPU> mat_out_h(dist);
const Distribution dist(size, block_size, grid.size(), grid.rank(), src_rank_index);

SizeType index_start = dist.globalElementFromGlobalTileAndTileElement<C>(i_begin, 0);
SizeType index_finish = dist.globalElementFromGlobalTileAndTileElement<C>(i_end, 0) +
dist.tileSize(GlobalTileIndex(i_end, i_end)).get<C>();
dlaf::matrix::util::set(perms_h, [index_start, index_finish](GlobalElementIndex i) {
if (index_start > i.row() || i.row() >= index_finish)
return SizeType(0);
const auto [index_start, index_end] = tileToElementRange(n, nb, i_begin, i_last + 1);

return index_finish - 1 - i.row();
});
dlaf::matrix::util::set(mat_in_h, [](GlobalElementIndex i) {
return T(i.get<C>()) - T(i.get<orthogonal(C)>()) / T(8);
});
dlaf::matrix::util::set0<Backend::MC>(pika::execution::thread_priority::normal, mat_out_h);
Matrix<const SizeType, Device::CPU> perms_h = [=, index_start = index_start, index_end = index_end] {
Matrix<SizeType, Device::CPU> perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1));
dlaf::matrix::util::set(perms_h, [=](GlobalElementIndex i) {
if (index_start > i.row() || i.row() >= index_end)
return SizeType(0);

const SizeType i_window = i.row() - index_start;
return perms[to_sizet(i_window)];
});
return perms_h;
}();

auto value_in = [](GlobalElementIndex i) { return T(i.get<C>()) - T(i.get<orthogonal(C)>()) / T(8); };
Matrix<const T, Device::CPU> mat_in_h = [dist, value_in]() {
Matrix<T, Device::CPU> mat_in_h(dist);
dlaf::matrix::util::set(mat_in_h, value_in);
return mat_in_h;
}();

auto value_out = [](GlobalElementIndex i) { return T(i.get<orthogonal(C)>()) - T(i.get<C>()) / T(8); };
Matrix<T, Device::CPU> mat_out_h(dist);
dlaf::matrix::util::set(mat_out_h, value_out);

{
matrix::MatrixMirror<const SizeType, D, Device::CPU> perms(perms_h);
matrix::MatrixMirror<T, D, Device::CPU> mat_in(mat_in_h);
matrix::MatrixMirror<const T, D, Device::CPU> mat_in(mat_in_h);
matrix::MatrixMirror<T, D, Device::CPU> mat_out(mat_out_h);

permutations::permute<DefaultBackend_v<D>, D, T, C>(grid, i_begin, i_end, perms.get(), mat_in.get(),
permutations::permute<DefaultBackend_v<D>, D, T, C>(grid, i_begin, i_last, perms.get(), mat_in.get(),
mat_out.get());
}

auto expected_out = [i_begin, i_end, index_start, index_finish, &dist](const GlobalElementIndex i) {
GlobalTileIndex i_tile = dist.globalTileIndex(i);
if (i_begin <= i_tile.row() && i_tile.row() <= i_end && i_begin <= i_tile.col() &&
i_tile.col() <= i_end) {
GlobalElementIndex i_in(i.get<orthogonal(C)>(), index_finish + index_start - 1 - i.get<C>());
auto expected_out = [=, index_start = index_start](const GlobalElementIndex& i) {
const GlobalTileIndex i_tile = dist.globalTileIndex(i);
if (i_begin <= i_tile.row() && i_tile.row() <= i_last && i_begin <= i_tile.col() &&
i_tile.col() <= i_last) {
const std::size_t i_window = to_sizet(i.get<C>() - index_start);
GlobalElementIndex i_in(i.get<orthogonal(C)>(), index_start + perms[i_window]);
if constexpr (C == Coord::Row)
i_in.transpose();
return T(i_in.get<C>()) - T(i_in.get<orthogonal(C)>()) / T(8);

return value_in(i_in);
}
return T(0);
return value_out(i);
};

CHECK_MATRIX_EQ(expected_out, mat_out_h);
}

TYPED_TEST(PermutationsDistTestMC, Columns) {
for (const auto& comm_grid : this->commGrids()) {
for (const auto& [n, nb, i_begin, i_end] : params) {
testDistPermutations<TypeParam, Device::CPU, Coord::Col>(comm_grid, n, nb, i_begin, i_end);
for (const auto& [n, nb, i_begin, i_end, perms] : params) {
testDistPermutations<TypeParam, Device::CPU, Coord::Col>(comm_grid, n, nb, i_begin, i_end, perms);
pika::threads::get_thread_manager().wait();
}
}
}

TYPED_TEST(PermutationsDistTestMC, Rows) {
for (const auto& comm_grid : this->commGrids()) {
for (const auto& [n, nb, i_begin, i_end] : params) {
testDistPermutations<TypeParam, Device::CPU, Coord::Row>(comm_grid, n, nb, i_begin, i_end);
for (const auto& [n, nb, i_begin, i_end, perms] : params) {
testDistPermutations<TypeParam, Device::CPU, Coord::Row>(comm_grid, n, nb, i_begin, i_end, perms);
pika::threads::get_thread_manager().wait();
}
}
38 changes: 23 additions & 15 deletions test/unit/permutations/test_permutations_local.cpp
@@ -36,29 +36,37 @@ TYPED_TEST_SUITE(PermutationsTestGPU, RealMatrixElementTypes);
// reverse order into the output matrix.
template <Backend B, Device D, class T, Coord C>
void testPermutations(SizeType n, SizeType nb, SizeType i_begin, SizeType i_end) {
Matrix<SizeType, Device::CPU> perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1));
Matrix<T, Device::CPU> mat_in_h(LocalElementSize(n, n), TileElementSize(nb, nb));
Matrix<T, Device::CPU> mat_out_h(LocalElementSize(n, n), TileElementSize(nb, nb));

const matrix::Distribution& distr = mat_out_h.distribution();
const matrix::Distribution distr({n, n}, {nb, nb});

SizeType index_start = distr.globalElementFromGlobalTileAndTileElement<C>(i_begin, 0);
SizeType index_finish = distr.globalElementFromGlobalTileAndTileElement<C>(i_end, 0) +
distr.tileSize(GlobalTileIndex(i_end, i_end)).get<C>();
dlaf::matrix::util::set(perms_h, [index_start, index_finish](GlobalElementIndex i) {
if (index_start > i.row() || i.row() >= index_finish)
return SizeType(0);

return index_finish - 1 - i.row();
});
dlaf::matrix::util::set(mat_in_h, [](GlobalElementIndex i) {
return T(i.get<C>()) - T(i.get<orthogonal(C)>()) / T(8);
});

Matrix<const SizeType, Device::CPU> perms_h = [n, nb, index_start, index_finish]() {
Matrix<SizeType, Device::CPU> perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1));
dlaf::matrix::util::set(perms_h, [index_start, index_finish](GlobalElementIndex i) {
if (index_start > i.row() || i.row() >= index_finish)
return SizeType(0);

return index_finish - 1 - i.row();
});
return perms_h;
}();

Matrix<T, Device::CPU> mat_in_h = [distr]() {
Matrix<T, Device::CPU> mat_in_h(distr);
dlaf::matrix::util::set(mat_in_h, [](GlobalElementIndex i) {
return T(i.get<C>()) - T(i.get<orthogonal(C)>()) / T(8);
});
return mat_in_h;
}();

Matrix<T, Device::CPU> mat_out_h(distr);
dlaf::matrix::util::set0<Backend::MC>(pika::execution::thread_priority::normal, mat_out_h);

{
matrix::MatrixMirror<const SizeType, D, Device::CPU> perms(perms_h);
matrix::MatrixMirror<T, D, Device::CPU> mat_in(mat_in_h);
matrix::MatrixMirror<const T, D, Device::CPU> mat_in(mat_in_h);
matrix::MatrixMirror<T, D, Device::CPU> mat_out(mat_out_h);

permutations::permute<B, D, T, C>(i_begin, i_end, perms.get(), mat_in.get(), mat_out.get());
