Permute Distributed: migrate from AllToAll to Send/Recv (#835)
rasolca committed Apr 19, 2023
1 parent 1cf78cc commit ed4193e
Showing 7 changed files with 392 additions and 206 deletions.
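For context, the sketch below is a minimal, hypothetical MPI program; it is not code from this commit or from DLA-Future, and the sizes, tags and mirror permutation are illustrative assumptions. It shows the communication pattern the commit migrates to: instead of packing every exchanged block into a single collective MPI_Alltoallv, each rank posts one MPI_Irecv per output element it owns and one MPI_Isend per input element that some rank needs.

// Hypothetical standalone sketch (not DLA-Future code): permute a block-distributed
// vector with point-to-point MPI_Isend/MPI_Irecv instead of a single MPI_Alltoallv.
#include <cstdio>
#include <vector>
#include <mpi.h>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0, nranks = 1;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  const int nb = 2;           // elements owned by each rank
  const int n = nb * nranks;  // global problem size

  // Mirror permutation: out[i] = in[perm[i]] with perm[i] = n - 1 - i.
  std::vector<int> perm(n);
  for (int i = 0; i < n; ++i)
    perm[i] = n - 1 - i;

  // Local input holds a recognizable value derived from the global element index.
  std::vector<double> in(nb), out(nb, -1.0);
  for (int j = 0; j < nb; ++j)
    in[j] = 100.0 + rank * nb + j;

  std::vector<MPI_Request> reqs;
  reqs.reserve(2 * nb);

  // One receive per local output element: out[j] is global element perm[rank * nb + j],
  // owned by rank perm[rank * nb + j] / nb; the global output index serves as the tag.
  for (int j = 0; j < nb; ++j) {
    const int i_out = rank * nb + j;
    MPI_Request r;
    MPI_Irecv(&out[j], 1, MPI_DOUBLE, perm[i_out] / nb, i_out, MPI_COMM_WORLD, &r);
    reqs.push_back(r);
  }

  // One send per locally owned input element that some output position needs.
  for (int i = 0; i < n; ++i) {
    if (perm[i] / nb == rank) {
      MPI_Request r;
      MPI_Isend(&in[perm[i] % nb], 1, MPI_DOUBLE, i / nb, i, MPI_COMM_WORLD, &r);
      reqs.push_back(r);
    }
  }

  MPI_Waitall(static_cast<int>(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE);

  for (int j = 0; j < nb; ++j)
    std::printf("rank %d: out[%d] = %.0f\n", rank, j, out[j]);

  MPI_Finalize();
  return 0;
}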
2 changes: 0 additions & 2 deletions include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -35,8 +35,6 @@
#include "dlaf/types.h"
#include "dlaf/util_matrix.h"

#include "dlaf/matrix/print_csv.h"

namespace dlaf::eigensolver::internal {

// Auxiliary matrix and vectors used for the D&C algorithm
10 changes: 5 additions & 5 deletions include/dlaf/permutations/general.h
@@ -34,8 +34,8 @@ namespace dlaf::permutations {
/// the closed range [i_begin,i_end] are accessed in write-only mode.
///
template <Backend B, Device D, class T, Coord coord>
void permute(SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms, Matrix<T, D>& mat_in,
Matrix<T, D>& mat_out) {
void permute(SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms,
Matrix<const T, D>& mat_in, Matrix<T, D>& mat_out) {
const matrix::Distribution& distr_perms = perms.distribution();
const matrix::Distribution& distr_in = mat_in.distribution();
const matrix::Distribution& distr_out = mat_out.distribution();
@@ -82,8 +82,8 @@ void permute(SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms,
///
template <Backend B, Device D, class T, Coord coord>
void permute(comm::CommunicatorGrid grid, common::Pipeline<comm::Communicator>& sub_task_chain,
SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms, Matrix<T, D>& mat_in,
Matrix<T, D>& mat_out) {
SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms,
Matrix<const T, D>& mat_in, Matrix<T, D>& mat_out) {
const matrix::Distribution& distr_perms = perms.distribution();
const matrix::Distribution& distr_in = mat_in.distribution();

@@ -110,7 +110,7 @@ void permute(comm::CommunicatorGrid grid, common::Pipeline<comm::Communicator>&
///
template <Backend B, Device D, class T, Coord coord>
void permute(comm::CommunicatorGrid grid, SizeType i_begin, SizeType i_end,
Matrix<const SizeType, D>& perms, Matrix<T, D>& mat_in, Matrix<T, D>& mat_out) {
Matrix<const SizeType, D>& perms, Matrix<const T, D>& mat_in, Matrix<T, D>& mat_out) {
common::Pipeline<comm::Communicator> sub_task_chain(grid.subCommunicator(orthogonal(coord)).clone());
permute<B, D, T, coord>(grid, sub_task_chain, i_begin, i_end, perms, mat_in, mat_out);
}
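For orientation, a hedged, hypothetical call-site sketch of the updated distributed overload follows; it is not taken from the repository's examples, uses only names visible in this commit's diffs, and assumes that MPI/pika initialization, the CommunicatorGrid and the matrices are set up elsewhere (e.g. as in the tests below).

#include "dlaf/permutations/general.h"

// Within the inclusive tile range [i_begin, i_last], output column i receives input column
// perms[i] (indices relative to the window start); tiles outside the range are not modified.
template <dlaf::Backend B, dlaf::Device D, class T>
void apply_column_permutation(dlaf::comm::CommunicatorGrid grid, dlaf::SizeType i_begin,
                              dlaf::SizeType i_last, dlaf::Matrix<const dlaf::SizeType, D>& perms,
                              dlaf::Matrix<const T, D>& mat_in, dlaf::Matrix<T, D>& mat_out) {
  dlaf::permutations::permute<B, D, T, dlaf::Coord::Col>(grid, i_begin, i_last, perms, mat_in,
                                                         mat_out);
}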
4 changes: 2 additions & 2 deletions include/dlaf/permutations/general/api.h
@@ -21,9 +21,9 @@ namespace dlaf::permutations::internal {
template <Backend B, Device D, class T, Coord coord>
struct Permutations {
static void call(SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms,
Matrix<T, D>& mat_in, Matrix<T, D>& mat_out);
Matrix<const T, D>& mat_in, Matrix<T, D>& mat_out);
static void call(common::Pipeline<comm::Communicator>& sub_task_chain, SizeType i_begin,
SizeType i_end, Matrix<const SizeType, D>& perms, Matrix<T, D>& mat_in,
SizeType i_end, Matrix<const SizeType, D>& perms, Matrix<const T, D>& mat_in,
Matrix<T, D>& mat_out);
};

404 changes: 264 additions & 140 deletions include/dlaf/permutations/general/impl.h

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion test/src/gtest_mpi_listener.cpp
@@ -160,7 +160,8 @@ void MPIListener::OnTestEndAllRanks(const ::testing::TestInfo& test_info) const

namespace internal {
void mpi_send_string(const std::string& message, int to_rank) {
MPI_Send(message.c_str(), static_cast<int>(message.size()) + 1, MPI_CHAR, to_rank, 0, MPI_COMM_WORLD);
MPI_Send(const_cast<char*>(message.c_str()), static_cast<int>(message.size()) + 1, MPI_CHAR, to_rank,
0, MPI_COMM_WORLD);
}

std::string mpi_receive_string(int from_rank) {
137 changes: 96 additions & 41 deletions test/unit/permutations/test_permutations_distributed.cpp
@@ -7,15 +7,23 @@
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//

#include "dlaf/permutations/general.h"

#include <algorithm>

#include <gtest/gtest.h>
#include <pika/runtime.hpp>

#include "dlaf/common/assert.h"
#include "dlaf/matrix/matrix_mirror.h"
#include "dlaf/permutations/general.h"
#include "dlaf/types.h"

#include "dlaf_test/comm_grids/grids_6_ranks.h"
#include "dlaf_test/matrix/matrix_local.h"
#include "dlaf_test/matrix/util_generic_lapack.h"
#include "dlaf_test/matrix/util_matrix.h"
#include "dlaf_test/matrix/util_matrix_local.h"
#include "dlaf_test/util_types.h"

using namespace dlaf;
@@ -33,86 +41,133 @@ struct PermutationsDistTestMC : public TestWithCommGrids {};

TYPED_TEST_SUITE(PermutationsDistTestMC, RealMatrixElementTypes);

const std::vector<std::tuple<SizeType, SizeType, SizeType, SizeType>> params = {
// n, nb, i_begin, i_end
// n, nb, i_begin, i_last, permutation
// permutation has to be defined using element indices relative to the range of tiles
// described by [i_begin, i_last], not using global indices.
// permutation[i]: mat_out[i] = mat_in[perm[i]]
using testcase_t = std::tuple<SizeType, SizeType, SizeType, SizeType, std::vector<SizeType>>;

// Given matrix size and blocksize, this helper converts a range defined with tile indices
// [i_begin_tile, i_end_tile) into a range defined with element indices [i_begin, i_end)
auto tileToElementRange(SizeType m, SizeType mb, SizeType i_begin_tile, SizeType i_end_tile) {
const SizeType i_begin = std::max<SizeType>(0, std::min<SizeType>(m - 1, i_begin_tile * mb));
const SizeType i_end = std::max<SizeType>(0, std::min<SizeType>(m, i_end_tile * mb));
return std::make_tuple(i_begin, i_end);
}

// Helper that, given a geometry, generates a "mirror" permutation.
// A mirror permutation means that the first element becomes the last, the second becomes the
// second-to-last, ...
testcase_t mirrorPermutation(SizeType m, SizeType mb, SizeType i_begin_tile, SizeType i_last_tile) {
const auto [i_begin, i_end] = tileToElementRange(m, mb, i_begin_tile, i_last_tile + 1);

std::vector<SizeType> perms(to_sizet(i_end - i_begin));
std::generate(perms.rbegin(), perms.rend(), [n = 0]() mutable { return n++; });

return {m, mb, i_begin_tile, i_last_tile, perms};
}
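As a quick, hypothetical standalone check (not part of this commit): for the first test case below, mirrorPermutation(6, 2, 0, 2), the window spans elements [0, 6) and the reversed std::generate fill above yields the mirror index map.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Same fill as in mirrorPermutation for m = 6, mb = 2, tiles [0, 2]: a 6-element window.
  std::vector<long> perms(6);
  std::generate(perms.rbegin(), perms.rend(), [n = 0L]() mutable { return n++; });
  // Prints "5 4 3 2 1 0": within the window, mat_out[i] = mat_in[5 - i].
  for (long p : perms)
    std::printf("%ld ", p);
  std::printf("\n");
  return 0;
}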

// Helper that checks that the given geometry and permutation are compatible.
testcase_t customPermutation(SizeType m, SizeType mb, SizeType i_begin_tile, SizeType i_last_tile,
std::vector<SizeType> perms) {
const auto [i_begin, i_end] = tileToElementRange(m, mb, i_begin_tile, i_last_tile + 1);

const std::size_t nperms = to_sizet(i_end - i_begin);
DLAF_ASSERT(perms.size() == nperms, perms.size(), nperms);

return {m, mb, i_begin_tile, i_last_tile, std::move(perms)};
}

const std::vector<testcase_t> params = {
// simple setup for a (3, 2) process grid,
{6, 2, 0, 2},
mirrorPermutation(6, 2, 0, 2),
customPermutation(6, 2, 0, 2, {2, 0, 1, 4, 5, 3}),
// entire range of tiles is included
{10, 3, 0, 3},
{17, 5, 0, 3},
mirrorPermutation(10, 3, 0, 3),
customPermutation(10, 3, 0, 3, {0, 2, 3, 4, 6, 8, 1, 5, 7, 9}),
customPermutation(10, 3, 0, 3, {8, 9, 3, 5, 2, 7, 1, 4, 0, 6}),
mirrorPermutation(17, 5, 0, 3),
// only a subset of processes participate
{10, 3, 1, 2},
mirrorPermutation(10, 3, 1, 2),
// a single tile matrix
{10, 10, 0, 0},
mirrorPermutation(10, 10, 0, 0),
// each process has multiple tiles
{31, 6, 1, 3},
{50, 4, 1, 8},
mirrorPermutation(31, 6, 1, 3),
mirrorPermutation(50, 4, 1, 8),
};

template <class T, Device D, Coord C>
void testDistPermutations(comm::CommunicatorGrid grid, SizeType n, SizeType nb, SizeType i_begin,
SizeType i_end) {
SizeType i_last, std::vector<SizeType> perms) {
const GlobalElementSize size(n, n);
const TileElementSize block_size(nb, nb);
Index2D src_rank_index(std::max(0, grid.size().rows() - 1), std::min(1, grid.size().cols() - 1));
const Index2D src_rank_index(std::max(0, grid.size().rows() - 1), std::min(1, grid.size().cols() - 1));

Distribution dist(size, block_size, grid.size(), grid.rank(), src_rank_index);
Matrix<SizeType, Device::CPU> perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1));
Matrix<T, Device::CPU> mat_in_h(dist);
Matrix<T, Device::CPU> mat_out_h(dist);
const Distribution dist(size, block_size, grid.size(), grid.rank(), src_rank_index);

SizeType index_start = dist.globalElementFromGlobalTileAndTileElement<C>(i_begin, 0);
SizeType index_finish = dist.globalElementFromGlobalTileAndTileElement<C>(i_end, 0) +
dist.tileSize(GlobalTileIndex(i_end, i_end)).get<C>();
dlaf::matrix::util::set(perms_h, [index_start, index_finish](GlobalElementIndex i) {
if (index_start > i.row() || i.row() >= index_finish)
return SizeType(0);
const auto [index_start, index_end] = tileToElementRange(n, nb, i_begin, i_last + 1);

return index_finish - 1 - i.row();
});
dlaf::matrix::util::set(mat_in_h, [](GlobalElementIndex i) {
return T(i.get<C>()) - T(i.get<orthogonal(C)>()) / T(8);
});
dlaf::matrix::util::set0<Backend::MC>(pika::execution::thread_priority::normal, mat_out_h);
Matrix<const SizeType, Device::CPU> perms_h = [=, index_start = index_start, index_end = index_end] {
Matrix<SizeType, Device::CPU> perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1));
dlaf::matrix::util::set(perms_h, [=](GlobalElementIndex i) {
if (index_start > i.row() || i.row() >= index_end)
return SizeType(0);

const SizeType i_window = i.row() - index_start;
return perms[to_sizet(i_window)];
});
return perms_h;
}();

auto value_in = [](GlobalElementIndex i) { return T(i.get<C>()) - T(i.get<orthogonal(C)>()) / T(8); };
Matrix<const T, Device::CPU> mat_in_h = [dist, value_in]() {
Matrix<T, Device::CPU> mat_in_h(dist);
dlaf::matrix::util::set(mat_in_h, value_in);
return mat_in_h;
}();

auto value_out = [](GlobalElementIndex i) { return T(i.get<orthogonal(C)>()) - T(i.get<C>()) / T(8); };
Matrix<T, Device::CPU> mat_out_h(dist);
dlaf::matrix::util::set(mat_out_h, value_out);

{
matrix::MatrixMirror<const SizeType, D, Device::CPU> perms(perms_h);
matrix::MatrixMirror<T, D, Device::CPU> mat_in(mat_in_h);
matrix::MatrixMirror<const T, D, Device::CPU> mat_in(mat_in_h);
matrix::MatrixMirror<T, D, Device::CPU> mat_out(mat_out_h);

permutations::permute<DefaultBackend_v<D>, D, T, C>(grid, i_begin, i_end, perms.get(), mat_in.get(),
permutations::permute<DefaultBackend_v<D>, D, T, C>(grid, i_begin, i_last, perms.get(), mat_in.get(),
mat_out.get());
}

auto expected_out = [i_begin, i_end, index_start, index_finish, &dist](const GlobalElementIndex i) {
GlobalTileIndex i_tile = dist.globalTileIndex(i);
if (i_begin <= i_tile.row() && i_tile.row() <= i_end && i_begin <= i_tile.col() &&
i_tile.col() <= i_end) {
GlobalElementIndex i_in(i.get<orthogonal(C)>(), index_finish + index_start - 1 - i.get<C>());
auto expected_out = [=, index_start = index_start](const GlobalElementIndex& i) {
const GlobalTileIndex i_tile = dist.globalTileIndex(i);
if (i_begin <= i_tile.row() && i_tile.row() <= i_last && i_begin <= i_tile.col() &&
i_tile.col() <= i_last) {
const std::size_t i_window = to_sizet(i.get<C>() - index_start);
GlobalElementIndex i_in(i.get<orthogonal(C)>(), index_start + perms[i_window]);
if constexpr (C == Coord::Row)
i_in.transpose();
return T(i_in.get<C>()) - T(i_in.get<orthogonal(C)>()) / T(8);

return value_in(i_in);
}
return T(0);
return value_out(i);
};

CHECK_MATRIX_EQ(expected_out, mat_out_h);
}

TYPED_TEST(PermutationsDistTestMC, Columns) {
for (const auto& comm_grid : this->commGrids()) {
for (const auto& [n, nb, i_begin, i_end] : params) {
testDistPermutations<TypeParam, Device::CPU, Coord::Col>(comm_grid, n, nb, i_begin, i_end);
for (const auto& [n, nb, i_begin, i_end, perms] : params) {
testDistPermutations<TypeParam, Device::CPU, Coord::Col>(comm_grid, n, nb, i_begin, i_end, perms);
pika::threads::get_thread_manager().wait();
}
}
}

TYPED_TEST(PermutationsDistTestMC, Rows) {
for (const auto& comm_grid : this->commGrids()) {
for (const auto& [n, nb, i_begin, i_end] : params) {
testDistPermutations<TypeParam, Device::CPU, Coord::Row>(comm_grid, n, nb, i_begin, i_end);
for (const auto& [n, nb, i_begin, i_end, perms] : params) {
testDistPermutations<TypeParam, Device::CPU, Coord::Row>(comm_grid, n, nb, i_begin, i_end, perms);
pika::threads::get_thread_manager().wait();
}
}
38 changes: 23 additions & 15 deletions test/unit/permutations/test_permutations_local.cpp
@@ -36,29 +36,37 @@ TYPED_TEST_SUITE(PermutationsTestGPU, RealMatrixElementTypes);
// reverse order into the output matrix.
template <Backend B, Device D, class T, Coord C>
void testPermutations(SizeType n, SizeType nb, SizeType i_begin, SizeType i_end) {
Matrix<SizeType, Device::CPU> perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1));
Matrix<T, Device::CPU> mat_in_h(LocalElementSize(n, n), TileElementSize(nb, nb));
Matrix<T, Device::CPU> mat_out_h(LocalElementSize(n, n), TileElementSize(nb, nb));

const matrix::Distribution& distr = mat_out_h.distribution();
const matrix::Distribution distr({n, n}, {nb, nb});

SizeType index_start = distr.globalElementFromGlobalTileAndTileElement<C>(i_begin, 0);
SizeType index_finish = distr.globalElementFromGlobalTileAndTileElement<C>(i_end, 0) +
distr.tileSize(GlobalTileIndex(i_end, i_end)).get<C>();
dlaf::matrix::util::set(perms_h, [index_start, index_finish](GlobalElementIndex i) {
if (index_start > i.row() || i.row() >= index_finish)
return SizeType(0);

return index_finish - 1 - i.row();
});
dlaf::matrix::util::set(mat_in_h, [](GlobalElementIndex i) {
return T(i.get<C>()) - T(i.get<orthogonal(C)>()) / T(8);
});

Matrix<const SizeType, Device::CPU> perms_h = [n, nb, index_start, index_finish]() {
Matrix<SizeType, Device::CPU> perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1));
dlaf::matrix::util::set(perms_h, [index_start, index_finish](GlobalElementIndex i) {
if (index_start > i.row() || i.row() >= index_finish)
return SizeType(0);

return index_finish - 1 - i.row();
});
return perms_h;
}();

Matrix<T, Device::CPU> mat_in_h = [distr]() {
Matrix<T, Device::CPU> mat_in_h(distr);
dlaf::matrix::util::set(mat_in_h, [](GlobalElementIndex i) {
return T(i.get<C>()) - T(i.get<orthogonal(C)>()) / T(8);
});
return mat_in_h;
}();

Matrix<T, Device::CPU> mat_out_h(distr);
dlaf::matrix::util::set0<Backend::MC>(pika::execution::thread_priority::normal, mat_out_h);

{
matrix::MatrixMirror<const SizeType, D, Device::CPU> perms(perms_h);
matrix::MatrixMirror<T, D, Device::CPU> mat_in(mat_in_h);
matrix::MatrixMirror<const T, D, Device::CPU> mat_in(mat_in_h);
matrix::MatrixMirror<T, D, Device::CPU> mat_out(mat_out_h);

permutations::permute<B, D, T, C>(i_begin, i_end, perms.get(), mat_in.get(), mat_out.get());
