From ed4193eeeaceb0a53ed98b25042f16dac09fca6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Raffaele=20Solc=C3=A0?= Date: Wed, 19 Apr 2023 13:17:16 +0200 Subject: [PATCH] Permute Distributed: migrate from AllToAll to Send/Recv (#835) --- .../dlaf/eigensolver/tridiag_solver/merge.h | 2 - include/dlaf/permutations/general.h | 10 +- include/dlaf/permutations/general/api.h | 4 +- include/dlaf/permutations/general/impl.h | 404 ++++++++++++------ test/src/gtest_mpi_listener.cpp | 3 +- .../test_permutations_distributed.cpp | 137 ++++-- .../permutations/test_permutations_local.cpp | 38 +- 7 files changed, 392 insertions(+), 206 deletions(-) diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h index ceb6fb52f4..4f7940a051 100644 --- a/include/dlaf/eigensolver/tridiag_solver/merge.h +++ b/include/dlaf/eigensolver/tridiag_solver/merge.h @@ -35,8 +35,6 @@ #include "dlaf/types.h" #include "dlaf/util_matrix.h" -#include "dlaf/matrix/print_csv.h" - namespace dlaf::eigensolver::internal { // Auxiliary matrix and vectors used for the D&C algorithm diff --git a/include/dlaf/permutations/general.h b/include/dlaf/permutations/general.h index 968d04f228..3690011a5e 100644 --- a/include/dlaf/permutations/general.h +++ b/include/dlaf/permutations/general.h @@ -34,8 +34,8 @@ namespace dlaf::permutations { /// the closed range [i_begin,i_end] are accessed in write-only mode. /// template -void permute(SizeType i_begin, SizeType i_end, Matrix& perms, Matrix& mat_in, - Matrix& mat_out) { +void permute(SizeType i_begin, SizeType i_end, Matrix& perms, + Matrix& mat_in, Matrix& mat_out) { const matrix::Distribution& distr_perms = perms.distribution(); const matrix::Distribution& distr_in = mat_in.distribution(); const matrix::Distribution& distr_out = mat_out.distribution(); @@ -82,8 +82,8 @@ void permute(SizeType i_begin, SizeType i_end, Matrix& perms, /// template void permute(comm::CommunicatorGrid grid, common::Pipeline& sub_task_chain, - SizeType i_begin, SizeType i_end, Matrix& perms, Matrix& mat_in, - Matrix& mat_out) { + SizeType i_begin, SizeType i_end, Matrix& perms, + Matrix& mat_in, Matrix& mat_out) { const matrix::Distribution& distr_perms = perms.distribution(); const matrix::Distribution& distr_in = mat_in.distribution(); @@ -110,7 +110,7 @@ void permute(comm::CommunicatorGrid grid, common::Pipeline& /// template void permute(comm::CommunicatorGrid grid, SizeType i_begin, SizeType i_end, - Matrix& perms, Matrix& mat_in, Matrix& mat_out) { + Matrix& perms, Matrix& mat_in, Matrix& mat_out) { common::Pipeline sub_task_chain(grid.subCommunicator(orthogonal(coord)).clone()); permute(grid, sub_task_chain, i_begin, i_end, perms, mat_in, mat_out); } diff --git a/include/dlaf/permutations/general/api.h b/include/dlaf/permutations/general/api.h index 3b08233d86..3a75a29e89 100644 --- a/include/dlaf/permutations/general/api.h +++ b/include/dlaf/permutations/general/api.h @@ -21,9 +21,9 @@ namespace dlaf::permutations::internal { template struct Permutations { static void call(SizeType i_begin, SizeType i_end, Matrix& perms, - Matrix& mat_in, Matrix& mat_out); + Matrix& mat_in, Matrix& mat_out); static void call(common::Pipeline& sub_task_chain, SizeType i_begin, - SizeType i_end, Matrix& perms, Matrix& mat_in, + SizeType i_end, Matrix& perms, Matrix& mat_in, Matrix& mat_out); }; diff --git a/include/dlaf/permutations/general/impl.h b/include/dlaf/permutations/general/impl.h index 54d7f77bca..6c6aa309c8 100644 --- a/include/dlaf/permutations/general/impl.h +++ 
b/include/dlaf/permutations/general/impl.h @@ -10,6 +10,7 @@ #pragma once +#include "dlaf/matrix/index.h" #include "dlaf/permutations/general/api.h" #include "dlaf/permutations/general/perms.h" @@ -24,6 +25,7 @@ #include "dlaf/matrix/copy_tile.h" #include "dlaf/matrix/matrix.h" #include "dlaf/schedulers.h" +#include "dlaf/sender/policy.h" #include "dlaf/sender/transform.h" #include "dlaf/sender/transform_mpi.h" #include "dlaf/sender/when_all_lift.h" @@ -31,6 +33,7 @@ #include "dlaf/util_matrix.h" #include +#include #include #include "pika/algorithm.hpp" @@ -126,9 +129,52 @@ void applyPermutations(GlobalElementIndex out_begin, GlobalElementSize sz, SizeT } } +// FilterFunc is a function with signature bool(*)(SizeType) +template +void applyPermutationsFiltered( + GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, + const matrix::Distribution& subm_dist, const SizeType* perm_arr, + const std::vector>>& in_tiles_fut, + const std::vector>& out_tiles, FilterFunc&& filter) { + constexpr auto OC = orthogonal(C); + std::vector splits = + dlaf::util::interleaveSplits(sz.get(), subm_dist.blockSize().get(), + subm_dist.distanceToAdjacentTile(in_offset), + subm_dist.distanceToAdjacentTile(out_begin.get())); + + const SizeType nperms = subm_dist.size().get(); + + // Parallelized over the number of permutations + pika::for_loop(pika::execution::par, to_sizet(0), to_sizet(nperms), [&](SizeType i_perm) { + if (!filter(perm_arr[i_perm])) + return; + + for (std::size_t i_split = 0; i_split < splits.size() - 1; ++i_split) { + const SizeType split = splits[i_split]; + + GlobalElementIndex i_split_gl_in(split + in_offset, perm_arr[i_perm]); + GlobalElementIndex i_split_gl_out(split + out_begin.get(), out_begin.get() + i_perm); + TileElementSize region(splits[i_split + 1] - split, 1); + + if constexpr (C == Coord::Row) { + region.transpose(); + i_split_gl_in.transpose(); + i_split_gl_out.transpose(); + } + + const TileElementIndex i_subtile_in = subm_dist.tileElementIndex(i_split_gl_in); + const auto& tile_in = in_tiles_fut[to_sizet(subm_dist.globalTileLinearIndex(i_split_gl_in))].get(); + const TileElementIndex i_subtile_out = subm_dist.tileElementIndex(i_split_gl_out); + auto& tile_out = out_tiles[to_sizet(subm_dist.globalTileLinearIndex(i_split_gl_out))]; + + dlaf::tile::lacpy(region, i_subtile_in, tile_in, i_subtile_out, tile_out); + } + }); +} + template void Permutations::call(SizeType i_begin, SizeType i_end, Matrix& perms, - Matrix& mat_in, Matrix& mat_out) { + Matrix& mat_in, Matrix& mat_out) { namespace ut = matrix::util; namespace ex = pika::execution::experimental; @@ -191,119 +237,135 @@ auto whenAllReadOnlyTilesArray(Matrix& matrix) { ut::collectReadTiles(LocalTileIndex(0, 0), matrix.distribution().localNrTiles(), matrix)); } -// No data is sent or received but the processes participates in the collective call -template -void all2allEmptyData(common::Pipeline& sub_task_chain, int nranks) { - namespace ex = pika::execution::experimental; - - auto all2all_f = [](const comm::Communicator& comm, const std::vector& arr, MPI_Request* req) { - MPI_Datatype dtype = dlaf::comm::mpi_datatype>::type; - // Avoid buffer aliasing error reported when send_ptr and recv_ptr are the same. 
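The hunks around this point drop the MPI_Ialltoallv-based path entirely and exchange the packed data with one Isend/Irecv pair per partner rank, skipping the local rank and partners with zero counts. A minimal standalone sketch of that pattern, assuming plain contiguous double buffers already packed by rank (the actual code counts whole rows/columns, builds DataDescriptor messages for strided blocks and schedules them through pika senders):

#include <mpi.h>

#include <numeric>
#include <vector>

// Sketch only: exchange per-rank packed data with point-to-point messages.
// Counts and displacements are in elements here, unlike the real code.
void exchange(MPI_Comm comm, const std::vector<int>& send_counts, const double* send_buf,
              const std::vector<int>& recv_counts, double* recv_buf) {
  int rank, nranks;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &nranks);

  std::vector<int> send_displs(send_counts.size()), recv_displs(recv_counts.size());
  std::exclusive_scan(send_counts.begin(), send_counts.end(), send_displs.begin(), 0);
  std::exclusive_scan(recv_counts.begin(), recv_counts.end(), recv_displs.begin(), 0);

  std::vector<MPI_Request> reqs;
  for (int partner = 0; partner < nranks; ++partner) {
    if (partner == rank)
      continue;  // the local part never goes through MPI
    if (send_counts[partner]) {
      reqs.emplace_back();
      MPI_Isend(send_buf + send_displs[partner], send_counts[partner], MPI_DOUBLE, partner, 0,
                comm, &reqs.back());
    }
    if (recv_counts[partner]) {
      reqs.emplace_back();
      MPI_Irecv(recv_buf + recv_displs[partner], recv_counts[partner], MPI_DOUBLE, partner, 0,
                comm, &reqs.back());
    }
  }
  MPI_Waitall(static_cast<int>(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE);
}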
- int irrelevant = 42; - DLAF_MPI_CHECK_ERROR(MPI_Ialltoallv(nullptr, arr.data(), arr.data(), dtype, &irrelevant, arr.data(), - arr.data(), dtype, comm, req)); - }; - dlaf::comm::internal::transformMPIDetach(std::move(all2all_f), - ex::when_all(sub_task_chain(), - ex::just(std::vector(to_sizet(nranks), 0)))); -} - -// Note: Matrices used for communication @p send_mat and @p recv_mat are assumed to be in column-major layout! -// template void all2allData(common::Pipeline& sub_task_chain, int nranks, - LocalElementSize sz_loc, SendCountsSender&& send_counts_sender, Matrix& send_mat, - RecvCountsSender&& recv_counts_sender, Matrix& recv_mat) { + LocalElementSize sz_loc, SendCountsSender&& send_counts_sender, + Matrix& send_mat, RecvCountsSender&& recv_counts_sender, + Matrix& recv_mat) { namespace ex = pika::execution::experimental; - auto all2all_f = - [len = sz_loc.get()](const comm::Communicator& comm, std::vector& send_counts, - std::vector& send_displs, - const std::vector>>& send_tiles_fut, - std::vector& recv_counts, std::vector& recv_displs, - const std::vector>& recv_tiles, MPI_Request* req) { - // datatype to be sent to each rank - MPI_Datatype dtype = dlaf::comm::mpi_datatype>::type; - - // scale by the length of the row or column vectors to get the number of elements sent to each process - auto mul_const = [len](int num) { return to_int(len) * num; }; - std::transform(send_counts.cbegin(), send_counts.cend(), send_counts.begin(), mul_const); - std::transform(recv_counts.cbegin(), recv_counts.cend(), recv_counts.begin(), mul_const); - - // Note: that was guaranteed to be contiguous on allocation + using dlaf::common::DataDescriptor; + + const SizeType vec_size = sz_loc.get(); + auto sendrecv_f = + [vec_size](comm::Communicator& comm, std::vector send_counts, std::vector send_displs, + const std::vector>>& send_tiles_fut, + std::vector recv_counts, std::vector recv_displs, + const std::vector>& recv_tiles) { + // Note: both guaranteed to be column-major on allocation const T* send_ptr = send_tiles_fut[0].get().ptr(); T* recv_ptr = recv_tiles[0].ptr(); - // send displacements - std::exclusive_scan(send_counts.begin(), send_counts.end(), send_displs.begin(), 0); + const SizeType send_ld = send_tiles_fut[0].get().ld(); + const SizeType recv_ld = recv_tiles[0].ld(); + + const SizeType send_perm_stride = (C == Coord::Col) ? send_ld : 1; + const SizeType recv_perm_stride = (C == Coord::Col) ? 
recv_ld : 1; - // recv displacements + // cumulative sum for computing rank data displacements in packed vectors + std::exclusive_scan(send_counts.begin(), send_counts.end(), send_displs.begin(), 0); std::exclusive_scan(recv_counts.begin(), recv_counts.end(), recv_displs.begin(), 0); - // All-to-All communication - DLAF_MPI_CHECK_ERROR(MPI_Ialltoallv(send_ptr, send_counts.data(), send_displs.data(), dtype, - recv_ptr, recv_counts.data(), recv_displs.data(), dtype, - comm, req)); + std::vector> all_comms; + all_comms.reserve(to_sizet(comm.size() - 1) * 2); + const comm::IndexT_MPI rank = comm.rank(); + for (comm::IndexT_MPI rank_partner = 0; rank_partner < comm.size(); ++rank_partner) { + if (rank == rank_partner) + continue; + + const auto rank_partner_index = to_sizet(rank_partner); + + if (send_counts[rank_partner_index]) + all_comms.push_back( + ex::just() | dlaf::comm::internal::transformMPI([=](MPI_Request* req) { + const SizeType nperms = send_counts[rank_partner_index]; + auto message = dlaf::comm::make_message( + DataDescriptor(send_ptr + + send_displs[rank_partner_index] * send_perm_stride, + C == Coord::Col ? nperms : vec_size, + C == Coord::Col ? vec_size : nperms, send_ld)); + + DLAF_MPI_CHECK_ERROR(MPI_Isend(message.data(), message.count(), message.mpi_type(), + rank_partner, 0, comm, req)); + })); + if (recv_counts[rank_partner_index]) + all_comms.push_back( + ex::just() | dlaf::comm::internal::transformMPI([=](MPI_Request* req) { + const SizeType nperms = recv_counts[rank_partner_index]; + auto message = dlaf::comm::make_message( + DataDescriptor(recv_ptr + recv_displs[rank_partner_index] * recv_perm_stride, + C == Coord::Col ? nperms : vec_size, + C == Coord::Col ? vec_size : nperms, recv_ld)); + + DLAF_MPI_CHECK_ERROR(MPI_Irecv(message.data(), message.count(), message.mpi_type(), + rank_partner, 0, comm, req)); + })); + } + + pika::this_thread::experimental::sync_wait(ex::when_all_vector(std::move(all_comms))); }; - auto sender = - ex::when_all(sub_task_chain(), std::forward(send_counts_sender), - ex::just(std::vector(to_sizet(nranks))), whenAllReadOnlyTilesArray(send_mat), - std::forward(recv_counts_sender), - ex::just(std::vector(to_sizet(nranks))), whenAllReadWriteTilesArray(recv_mat)); - dlaf::comm::internal::transformMPIDetach(std::move(all2all_f), std::move(sender)); + ex::when_all(sub_task_chain(), std::forward(send_counts_sender), + ex::just(std::vector(to_sizet(nranks))), whenAllReadOnlyTilesArray(send_mat), + std::forward(recv_counts_sender), + ex::just(std::vector(to_sizet(nranks))), whenAllReadWriteTilesArray(recv_mat)) | + dlaf::internal::transformDetach(dlaf::internal::Policy(), sendrecv_f); } // @param nranks number of ranks -// @param loc2gl_index a column matrix that represents a map from local to global indices +// @param offset_sub where the sub-matrix to permute starts in the global matrix +// @param loc2sub_index a column matrix that represents a map from local to sub indices // @param packing_index a column matrix that represents a map from packed indices to local indices -// that is used for packing columns or rows on each rank // // Note: the order of the packed rows or columns on the send side must match the expected order at // unpacking on the receive side -template -auto initPackingIndex(int nranks, SizeType i_el_begin, const matrix::Distribution& dist, - Matrix& loc2gl_index, +template +auto initPackingIndex(comm::IndexT_MPI nranks, SizeType offset_sub, const matrix::Distribution& dist, + Matrix& loc2sub_index, Matrix& packing_index) { 
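initPackingIndex, whose body follows, walks the local-to-sub index map once per rank and produces (a) the number of rows/columns destined for each rank and (b) a packing index that places entries going to the same rank contiguously. A self-contained sketch of the forward-mapping case (BackwardMapping = false), assuming a hypothetical 1-D block-cyclic ownership rule instead of the DLAF Distribution object:

#include <cstdint>
#include <vector>

using SizeType = std::int64_t;

struct PackingPlan {
  std::vector<int> counts;       // number of rows/columns destined for each rank
  std::vector<SizeType> packed;  // packed[i]: packed position of local entry i
};

// Hypothetical helper, not the DLAF implementation: ownership follows a 1-D
// block-cyclic rule with block size nb instead of querying a Distribution.
PackingPlan makePackingPlan(const std::vector<SizeType>& loc2sub, SizeType offset_sub, SizeType nb,
                            int nranks) {
  PackingPlan plan{std::vector<int>(static_cast<std::size_t>(nranks), 0),
                   std::vector<SizeType>(loc2sub.size())};
  SizeType rank_displacement = 0;
  for (int rank = 0; rank < nranks; ++rank) {
    int& count = plan.counts[static_cast<std::size_t>(rank)];
    for (std::size_t i = 0; i < loc2sub.size(); ++i) {
      const SizeType global = offset_sub + loc2sub[i];
      const int owner = static_cast<int>((global / nb) % nranks);  // assumed ownership rule
      if (owner == rank)
        plan.packed[i] = rank_displacement + count++;  // forward mapping: local -> packed
    }
    rank_displacement += count;
  }
  return plan;
}

With BackwardMapping = true the roles of the two indices are swapped and each rank's block is additionally sorted by sub index, which is what makes the send-side packing order match the unpacking order expected on the receive side.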
namespace ex = pika::execution::experimental; namespace di = dlaf::internal; - auto counts_fn = [nranks, i_el_begin, dist, - len = packing_index.size().rows()](const auto& loc2gl_index_tiles, - const auto& packing_index_tiles) { - const SizeType* in = loc2gl_index_tiles[0].get().ptr(); + auto counts_fn = [nranks, offset_sub, dist, + nperms = packing_index.size().rows()](const auto& loc2gl_index_tiles, + const auto& packing_index_tiles) { + const SizeType* loc2sub = loc2gl_index_tiles[0].get().ptr(); SizeType* out = packing_index_tiles[0].ptr(); std::vector counts(to_sizet(nranks)); - int offset = 0; - for (int rank = 0; rank < nranks; ++rank) { - int& count = counts[to_sizet(rank)]; - count = 0; - for (SizeType i = 0; i < len; ++i) { - if (dist.rankGlobalElement(i_el_begin + in[i]) == rank) { - if constexpr (PackBasedOnGlobalIndex) { - out[offset + count] = i; - } - else { - out[i] = offset + count; - } - ++count; + + for (int rank = 0, rank_displacement = 0; rank < nranks; ++rank) { + int& nperms_local = counts[to_sizet(rank)] = 0; + + for (SizeType perm_index_local = 0; perm_index_local < nperms; ++perm_index_local) { + const SizeType perm_index_global = offset_sub + loc2sub[perm_index_local]; + if (dist.rankGlobalElement(perm_index_global) == rank) { + const SizeType perm_index_packed = rank_displacement + nperms_local; + + if constexpr (BackwardMapping) + out[perm_index_packed] = perm_index_local; + else + out[perm_index_local] = perm_index_packed; + + ++nperms_local; } } - if constexpr (PackBasedOnGlobalIndex) { - std::sort(out + offset, out + offset + count, - [in](SizeType i1, SizeType i2) { return in[i1] < in[i2]; }); - } - offset += count; + + // Note: + // This sorting actually "inverts" the mapping. + if constexpr (BackwardMapping) + std::sort(out + rank_displacement, out + rank_displacement + nperms_local, + [loc2sub](SizeType i1, SizeType i2) { return loc2sub[i1] < loc2sub[i2]; }); + + rank_displacement += nperms_local; } + return counts; }; - auto sender = - ex::when_all(whenAllReadOnlyTilesArray(loc2gl_index), whenAllReadWriteTilesArray(packing_index)); return ex::ensure_started( - di::transform(di::Policy{}, std::move(counts_fn), std::move(sender))); + ex::when_all(whenAllReadOnlyTilesArray(loc2sub_index), whenAllReadWriteTilesArray(packing_index)) | + di::transform(di::Policy{}, std::move(counts_fn))); } // Copies index tiles belonging to the current process from the complete index @p global_index into the @@ -314,11 +376,11 @@ void copyLocalPartsFromGlobalIndex(SizeType i_loc_begin, const matrix::Distribut Matrix& local_index) { namespace ex = pika::execution::experimental; - for (auto tile_wrt_local : common::iterate_range2d(local_index.distribution().localNrTiles())) { - SizeType i_gl_tile = dist.globalTileFromLocalTile(i_loc_begin + tile_wrt_local.row()); - ex::start_detached(ex::when_all(global_index.read_sender(GlobalTileIndex(i_gl_tile, 0)), - local_index.readwrite_sender(tile_wrt_local)) | - dlaf::matrix::copy(dlaf::internal::Policy>{})); + for (const LocalTileIndex i : common::iterate_range2d(local_index.distribution().localNrTiles())) { + const GlobalTileIndex i_global(dist.globalTileFromLocalTile(i_loc_begin + i.row()), 0); + ex::start_detached( + ex::when_all(global_index.read_sender(i_global), local_index.readwrite_sender(i)) | + dlaf::matrix::copy(dlaf::internal::Policy>{})); } } @@ -411,97 +473,160 @@ inline void invertIndex(SizeType i_begin, SizeType i_end, Matrix -void permuteOnCPU(common::Pipeline& sub_task_chain, SizeType i_begin, SizeType i_end, - 
Matrix& perms, Matrix& mat_in, - Matrix& mat_out) { +void permuteOnCPU(common::Pipeline& sub_task_chain, SizeType i_begin, + SizeType i_last, Matrix& perms, + Matrix& mat_in, Matrix& mat_out) { constexpr Device D = Device::CPU; - const matrix::Distribution& dist = mat_in.distribution(); - int nranks = to_int(dist.commGridSize().get()); + using namespace dlaf::matrix; + + namespace ex = pika::execution::experimental; + namespace di = dlaf::internal; + + const Distribution& dist = mat_in.distribution(); + const comm::IndexT_MPI nranks = to_int(dist.commGridSize().get()); - // Local size and index of subproblem [i_begin, i_end] - SizeType i_el_begin = dist.globalElementFromGlobalTileAndTileElement(i_begin, 0); - TileElementSize blk = dist.blockSize(); - LocalTileIndex i_loc_begin{dist.nextLocalTileFromGlobalTile(i_begin), - dist.nextLocalTileFromGlobalTile(i_begin)}; - LocalTileIndex i_loc_end{dist.nextLocalTileFromGlobalTile(i_end + 1) - 1, - dist.nextLocalTileFromGlobalTile(i_end + 1) - 1}; + const SizeType i_end = i_last + 1; + + // Local size and index of subproblem [i_begin, i_last] + const SizeType offset_sub = dist.globalElementFromGlobalTileAndTileElement(i_begin, 0); + const TileElementSize blk = dist.blockSize(); + + const LocalTileIndex i_loc_begin{dist.nextLocalTileFromGlobalTile(i_begin), + dist.nextLocalTileFromGlobalTile(i_begin)}; + const LocalTileIndex i_loc_end{dist.nextLocalTileFromGlobalTile(i_end) - 1, + dist.nextLocalTileFromGlobalTile(i_end) - 1}; // Note: the local shape of the permutation region may not be square if the process grid is not square - LocalElementSize sz_loc{dist.localElementDistanceFromGlobalTile(i_begin, i_end + 1), - dist.localElementDistanceFromGlobalTile(i_begin, i_end + 1)}; + const LocalElementSize sz_loc{dist.localElementDistanceFromGlobalTile(i_begin, i_end), + dist.localElementDistanceFromGlobalTile(i_begin, i_end)}; - // If there are no tiles in this rank, participate in the all2all call and return - if (sz_loc.isEmpty()) { - all2allEmptyData(sub_task_chain, nranks); + // If there are no tiles in this rank, nothing to do here + if (sz_loc.isEmpty()) return; - } // Create a map from send indices to receive indices (inverse of perms) Matrix inverse_perms(perms.distribution()); - invertIndex(i_begin, i_end, perms, inverse_perms); + invertIndex(i_begin, i_last, perms, inverse_perms); // Local distribution used for packing and unpacking - matrix::Distribution subm_dist(sz_loc, blk); + const Distribution subm_dist(sz_loc, blk); // Local single tile column matrices representing index maps used for packing and unpacking of // communication data - matrix::Distribution index_dist(LocalElementSize(sz_loc.get(), 1), TileElementSize(blk.rows(), 1)); - Matrix ws_index(index_dist); + const SizeType nvecs = sz_loc.get(); + const Distribution index_dist(LocalElementSize(nvecs, 1), TileElementSize(blk.rows(), 1)); + Matrix local2global_index(index_dist); Matrix packing_index(index_dist); Matrix unpacking_index(index_dist); // Local matrices used for packing data for communication. Both matrices are in column-major order. // The particular constructor is used on purpose to guarantee that columns are stored contiguosly, // such that there is no padding and gaps between them. - LocalElementSize comm_sz = (C == Coord::Col) ? 
sz_loc : transposed(sz_loc); - matrix::Distribution comm_dist(comm_sz, blk); - matrix::LayoutInfo comm_layout = matrix::colMajorLayout(comm_sz, blk, comm_sz.rows()); + const LocalElementSize comm_sz = sz_loc; + const Distribution comm_dist(comm_sz, blk); + const LayoutInfo comm_layout = matrix::colMajorLayout(comm_sz, blk, comm_sz.rows()); + Matrix mat_send(comm_dist, comm_layout); Matrix mat_recv(comm_dist, comm_layout); // Initialize the unpacking index - copyLocalPartsFromGlobalIndex(i_loc_begin.get(), dist, perms, ws_index); + copyLocalPartsFromGlobalIndex(i_loc_begin.get(), dist, perms, local2global_index); auto recv_counts_sender = - initPackingIndex(nranks, i_el_begin, dist, ws_index, unpacking_index); + initPackingIndex(nranks, offset_sub, dist, local2global_index, unpacking_index) | ex::split(); // Initialize the packing index - // Here `true` is specified so that the send side matches the order of columns/rows on the receive side - copyLocalPartsFromGlobalIndex(i_loc_begin.get(), dist, inverse_perms, ws_index); - auto send_counts_sender = initPackingIndex(nranks, i_el_begin, dist, ws_index, packing_index); + // Here `true` is specified so that the send side matches the order of columns on the receive side + copyLocalPartsFromGlobalIndex(i_loc_begin.get(), dist, inverse_perms, local2global_index); + auto send_counts_sender = + initPackingIndex(nranks, offset_sub, dist, local2global_index, packing_index) | + ex::split(); // Pack local rows or columns to be sent from this rank applyPackingIndex(subm_dist, whenAllReadOnlyTilesArray(packing_index), whenAllReadOnlyTilesArray(i_loc_begin, i_loc_end, mat_in), - (C == Coord::Col) - ? whenAllReadWriteTilesArray(mat_send) - : whenAllReadWriteTilesArray(i_loc_begin, i_loc_end, mat_out)); - - // Pack data into the contiguous column-major send buffer by transposing - if constexpr (C == Coord::Row) { - transposeFromDistributedToLocalMatrix(i_loc_begin, mat_out, mat_send); - } - - // Communicate data - all2allData(sub_task_chain, nranks, sz_loc, std::move(send_counts_sender), mat_send, - std::move(recv_counts_sender), mat_recv); + whenAllReadWriteTilesArray(mat_send)); + + // Unpacking + // separate unpacking: + // - locals + // - communicated + // and then start two different tasks: + // - the first depends on mat_send instead of mat_recv (no dependency on comm) + // - the last is the same, but it has to skip the part already done for local + + // LOCAL + auto unpack_local_f = [subm_dist, rank = dist.rankIndex().get()](const auto& send_counts, + const auto& recv_counts, + const auto& index_tile_futs, + const auto& mat_in_tiles, + const auto& mat_out_tiles) { + const size_t rank_index = to_sizet(rank); + + const SizeType* perm_arr = index_tile_futs[0].get().ptr(); + const GlobalElementSize sz = subm_dist.size(); + + const int a = std::accumulate(send_counts.cbegin(), send_counts.cbegin() + rank, 0); + const int b = a + send_counts[rank_index]; + + // Note: + // These are copied directly from mat_send, while unpacking permutation applies to indices on + // the receiver side. So, we have to "align" the unpacking permutation, by applying the offset + // existing between the send and recv side. + // This is due to the fact that send and recv buffers might be "unbalanced", e.g. rank1 sends 2 + // and receive 1 with rank0, so resulting in a shift in indices between the two buffer sides, + // following previous example the local part would start at index (0-based) 2 in mat_send and + // at index 1 in mat_recv. 
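Illustrative numbers only (a hypothetical 3-rank column communicator), showing how the a/b bounds and the send/recv offset described in the note above come out:

#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  const int rank = 1;                     // middle rank of a 3-rank column communicator
  std::vector<int> send_counts{2, 1, 2};  // rank 1 keeps 1 locally, sends 2 to rank 0 and 2 to rank 2
  std::vector<int> recv_counts{1, 1, 3};  // and receives 1 from rank 0 and 3 from rank 2

  const int a = std::accumulate(send_counts.begin(), send_counts.begin() + rank, 0);    // 2
  const int b = a + send_counts[rank];                                                  // 3
  const int a_r = std::accumulate(recv_counts.begin(), recv_counts.begin() + rank, 0);  // 1
  const std::int64_t offset = a - a_r;                                                  // 1

  // The local block starts at position 2 in mat_send but at position 1 in mat_recv, so
  // recv-side indices shifted by `offset` that land in [a, b) = [2, 3) are unpacked
  // straight from mat_send; everything else waits for the Isend/Irecv exchange.
  return (a == 2 && b == 3 && a_r == 1 && offset == 1) ? 0 : 1;
}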
+ const int a_r = std::accumulate(recv_counts.cbegin(), recv_counts.cbegin() + rank, 0); + const SizeType offset = to_SizeType(a - a_r); + std::vector perm_offseted(perm_arr, perm_arr + subm_dist.size().get()); + std::transform(perm_offseted.begin(), perm_offseted.end(), perm_offseted.begin(), + [offset](const SizeType perm) { return perm + offset; }); + + // [a, b) + applyPermutationsFiltered({0, 0}, sz, 0, subm_dist, perm_offseted.data(), mat_in_tiles, + mat_out_tiles, + [a, b](SizeType i_perm) { return i_perm >= a && i_perm < b; }); + }; - // Unpack data from the contiguous column-major receive buffer by transposing - if constexpr (C == Coord::Row) { - transposeFromLocalToDistributedMatrix(i_loc_begin, mat_recv, mat_in); - } + ex::when_all(send_counts_sender, recv_counts_sender, whenAllReadOnlyTilesArray(unpacking_index), + whenAllReadOnlyTilesArray(mat_send), + whenAllReadWriteTilesArray(i_loc_begin, i_loc_end, mat_out)) | + di::transformDetach(di::Policy>(), std::move(unpack_local_f)); + + // COMMUNICATION-dependent + all2allData(sub_task_chain, nranks, sz_loc, send_counts_sender, mat_send, recv_counts_sender, + mat_recv); + + auto unpack_others_f = [subm_dist, rank = dist.rankIndex().get()](const auto& recv_counts, + const auto& index_tile_futs, + const auto& mat_in_tiles, + const auto& mat_out_tiles) { + const size_t rank_index = to_sizet(rank); + const int a = std::accumulate(recv_counts.cbegin(), recv_counts.cbegin() + rank, 0); + const int b = a + recv_counts[rank_index]; + + const SizeType* perm_arr = index_tile_futs[0].get().ptr(); + const GlobalElementSize sz = subm_dist.size(); + + // [0, a) + applyPermutationsFiltered({0, 0}, sz, 0, subm_dist, perm_arr, mat_in_tiles, mat_out_tiles, + [a](SizeType i_perm) { return i_perm < a; }); + + // [b, end) + applyPermutationsFiltered({0, 0}, sz, 0, subm_dist, perm_arr, mat_in_tiles, mat_out_tiles, + [b](SizeType i_perm) { return i_perm >= b; }); + }; - // Unpack local rows or columns received on this rank - applyPackingIndex(subm_dist, whenAllReadOnlyTilesArray(unpacking_index), - (C == Coord::Col) - ? 
whenAllReadOnlyTilesArray(mat_recv) - : whenAllReadOnlyTilesArray(i_loc_begin, i_loc_end, mat_in), - whenAllReadWriteTilesArray(i_loc_begin, i_loc_end, mat_out)); + ex::when_all(recv_counts_sender, whenAllReadOnlyTilesArray(unpacking_index), + whenAllReadOnlyTilesArray(mat_recv), + whenAllReadWriteTilesArray(i_loc_begin, i_loc_end, mat_out)) | + di::transformDetach(di::Policy>(), std::move(unpack_others_f)); } template void Permutations::call(common::Pipeline& sub_task_chain, SizeType i_begin, SizeType i_end, Matrix& perms, - Matrix& mat_in, Matrix& mat_out) { + Matrix& mat_in, Matrix& mat_out) { if constexpr (D == Device::GPU) { // This is a temporary placeholder which avoids diverging GPU API: DLAF_UNIMPLEMENTED("GPU implementation not available yet"); @@ -512,5 +637,4 @@ void Permutations::call(common::Pipeline& sub_ta permuteOnCPU(sub_task_chain, i_begin, i_end, perms, mat_in, mat_out); } } - } diff --git a/test/src/gtest_mpi_listener.cpp b/test/src/gtest_mpi_listener.cpp index a342b8582f..24754dce54 100644 --- a/test/src/gtest_mpi_listener.cpp +++ b/test/src/gtest_mpi_listener.cpp @@ -160,7 +160,8 @@ void MPIListener::OnTestEndAllRanks(const ::testing::TestInfo& test_info) const namespace internal { void mpi_send_string(const std::string& message, int to_rank) { - MPI_Send(message.c_str(), static_cast(message.size()) + 1, MPI_CHAR, to_rank, 0, MPI_COMM_WORLD); + MPI_Send(const_cast(message.c_str()), static_cast(message.size()) + 1, MPI_CHAR, to_rank, + 0, MPI_COMM_WORLD); } std::string mpi_receive_string(int from_rank) { diff --git a/test/unit/permutations/test_permutations_distributed.cpp b/test/unit/permutations/test_permutations_distributed.cpp index 9516bc9aa5..a401153927 100644 --- a/test/unit/permutations/test_permutations_distributed.cpp +++ b/test/unit/permutations/test_permutations_distributed.cpp @@ -7,15 +7,23 @@ // Please, refer to the LICENSE file in the root directory. // SPDX-License-Identifier: BSD-3-Clause // + +#include "dlaf/permutations/general.h" + +#include + #include #include +#include "dlaf/common/assert.h" #include "dlaf/matrix/matrix_mirror.h" -#include "dlaf/permutations/general.h" +#include "dlaf/types.h" #include "dlaf_test/comm_grids/grids_6_ranks.h" +#include "dlaf_test/matrix/matrix_local.h" #include "dlaf_test/matrix/util_generic_lapack.h" #include "dlaf_test/matrix/util_matrix.h" +#include "dlaf_test/matrix/util_matrix_local.h" #include "dlaf_test/util_types.h" using namespace dlaf; @@ -33,68 +41,115 @@ struct PermutationsDistTestMC : public TestWithCommGrids {}; TYPED_TEST_SUITE(PermutationsDistTestMC, RealMatrixElementTypes); -const std::vector> params = { - // n, nb, i_begin, i_end +// n, nb, i_begin, i_last, permutation +// permutation has to be defined using element indices wrt to the range described by [i_begin, i_last] +// tiles and not global indices. +// permutation[i]: mat_out[i] = mat_in[perm[i]] +using testcase_t = std::tuple>; + +// Given matrix size and blocksize, this helper converts a range defined with tile indices +// [i_begin_tile, i_end_tile) into a range defined with element indices [i_begin, i_end) +auto tileToElementRange(SizeType m, SizeType mb, SizeType i_begin_tile, SizeType i_end_tile) { + const SizeType i_begin = std::max(0, std::min(m - 1, i_begin_tile * mb)); + const SizeType i_end = std::max(0, std::min(m, i_end_tile * mb)); + return std::make_tuple(i_begin, i_end); +} + +// Helper that given a geometry generates a "mirror" permutation. 
+// A mirror permutation means that first becomes last, second becomes before-last, ... +testcase_t mirrorPermutation(SizeType m, SizeType mb, SizeType i_begin_tile, SizeType i_last_tile) { + const auto [i_begin, i_end] = tileToElementRange(m, mb, i_begin_tile, i_last_tile + 1); + + std::vector perms(to_sizet(i_end - i_begin)); + std::generate(perms.rbegin(), perms.rend(), [n = 0]() mutable { return n++; }); + + return {m, mb, i_begin_tile, i_last_tile, perms}; +} + +// Helper that just checks that given geometry and permutations are compatible. +testcase_t customPermutation(SizeType m, SizeType mb, SizeType i_begin_tile, SizeType i_last_tile, + std::vector perms) { + const auto [i_begin, i_end] = tileToElementRange(m, mb, i_begin_tile, i_last_tile + 1); + + const std::size_t nperms = to_sizet(i_end - i_begin); + DLAF_ASSERT(perms.size() == nperms, perms.size(), nperms); + return {m, mb, i_begin_tile, i_last_tile, std::move(perms)}; +} + +const std::vector params = { // simple setup for a (3, 2) process grid, - {6, 2, 0, 2}, + mirrorPermutation(6, 2, 0, 2), + customPermutation(6, 2, 0, 2, {2, 0, 1, 4, 5, 3}), // entire range of tiles is inculded - {10, 3, 0, 3}, - {17, 5, 0, 3}, + mirrorPermutation(10, 3, 0, 3), + customPermutation(10, 3, 0, 3, {0, 2, 3, 4, 6, 8, 1, 5, 7, 9}), + customPermutation(10, 3, 0, 3, {8, 9, 3, 5, 2, 7, 1, 4, 0, 6}), + mirrorPermutation(17, 5, 0, 3), // only a subset of processes participate - {10, 3, 1, 2}, + mirrorPermutation(10, 3, 1, 2), // a single tile matrix - {10, 10, 0, 0}, + mirrorPermutation(10, 10, 0, 0), // each process has multiple tiles - {31, 6, 1, 3}, - {50, 4, 1, 8}, + mirrorPermutation(31, 6, 1, 3), + mirrorPermutation(50, 4, 1, 8), }; template void testDistPermutations(comm::CommunicatorGrid grid, SizeType n, SizeType nb, SizeType i_begin, - SizeType i_end) { + SizeType i_last, std::vector perms) { const GlobalElementSize size(n, n); const TileElementSize block_size(nb, nb); - Index2D src_rank_index(std::max(0, grid.size().rows() - 1), std::min(1, grid.size().cols() - 1)); + const Index2D src_rank_index(std::max(0, grid.size().rows() - 1), std::min(1, grid.size().cols() - 1)); - Distribution dist(size, block_size, grid.size(), grid.rank(), src_rank_index); - Matrix perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1)); - Matrix mat_in_h(dist); - Matrix mat_out_h(dist); + const Distribution dist(size, block_size, grid.size(), grid.rank(), src_rank_index); - SizeType index_start = dist.globalElementFromGlobalTileAndTileElement(i_begin, 0); - SizeType index_finish = dist.globalElementFromGlobalTileAndTileElement(i_end, 0) + - dist.tileSize(GlobalTileIndex(i_end, i_end)).get(); - dlaf::matrix::util::set(perms_h, [index_start, index_finish](GlobalElementIndex i) { - if (index_start > i.row() || i.row() >= index_finish) - return SizeType(0); + const auto [index_start, index_end] = tileToElementRange(n, nb, i_begin, i_last + 1); - return index_finish - 1 - i.row(); - }); - dlaf::matrix::util::set(mat_in_h, [](GlobalElementIndex i) { - return T(i.get()) - T(i.get()) / T(8); - }); - dlaf::matrix::util::set0(pika::execution::thread_priority::normal, mat_out_h); + Matrix perms_h = [=, index_start = index_start, index_end = index_end] { + Matrix perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1)); + dlaf::matrix::util::set(perms_h, [=](GlobalElementIndex i) { + if (index_start > i.row() || i.row() >= index_end) + return SizeType(0); + + const SizeType i_window = i.row() - index_start; + return perms[to_sizet(i_window)]; + }); + return perms_h; + }(); 
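For reference, a tiny standalone check of the perms vector that mirrorPermutation(6, 2, 0, 2) (the first params entry) feeds into this window; the std::generate call over reverse iterators is the same one used in the helper:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Tiles [0, 2] with nb = 2 cover elements [0, 6), so the window holds 6 entries.
  std::vector<std::int64_t> perms(6);
  std::generate(perms.rbegin(), perms.rend(), [n = 0]() mutable { return n++; });
  for (auto p : perms)
    std::cout << p << ' ';  // prints: 5 4 3 2 1 0  (first column becomes last)
  std::cout << '\n';
}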
+ + auto value_in = [](GlobalElementIndex i) { return T(i.get()) - T(i.get()) / T(8); }; + Matrix mat_in_h = [dist, value_in]() { + Matrix mat_in_h(dist); + dlaf::matrix::util::set(mat_in_h, value_in); + return mat_in_h; + }(); + + auto value_out = [](GlobalElementIndex i) { return T(i.get()) - T(i.get()) / T(8); }; + Matrix mat_out_h(dist); + dlaf::matrix::util::set(mat_out_h, value_out); { matrix::MatrixMirror perms(perms_h); - matrix::MatrixMirror mat_in(mat_in_h); + matrix::MatrixMirror mat_in(mat_in_h); matrix::MatrixMirror mat_out(mat_out_h); - permutations::permute, D, T, C>(grid, i_begin, i_end, perms.get(), mat_in.get(), + permutations::permute, D, T, C>(grid, i_begin, i_last, perms.get(), mat_in.get(), mat_out.get()); } - auto expected_out = [i_begin, i_end, index_start, index_finish, &dist](const GlobalElementIndex i) { - GlobalTileIndex i_tile = dist.globalTileIndex(i); - if (i_begin <= i_tile.row() && i_tile.row() <= i_end && i_begin <= i_tile.col() && - i_tile.col() <= i_end) { - GlobalElementIndex i_in(i.get(), index_finish + index_start - 1 - i.get()); + auto expected_out = [=, index_start = index_start](const GlobalElementIndex& i) { + const GlobalTileIndex i_tile = dist.globalTileIndex(i); + if (i_begin <= i_tile.row() && i_tile.row() <= i_last && i_begin <= i_tile.col() && + i_tile.col() <= i_last) { + const std::size_t i_window = to_sizet(i.get() - index_start); + GlobalElementIndex i_in(i.get(), index_start + perms[i_window]); if constexpr (C == Coord::Row) i_in.transpose(); - return T(i_in.get()) - T(i_in.get()) / T(8); + + return value_in(i_in); } - return T(0); + return value_out(i); }; CHECK_MATRIX_EQ(expected_out, mat_out_h); @@ -102,8 +157,8 @@ void testDistPermutations(comm::CommunicatorGrid grid, SizeType n, SizeType nb, TYPED_TEST(PermutationsDistTestMC, Columns) { for (const auto& comm_grid : this->commGrids()) { - for (const auto& [n, nb, i_begin, i_end] : params) { - testDistPermutations(comm_grid, n, nb, i_begin, i_end); + for (const auto& [n, nb, i_begin, i_end, perms] : params) { + testDistPermutations(comm_grid, n, nb, i_begin, i_end, perms); pika::threads::get_thread_manager().wait(); } } @@ -111,8 +166,8 @@ TYPED_TEST(PermutationsDistTestMC, Columns) { TYPED_TEST(PermutationsDistTestMC, Rows) { for (const auto& comm_grid : this->commGrids()) { - for (const auto& [n, nb, i_begin, i_end] : params) { - testDistPermutations(comm_grid, n, nb, i_begin, i_end); + for (const auto& [n, nb, i_begin, i_end, perms] : params) { + testDistPermutations(comm_grid, n, nb, i_begin, i_end, perms); pika::threads::get_thread_manager().wait(); } } diff --git a/test/unit/permutations/test_permutations_local.cpp b/test/unit/permutations/test_permutations_local.cpp index a6fdc24e5f..3bfca591d5 100644 --- a/test/unit/permutations/test_permutations_local.cpp +++ b/test/unit/permutations/test_permutations_local.cpp @@ -36,29 +36,37 @@ TYPED_TEST_SUITE(PermutationsTestGPU, RealMatrixElementTypes); // reverse order into the output matrix. 
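The expected_out predicate above reduces, for Coord::Col, to: inside the window each output column equals the input column at index_start + perms[j - index_start], and columns outside the window keep their pre-set value. A 1-D toy model of that check, with plain vectors standing in for whole columns:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  using SizeType = std::int64_t;
  const SizeType index_start = 2;
  const std::vector<SizeType> perms{2, 0, 1};         // window permutation
  const std::vector<int> in{10, 11, 12, 13, 14, 15};  // one value per column
  std::vector<int> out{-1, -1, -1, -1, -1, -1};       // pre-set "value_out" pattern

  for (std::size_t j = 0; j < perms.size(); ++j)
    out[static_cast<std::size_t>(index_start) + j] =
        in[static_cast<std::size_t>(index_start + perms[j])];

  assert((out == std::vector<int>{-1, -1, 14, 12, 13, -1}));
  return 0;
}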
template void testPermutations(SizeType n, SizeType nb, SizeType i_begin, SizeType i_end) { - Matrix perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1)); - Matrix mat_in_h(LocalElementSize(n, n), TileElementSize(nb, nb)); - Matrix mat_out_h(LocalElementSize(n, n), TileElementSize(nb, nb)); - - const matrix::Distribution& distr = mat_out_h.distribution(); + const matrix::Distribution distr({n, n}, {nb, nb}); SizeType index_start = distr.globalElementFromGlobalTileAndTileElement(i_begin, 0); SizeType index_finish = distr.globalElementFromGlobalTileAndTileElement(i_end, 0) + distr.tileSize(GlobalTileIndex(i_end, i_end)).get(); - dlaf::matrix::util::set(perms_h, [index_start, index_finish](GlobalElementIndex i) { - if (index_start > i.row() || i.row() >= index_finish) - return SizeType(0); - - return index_finish - 1 - i.row(); - }); - dlaf::matrix::util::set(mat_in_h, [](GlobalElementIndex i) { - return T(i.get()) - T(i.get()) / T(8); - }); + + Matrix perms_h = [n, nb, index_start, index_finish]() { + Matrix perms_h(LocalElementSize(n, 1), TileElementSize(nb, 1)); + dlaf::matrix::util::set(perms_h, [index_start, index_finish](GlobalElementIndex i) { + if (index_start > i.row() || i.row() >= index_finish) + return SizeType(0); + + return index_finish - 1 - i.row(); + }); + return perms_h; + }(); + + Matrix mat_in_h = [distr]() { + Matrix mat_in_h(distr); + dlaf::matrix::util::set(mat_in_h, [](GlobalElementIndex i) { + return T(i.get()) - T(i.get()) / T(8); + }); + return mat_in_h; + }(); + + Matrix mat_out_h(distr); dlaf::matrix::util::set0(pika::execution::thread_priority::normal, mat_out_h); { matrix::MatrixMirror perms(perms_h); - matrix::MatrixMirror mat_in(mat_in_h); + matrix::MatrixMirror mat_in(mat_in_h); matrix::MatrixMirror mat_out(mat_out_h); permutations::permute(i_begin, i_end, perms.get(), mat_in.get(), mat_out.get());
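The local test above converts the closed tile range [i_begin, i_end] into the element range [index_start, index_finish) by adding the size of the last tile, which may be shorter than nb. The same arithmetic as a standalone sketch, assuming a plain blocked layout and the (n = 10, nb = 3, tiles [1, 2]) geometry that also appears in the distributed parameter list:

#include <algorithm>
#include <cstdint>

using SizeType = std::int64_t;

// Size of tile i_tile in a blocked layout where only the last tile may be short.
SizeType tile_size(SizeType n, SizeType nb, SizeType i_tile) {
  return std::min(nb, n - i_tile * nb);
}

int main() {
  const SizeType n = 10, nb = 3, i_begin = 1, i_end = 2;               // closed tile range [1, 2]
  const SizeType index_start = i_begin * nb;                           // 3
  const SizeType index_finish = i_end * nb + tile_size(n, nb, i_end);  // 6 + 3 = 9
  return (index_start == 3 && index_finish == 9) ? 0 : 1;
}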