diff --git a/CMakeLists.txt b/CMakeLists.txt index d22e13b9cc..dc530e726c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,8 +171,7 @@ else() endif() # ----- pika -find_package(pika 0.17.0 REQUIRED) -find_package(pika-algorithms 0.1.1 REQUIRED) +find_package(pika 0.18.0 REQUIRED) # ----- BLASPP/LAPACKPP find_package(blaspp REQUIRED) diff --git a/ci/.gitlab-ci.yml b/ci/.gitlab-ci.yml index 9e9fa05f2b..9a2fa27ef8 100644 --- a/ci/.gitlab-ci.yml +++ b/ci/.gitlab-ci.yml @@ -16,7 +16,7 @@ stages: stage: build timeout: 6 hours variables: - SPACK_SHA: 566754440f9dfed9accd25db7f1a67b0cd074fcd + SPACK_SHA: 4429e17db053fcf7ec591ce3abe8ffc7733c0a25 SPACK_DLAF_REPO: ./spack before_script: - !reference [.fetch-registry-tokens, script] diff --git a/ci/docker/release-cpu-stdexec.yaml b/ci/docker/release-cpu-stdexec.yaml index 11b0c25415..0035f02a39 100644 --- a/ci/docker/release-cpu-stdexec.yaml +++ b/ci/docker/release-cpu-stdexec.yaml @@ -18,7 +18,7 @@ spack: true specs: - - dla-future@master +miniapps +ci-test +ci-check-threads ^intel-mkl threads=openmp ^mpich ^pika+stdexec ^stdexec@git.48c52df0f81c6151eecf4f39fa5eed2dc0216204=main + - dla-future@master +miniapps +ci-test +ci-check-threads ^intel-mkl threads=openmp ^mpich ^pika +stdexec ^stdexec@git.48c52df0f81c6151eecf4f39fa5eed2dc0216204=main packages: all: diff --git a/cmake/template/DLAFConfig.cmake.in b/cmake/template/DLAFConfig.cmake.in index 083ef33f84..5355b57b92 100644 --- a/cmake/template/DLAFConfig.cmake.in +++ b/cmake/template/DLAFConfig.cmake.in @@ -80,7 +80,6 @@ endif() # ----- pika find_dependency(pika PATHS @pika_DIR@) -find_dependency(pika-algorithms PATHS @pika-algorithms_DIR@) # ----- BLASPP/LAPACKPP find_dependency(blaspp PATHS @blaspp_DIR@) diff --git a/include/dlaf/permutations/general/impl.h b/include/dlaf/permutations/general/impl.h index 7b7cad69e5..84483b9b72 100644 --- a/include/dlaf/permutations/general/impl.h +++ b/include/dlaf/permutations/general/impl.h @@ -14,7 +14,7 @@ #include -#include +#include #include #include @@ -39,6 +39,10 @@ #include #include +#ifdef DLAF_WITH_GPU +#include +#endif + namespace dlaf::permutations::internal { // Applies the permutation index `perm_arr` to a portion of the columns/rows(depends on coord) [1] of an @@ -83,103 +87,41 @@ namespace dlaf::permutations::internal { // [2]: The input submatrix is defined by `begin_tiles`, `ld_tiles`, `distr` and `in_tiles` // [3]: The subregion is defined by `begin` and `sz` // [4]: The output submatrix is defined by `begin_tiles`, `ld_tiles`, `distr` and `out_tiles` -template -void applyPermutations( - const GlobalElementIndex out_begin, const GlobalElementSize sz, const SizeType in_offset, - const matrix::Distribution& distr, const SizeType* perm_arr, - const std::vector>& in_tiles, - const std::vector>& out_tiles, [[maybe_unused]] Args&&... args) { - if constexpr (D == Device::CPU) { - constexpr Coord orth_coord = orthogonal(coord); - std::vector splits = - util::interleaveSplits(sz.get(), distr.blockSize().get(), - distr.distanceToAdjacentTile(in_offset), - distr.distanceToAdjacentTile(out_begin.get())); - - // Parallelized over the number of permuted columns or rows - const SizeType nperms = sz.get(); - pika::for_loop(pika::execution::par, to_sizet(0), to_sizet(nperms), [&](SizeType i_perm) { - DLAF_ASSERT_HEAVY(i_perm >= 0 && i_perm < nperms, i_perm, nperms); - DLAF_ASSERT_HEAVY(perm_arr[i_perm] >= 0 && perm_arr[i_perm] < nperms, i_perm, nperms); - - for (std::size_t i_split = 0; i_split < splits.size() - 1; ++i_split) { - const SizeType split = splits[i_split]; - - GlobalElementIndex i_split_gl_in(split + in_offset, perm_arr[i_perm]); - GlobalElementIndex i_split_gl_out(split + out_begin.get(), - out_begin.get() + i_perm); - TileElementSize region(splits[i_split + 1] - split, 1); - if constexpr (coord == Coord::Row) { - region.transpose(); - i_split_gl_in.transpose(); - i_split_gl_out.transpose(); - } - - const TileElementIndex i_subtile_in = distr.tileElementIndex(i_split_gl_in); - const auto& tile_in = in_tiles[to_sizet(distr.globalTileLinearIndex(i_split_gl_in))].get(); - const TileElementIndex i_subtile_out = distr.tileElementIndex(i_split_gl_out); - auto& tile_out = out_tiles[to_sizet(distr.globalTileLinearIndex(i_split_gl_out))]; - - dlaf::tile::lacpy(region, i_subtile_in, tile_in, i_subtile_out, tile_out); - } - }); - } - else if constexpr (D == Device::GPU) { -#if defined(DLAF_WITH_GPU) - applyPermutationsOnDevice(out_begin, sz, in_offset, distr, perm_arr, in_tiles, out_tiles, - args...); -#endif - } -} -// FilterFunc is a function with signature bool(*)(SizeType) -template -void applyPermutationsFiltered( - const GlobalElementIndex out_begin, const GlobalElementSize sz, const SizeType in_offset, - const matrix::Distribution& subm_dist, const SizeType* perm_arr, - const std::vector>& in_tiles_fut, - const std::vector>& out_tiles, FilterFunc&& filter) { +template +void applyPermutationOnCPU( + const SizeType i_perm, const std::vector& splits, const GlobalElementIndex out_begin, + const SizeType in_offset, const matrix::Distribution& subm_dist, const SizeType* perm_arr, + const std::vector>& in_tiles_fut, + const std::vector>& out_tiles) { constexpr auto OC = orthogonal(C); - std::vector splits = - dlaf::util::interleaveSplits(sz.get(), subm_dist.blockSize().get(), - subm_dist.distanceToAdjacentTile(in_offset), - subm_dist.distanceToAdjacentTile(out_begin.get())); - - const SizeType nperms = subm_dist.size().get(); - - // Parallelized over the number of permutations - pika::for_loop(pika::execution::par, to_sizet(0), to_sizet(nperms), [&](SizeType i_perm) { - if (!filter(perm_arr[i_perm])) - return; - for (std::size_t i_split = 0; i_split < splits.size() - 1; ++i_split) { - const SizeType split = splits[i_split]; + for (std::size_t i_split = 0; i_split < splits.size() - 1; ++i_split) { + const SizeType split = splits[i_split]; - GlobalElementIndex i_split_gl_in(split + in_offset, perm_arr[i_perm]); - GlobalElementIndex i_split_gl_out(split + out_begin.get(), out_begin.get() + i_perm); - TileElementSize region(splits[i_split + 1] - split, 1); + GlobalElementIndex i_split_gl_in(split + in_offset, perm_arr[i_perm]); + GlobalElementIndex i_split_gl_out(split + out_begin.get(), out_begin.get() + i_perm); + TileElementSize region(splits[i_split + 1] - split, 1); - if constexpr (C == Coord::Row) { - region.transpose(); - i_split_gl_in.transpose(); - i_split_gl_out.transpose(); - } + if constexpr (C == Coord::Row) { + region.transpose(); + i_split_gl_in.transpose(); + i_split_gl_out.transpose(); + } - const TileElementIndex i_subtile_in = subm_dist.tileElementIndex(i_split_gl_in); - const auto& tile_in = in_tiles_fut[to_sizet(subm_dist.globalTileLinearIndex(i_split_gl_in))].get(); - const TileElementIndex i_subtile_out = subm_dist.tileElementIndex(i_split_gl_out); - auto& tile_out = out_tiles[to_sizet(subm_dist.globalTileLinearIndex(i_split_gl_out))]; + const TileElementIndex i_subtile_in = subm_dist.tileElementIndex(i_split_gl_in); + const auto& tile_in = in_tiles_fut[to_sizet(subm_dist.globalTileLinearIndex(i_split_gl_in))].get(); + const TileElementIndex i_subtile_out = subm_dist.tileElementIndex(i_split_gl_out); + auto& tile_out = out_tiles[to_sizet(subm_dist.globalTileLinearIndex(i_split_gl_out))]; - dlaf::tile::lacpy(region, i_subtile_in, tile_in, i_subtile_out, tile_out); - } - }); + dlaf::tile::lacpy(region, i_subtile_in, tile_in, i_subtile_out, tile_out); + } } template void Permutations::call(const SizeType i_begin, const SizeType i_end, Matrix& perms, Matrix& mat_in, Matrix& mat_out) { - namespace ut = matrix::util; namespace ex = pika::execution::experimental; if (i_begin == i_end) @@ -188,7 +130,7 @@ void Permutations::call(const SizeType i_begin, const SizeType i_end const matrix::Distribution& distr = mat_in.distribution(); const SizeType m = distr.globalTileElementDistance(i_begin, i_end); const SizeType n = distr.globalTileElementDistance(i_begin, i_end); - matrix::Distribution subm_distr(LocalElementSize(m, n), distr.blockSize()); + matrix::Distribution subm_dist(LocalElementSize(m, n), distr.blockSize()); const SizeType ntiles = i_end - i_begin; auto perms_range = common::iterate_range2d(LocalTileIndex(i_begin, 0), LocalTileSize(ntiles, 1)); @@ -198,29 +140,68 @@ void Permutations::call(const SizeType i_begin, const SizeType i_end ex::when_all_vector(matrix::selectRead(mat_in, mat_range)), ex::when_all_vector(matrix::select(mat_out, mat_range))); - auto permute_fn = [subm_distr](const auto& index_tile_futs, const auto& mat_in_tiles, - const auto& mat_out_tiles, auto&&... ts) { - TileElementIndex zero(0, 0); - const SizeType* i_ptr = index_tile_futs[0].get().ptr(zero); - applyPermutations(GlobalElementIndex(0, 0), subm_distr.size(), 0, subm_distr, i_ptr, - mat_in_tiles, mat_out_tiles, std::forward(ts)...); - }; - ex::start_detached(dlaf::internal::transform(dlaf::internal::Policy(), std::move(permute_fn), - std::move(sender))); + if constexpr (D == Device::CPU) { + auto setup_permute_fn = [subm_dist](auto index_tile_futs, auto mat_in_tiles, auto mat_out_tiles) { + const GlobalElementIndex out_begin{0, 0}; + const SizeType in_offset = 0; + constexpr Coord orth_coord = orthogonal(C); + + std::vector splits = util::interleaveSplits( + subm_dist.size().get(), subm_dist.blockSize().get(), + subm_dist.distanceToAdjacentTile(in_offset), + subm_dist.distanceToAdjacentTile(out_begin.get())); + + return std::tuple(std::move(splits), std::move(index_tile_futs), std::move(mat_in_tiles), + std::move(mat_out_tiles)); + }; + + auto permute_fn = [subm_dist](const auto i_perm, const auto& splits, const auto& index_tile_futs, + const auto& mat_in_tiles, const auto& mat_out_tiles) { + const TileElementIndex zero(0, 0); + const SizeType* perm_arr = index_tile_futs[0].get().ptr(zero); + const GlobalElementIndex out_begin{0, 0}; + const SizeType in_offset = 0; + + [[maybe_unused]] const SizeType nperms = subm_dist.size().get(); + DLAF_ASSERT_HEAVY(i_perm >= 0 && i_perm < nperms, i_perm, nperms); + DLAF_ASSERT_HEAVY(perm_arr[i_perm] >= 0 && perm_arr[i_perm] < nperms, i_perm, nperms); + + applyPermutationOnCPU(i_perm, splits, out_begin, in_offset, subm_dist, perm_arr, + mat_in_tiles, mat_out_tiles); + }; + + ex::start_detached(std::move(sender) | + dlaf::internal::transform(dlaf::internal::Policy(), + std::move(setup_permute_fn)) | + ex::unpack() | ex::bulk(subm_dist.size().get(), std::move(permute_fn))); + } + else { +#if defined(DLAF_WITH_GPU) + auto permute_fn = [subm_dist](const auto& index_tile_futs, const auto& mat_in_tiles, + const auto& mat_out_tiles, whip::stream_t stream) { + TileElementIndex zero(0, 0); + const SizeType* i_ptr = index_tile_futs[0].get().ptr(zero); + + applyPermutationsOnDevice(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist, i_ptr, + mat_in_tiles, mat_out_tiles, stream); + }; + + ex::start_detached(std::move(sender) | + dlaf::internal::transform(dlaf::internal::Policy(), std::move(permute_fn))); +#endif + } } template auto whenAllReadWriteTilesArray(LocalTileIndex begin, LocalTileIndex end, Matrix& matrix) { const LocalTileSize sz{end.row() - begin.row(), end.col() - begin.col()}; namespace ex = pika::execution::experimental; - namespace ut = matrix::util; return ex::when_all_vector(matrix::select(matrix, common::iterate_range2d(begin, sz))); } template auto whenAllReadWriteTilesArray(Matrix& matrix) { namespace ex = pika::execution::experimental; - namespace ut = matrix::util; return ex::when_all_vector(matrix::select( matrix, common::iterate_range2d(LocalTileIndex(0, 0), matrix.distribution().localNrTiles()))); } @@ -229,14 +210,12 @@ template auto whenAllReadOnlyTilesArray(LocalTileIndex begin, LocalTileIndex end, Matrix& matrix) { const LocalTileSize sz{end.row() - begin.row(), end.col() - begin.col()}; namespace ex = pika::execution::experimental; - namespace ut = matrix::util; return ex::when_all_vector(matrix::selectRead(matrix, common::iterate_range2d(begin, sz))); } template auto whenAllReadOnlyTilesArray(Matrix& matrix) { namespace ex = pika::execution::experimental; - namespace ut = matrix::util; return ex::when_all_vector(matrix::selectRead( matrix, common::iterate_range2d(LocalTileIndex(0, 0), matrix.distribution().localNrTiles()))); } @@ -399,14 +378,180 @@ void applyPackingIndex(const matrix::Distribution& subm_dist, IndexMapSender&& i auto sender = ex::when_all(std::forward(index_map), std::forward(in), std::forward(out)); - auto permute_fn = [subm_dist](const auto& index_tile_futs, const auto& mat_in_tiles, - const auto& mat_out_tiles, auto&&... ts) { - const SizeType* i_ptr = index_tile_futs[0].get().ptr(); - applyPermutations(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist, i_ptr, - mat_in_tiles, mat_out_tiles, std::forward(ts)...); + if constexpr (D == Device::CPU) { + auto setup_permute_fn = [subm_dist](auto index_tile_futs, auto mat_in_tiles, auto mat_out_tiles) { + const GlobalElementIndex out_begin{0, 0}; + const SizeType in_offset = 0; + constexpr Coord orth_coord = orthogonal(C); + + std::vector splits = util::interleaveSplits( + subm_dist.size().get(), subm_dist.blockSize().get(), + subm_dist.distanceToAdjacentTile(in_offset), + subm_dist.distanceToAdjacentTile(out_begin.get())); + + return std::tuple(std::move(splits), std::move(index_tile_futs), std::move(mat_in_tiles), + std::move(mat_out_tiles)); + }; + + auto permute_fn = [subm_dist](const auto i_perm, const auto& splits, const auto& index_tile_futs, + const auto& mat_in_tiles, const auto& mat_out_tiles) { + TileElementIndex zero(0, 0); + const SizeType* perm_arr = index_tile_futs[0].get().ptr(zero); + const GlobalElementIndex out_begin{0, 0}; + const SizeType in_offset = 0; + + [[maybe_unused]] const SizeType nperms = subm_dist.size().get(); + DLAF_ASSERT_HEAVY(i_perm >= 0 && i_perm < nperms, i_perm, nperms); + DLAF_ASSERT_HEAVY(perm_arr[i_perm] >= 0 && perm_arr[i_perm] < nperms, i_perm, nperms); + + applyPermutationOnCPU(i_perm, splits, out_begin, in_offset, subm_dist, perm_arr, + mat_in_tiles, mat_out_tiles); + }; + + ex::start_detached(std::move(sender) | + dlaf::internal::transform(dlaf::internal::Policy(), + std::move(setup_permute_fn)) | + ex::unpack() | ex::bulk(subm_dist.size().get(), permute_fn)); + } + else { +#if defined(DLAF_WITH_GPU) + auto permute_fn = [subm_dist](const auto& index_tile_futs, const auto& mat_in_tiles, + const auto& mat_out_tiles, whip::stream_t stream) { + TileElementIndex zero(0, 0); + const SizeType* i_ptr = index_tile_futs[0].get().ptr(zero); + + applyPermutationsOnDevice(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist, i_ptr, + mat_in_tiles, mat_out_tiles, stream); + }; + + ex::start_detached(std::move(sender) | + dlaf::internal::transform(dlaf::internal::Policy(), + std::move(permute_fn))); +#endif + } +} + +template +void unpackLocalOnCPU(const matrix::Distribution& subm_dist, const matrix::Distribution& dist, + SendCountsSender&& send_counts, RecvCountsSender&& recv_counts, + UnpackingIndexSender&& unpacking_index, MatSendSender&& mat_send, + MatOutSender&& mat_out) { + namespace ex = pika::execution::experimental; + namespace di = dlaf::internal; + + auto setup_unpack_local_f = [subm_dist, + rank = dist.rankIndex().get()](auto send_counts, auto recv_counts, + auto index_tile_futs, auto mat_in_tiles, + auto mat_out_tiles) { + const size_t rank_index = to_sizet(rank); + + const SizeType* perm_arr = index_tile_futs[0].get().ptr(); + const GlobalElementSize sz = subm_dist.size(); + + const int a = std::accumulate(send_counts.cbegin(), send_counts.cbegin() + rank, 0); + const int b = a + send_counts[rank_index]; + + // Note: + // These are copied directly from mat_send, while unpacking permutation applies to indices on + // the receiver side. So, we have to "align" the unpacking permutation, by applying the offset + // existing between the send and recv side. + // This is due to the fact that send and recv buffers might be "unbalanced", e.g. rank1 sends 2 + // and receive 1 with rank0, so resulting in a shift in indices between the two buffer sides, + // following previous example the local part would start at index (0-based) 2 in mat_send and + // at index 1 in mat_recv. + const int a_r = std::accumulate(recv_counts.cbegin(), recv_counts.cbegin() + rank, 0); + const SizeType offset = to_SizeType(a - a_r); + std::vector perm_offseted; + perm_offseted.reserve(to_sizet(subm_dist.size().get())); + std::transform(perm_arr, perm_arr + subm_dist.size().get(), std::back_inserter(perm_offseted), + [offset](const SizeType perm) { return perm + offset; }); + + constexpr auto OC = orthogonal(C); + const SizeType in_offset = 0; + const GlobalElementIndex out_begin{0, 0}; + + std::vector splits = + dlaf::util::interleaveSplits(sz.get(), subm_dist.blockSize().get(), + subm_dist.distanceToAdjacentTile(in_offset), + subm_dist.distanceToAdjacentTile(out_begin.get())); + + return std::tuple(a, b, std::move(splits), std::move(perm_offseted), std::move(mat_in_tiles), + std::move(mat_out_tiles)); + }; + + auto permutations_unpack_local_f = [subm_dist](const auto i_perm, const auto a, const auto b, + const auto& splits, const auto& perm_offseted, + const auto& mat_in_tiles, const auto& mat_out_tiles) { + const SizeType* perm_arr = perm_offseted.data(); + + // [a, b) + if (a <= perm_arr[i_perm] && perm_arr[i_perm] < b) { + const SizeType in_offset = 0; + const GlobalElementIndex out_begin{0, 0}; + applyPermutationOnCPU(i_perm, splits, out_begin, in_offset, subm_dist, perm_arr, + mat_in_tiles, mat_out_tiles); + } + }; + + ex::start_detached( + ex::when_all(std::forward(send_counts), + std::forward(recv_counts), + std::forward(unpacking_index), + std::forward(mat_send), std::forward(mat_out)) | + di::transform(di::Policy(), std::move(setup_unpack_local_f)) | ex::unpack() | + ex::bulk(subm_dist.size().get(), std::move(permutations_unpack_local_f))); +} + +template +void unpackOthersOnCPU(const matrix::Distribution& subm_dist, const matrix::Distribution& dist, + RecvCountsSender&& recv_counts, UnpackingIndexSender&& unpacking_index, + MatRecvSender&& mat_recv, MatOutSender&& mat_out) { + namespace ex = pika::execution::experimental; + namespace di = dlaf::internal; + + auto setup_unpack_f = [subm_dist, + rank = dist.rankIndex().get()](auto recv_counts, auto index_tile_futs, + auto mat_in_tiles, auto mat_out_tiles) { + const size_t rank_index = to_sizet(rank); + const int a = std::accumulate(recv_counts.cbegin(), recv_counts.cbegin() + rank, 0); + const int b = a + recv_counts[rank_index]; + + constexpr auto OC = orthogonal(C); + const GlobalElementSize sz = subm_dist.size(); + const SizeType in_offset = 0; + const GlobalElementIndex out_begin{0, 0}; + + std::vector splits = + dlaf::util::interleaveSplits(sz.get(), subm_dist.blockSize().get(), + subm_dist.distanceToAdjacentTile(in_offset), + subm_dist.distanceToAdjacentTile(out_begin.get())); + + return std::tuple(a, b, std::move(splits), std::move(index_tile_futs), std::move(mat_in_tiles), + std::move(mat_out_tiles)); + }; + + auto permutations_unpack_f = [subm_dist](const auto i_perm, const auto a, const auto b, + const auto& splits, const auto& index_tile_futs, + const auto& mat_in_tiles, const auto& mat_out_tiles) { + const SizeType* perm_arr = index_tile_futs[0].get().ptr(); + + // [0, a) and [b, end) + if (perm_arr[i_perm] < a || b <= perm_arr[i_perm]) { + const SizeType in_offset = 0; + const GlobalElementIndex out_begin{0, 0}; + applyPermutationOnCPU(i_perm, splits, out_begin, in_offset, subm_dist, perm_arr, + mat_in_tiles, mat_out_tiles); + } }; - ex::start_detached(di::transform(di::Policy>(), std::move(permute_fn), - std::move(sender))); + + ex::start_detached(ex::when_all(std::forward(recv_counts), + std::forward(unpacking_index), + std::forward(mat_recv), + std::forward(mat_out)) | + di::transform(di::Policy(), std::move(setup_unpack_f)) | ex::unpack() | + ex::bulk(subm_dist.size().get(), std::move(permutations_unpack_f))); } template @@ -493,72 +638,17 @@ void permuteOnCPU(common::Pipeline& sub_task_chain, SizeType // - the last is the same, but it has to skip the part already done for local // LOCAL - auto unpack_local_f = [subm_dist, rank = dist.rankIndex().get()](const auto& send_counts, - const auto& recv_counts, - const auto& index_tile_futs, - const auto& mat_in_tiles, - const auto& mat_out_tiles) { - const size_t rank_index = to_sizet(rank); - - const SizeType* perm_arr = index_tile_futs[0].get().ptr(); - const GlobalElementSize sz = subm_dist.size(); - - const int a = std::accumulate(send_counts.cbegin(), send_counts.cbegin() + rank, 0); - const int b = a + send_counts[rank_index]; - - // Note: - // These are copied directly from mat_send, while unpacking permutation applies to indices on - // the receiver side. So, we have to "align" the unpacking permutation, by applying the offset - // existing between the send and recv side. - // This is due to the fact that send and recv buffers might be "unbalanced", e.g. rank1 sends 2 - // and receive 1 with rank0, so resulting in a shift in indices between the two buffer sides, - // following previous example the local part would start at index (0-based) 2 in mat_send and - // at index 1 in mat_recv. - const int a_r = std::accumulate(recv_counts.cbegin(), recv_counts.cbegin() + rank, 0); - const SizeType offset = to_SizeType(a - a_r); - std::vector perm_offseted(perm_arr, perm_arr + subm_dist.size().get()); - std::transform(perm_offseted.begin(), perm_offseted.end(), perm_offseted.begin(), - [offset](const SizeType perm) { return perm + offset; }); - - // [a, b) - applyPermutationsFiltered({0, 0}, sz, 0, subm_dist, perm_offseted.data(), mat_in_tiles, - mat_out_tiles, - [a, b](SizeType i_perm) { return i_perm >= a && i_perm < b; }); - }; - - ex::when_all(send_counts_sender, recv_counts_sender, whenAllReadOnlyTilesArray(unpacking_index), - whenAllReadOnlyTilesArray(mat_send), - whenAllReadWriteTilesArray(i_loc_begin, i_loc_end, mat_out)) | - di::transformDetach(di::Policy>(), std::move(unpack_local_f)); - + unpackLocalOnCPU(subm_dist, dist, send_counts_sender, recv_counts_sender, + whenAllReadOnlyTilesArray(unpacking_index), whenAllReadOnlyTilesArray(mat_send), + whenAllReadWriteTilesArray(i_loc_begin, i_loc_end, mat_out)); // COMMUNICATION-dependent all2allData(sub_task_chain, nranks, sz_loc, send_counts_sender, mat_send, recv_counts_sender, mat_recv); - - auto unpack_others_f = [subm_dist, rank = dist.rankIndex().get()](const auto& recv_counts, - const auto& index_tile_futs, - const auto& mat_in_tiles, - const auto& mat_out_tiles) { - const size_t rank_index = to_sizet(rank); - const int a = std::accumulate(recv_counts.cbegin(), recv_counts.cbegin() + rank, 0); - const int b = a + recv_counts[rank_index]; - - const SizeType* perm_arr = index_tile_futs[0].get().ptr(); - const GlobalElementSize sz = subm_dist.size(); - - // [0, a) - applyPermutationsFiltered({0, 0}, sz, 0, subm_dist, perm_arr, mat_in_tiles, mat_out_tiles, - [a](SizeType i_perm) { return i_perm < a; }); - - // [b, end) - applyPermutationsFiltered({0, 0}, sz, 0, subm_dist, perm_arr, mat_in_tiles, mat_out_tiles, - [b](SizeType i_perm) { return i_perm >= b; }); - }; - - ex::when_all(recv_counts_sender, whenAllReadOnlyTilesArray(unpacking_index), - whenAllReadOnlyTilesArray(mat_recv), - whenAllReadWriteTilesArray(i_loc_begin, i_loc_end, mat_out)) | - di::transformDetach(di::Policy>(), std::move(unpack_others_f)); + // OTHERS + unpackOthersOnCPU(subm_dist, dist, std::move(recv_counts_sender), + whenAllReadOnlyTilesArray(unpacking_index), + whenAllReadOnlyTilesArray(mat_recv), + whenAllReadWriteTilesArray(i_loc_begin, i_loc_end, mat_out)); } template diff --git a/spack/packages/dla-future/package.py b/spack/packages/dla-future/package.py index d6bb78d01a..3e4fb7577b 100644 --- a/spack/packages/dla-future/package.py +++ b/spack/packages/dla-future/package.py @@ -58,7 +58,8 @@ class DlaFuture(CMakePackage, CudaPackage, ROCmPackage): depends_on("pika@0.15.1:", when="@0.1") depends_on("pika@0.16:", when="@0.2.0") depends_on("pika@0.17:", when="@0.2.1:") - depends_on("pika-algorithms@0.1:") + depends_on("pika@0.18:", when="@master") + depends_on("pika-algorithms@0.1:", when="@:0.2") depends_on("pika +mpi") depends_on("pika +cuda", when="+cuda") depends_on("pika +rocm", when="+rocm") @@ -114,7 +115,7 @@ class DlaFuture(CMakePackage, CudaPackage, ROCmPackage): for cxxstd in cxxstds: depends_on("pika cxxstd={0}".format(cxxstd), when="cxxstd={0}".format(cxxstd)) - depends_on("pika-algorithms cxxstd={0}".format(cxxstd), when="cxxstd={0}".format(cxxstd)) + depends_on("pika-algorithms cxxstd={0}".format(cxxstd), when="@:0.2 cxxstd={0}".format(cxxstd)) variant("ci-test", default=False, description="Build for CI (Advanced usage).") conflicts("~miniapps", when="+ci-test") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b2e9d0093f..60ef344d90 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -77,7 +77,6 @@ target_link_libraries( DLAF::LAPACK $<$:DLAF::SCALAPACK> pika::pika - pika-algorithms::pika_algorithms lapackpp blaspp umpire