WIP fix intel
greole committed Jun 11, 2024
1 parent 5149568 commit 05a5d8c
Showing 5 changed files with 157 additions and 31 deletions.
9 changes: 9 additions & 0 deletions MatrixWrapper/CommunicationPattern/CommunicationPattern.C
@@ -103,6 +103,7 @@ compute_send_recv_counts(const ExecutorHandler &exec_handler,

label tot_recv_elements{0};
label comm_elements_buffer{0};
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
if (rank == owner_rank) {
// send and recv to itself
recv_offsets[owner_rank] = padding_before;
@@ -111,6 +112,7 @@ compute_send_recv_counts(const ExecutorHandler &exec_handler,
// the start of the next rank data
tot_recv_elements = padding_before + size + padding_after;

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
for (int i = 1; i < ranks_per_gpu; i++) {
// receive the recv counts
comm.recv(exec, &comm_elements_buffer, 1, rank + i, rank);
@@ -166,6 +168,13 @@ void communicate_values (
// send_buffer should be on the host
// recv_buffer should be on the device
// auto rank = comm.rank();
std::cout
<< __FILE__ << ":" << __LINE__
<< " send_counts " << send_counts
<< " recv_counts " << recv_counts
<< " send_offsets " << send_offsets
<< " recv_offsets " << recv_offsets
<< "\n";

comm.all_to_all_v(exec, send_buffer, send_counts.data(),
send_offsets.data(), recv_buffer, recv_counts.data(),
132 changes: 104 additions & 28 deletions MatrixWrapper/Distributed/Distributed.H
@@ -215,10 +215,30 @@ public:
local_sparsity_ = repart_loc_sparsity;
non_local_sparsity_ = repart_non_loc_sparsity;

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " build_localized_partition \n";
//<< " dim " << local_sparsity_->dim[0] << " send idxs size "
//<< dst_comm_pattern.send_idxs.size() << " target ids "
//<< dst_comm_pattern.target_ids << " target sizes "
//<< dst_comm_pattern.target_sizes << "\n";

auto localized_partition = local_part_type::build_from_blocked_recv(
exec, local_sparsity_->dim[0], dst_comm_pattern->send_idxs,
dst_comm_pattern->target_ids, dst_comm_pattern->target_sizes);

std::cout << __FILE__ << " rank " << rank << " local sparsity size "
<< local_sparsity_->size_ << " local sparsity dim ["
<< local_sparsity_->dim[0] << "x" << local_sparsity_->dim[1]
<< "] non_local sparsity size " << non_local_sparsity_->size_
<< " non local sparsity dim [" << non_local_sparsity_->dim[0]
<< "x" << non_local_sparsity_->dim[1] << "] target_ids "
<< dst_comm_pattern->target_ids << " target_sizes "
<< dst_comm_pattern->target_sizes << " target_send_idxs.size "
<< dst_comm_pattern->send_idxs.size()
<< " non_local_sparsity.size " << non_local_sparsity_->size_
<< " get_recv_indices "
<< localized_partition->get_recv_indices().get_num_elems()
<< " \n";

auto sparse_comm =
sparse_communicator::create(comm, localized_partition);
@@ -264,11 +284,15 @@ public:
non_local_sparsity_->row_idxs,
non_local_sparsity_->col_idxs, non_local_coeffs),
sparse_comm);
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " done read distributed \n";


update_impl(exec_handler, matrix_format, repartitioner, host_A, dist_A,
local_sparsity_, non_local_sparsity_, src_comm_pattern,
local_interfaces);
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " done update impl \n";

auto ret = std::make_shared<RepartDistMatrix>(
exec, comm, repartitioner.get_repart_dim(), dist_A->get_size(),
@@ -305,6 +329,8 @@ public:
auto exec = exec_handler.get_ref_exec();
auto device_exec = exec_handler.get_device_exec();
auto ranks_per_gpu = repartitioner.get_ranks_per_gpu();
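// if set, communicated values are first gathered into host buffers and
// copied to the device afterwards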
bool requires_host_buffer = exec_handler.get_gko_force_host_buffer();

label rank{repartitioner.get_rank(exec_handler)};
label owner_rank = repartitioner.get_owner_rank(exec_handler);
bool owner = repartitioner.is_owner(exec_handler);
@@ -314,29 +340,43 @@ public:
auto diag_comm_pattern = compute_send_recv_counts(
exec_handler, ranks_per_gpu, nrows, local_matrix_nnz,
local_matrix_nnz - nrows, 0);
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " diag comm pattern \n";


label upper_nnz = host_A->get_upper_nnz();
auto upper_comm_pattern = compute_send_recv_counts(
exec_handler, ranks_per_gpu, upper_nnz, local_matrix_nnz, 0,
local_matrix_nnz - upper_nnz);
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " upper comm pattern \n";
auto lower_comm_pattern =
compute_send_recv_counts(exec_handler, ranks_per_gpu, upper_nnz,
local_matrix_nnz, upper_nnz, nrows);

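// local_ptr points at the destination of the communicated values;
// local_ptr_2 keeps the device pointer when a host staging buffer is in use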
scalar *local_ptr;
scalar *local_ptr_2;
label nnz = 0;

// update main values
std::vector<scalar> loc_buffer;
if (owner) {
using Coo = gko::matrix::Coo<scalar, label>;
auto local_mtx = dist_A->get_local_matrix();


std::shared_ptr<const Coo> local =
gko::as<Coo>(gko::as<CombinationMatrix<scalar, label, Coo>>(
dist_A->get_local_matrix())
->get_combination()
->get_operators()[0]);
local_ptr = const_cast<scalar *>(local->get_const_values());
if (requires_host_buffer) {
loc_buffer.resize(local->get_num_stored_elements());
local_ptr = loc_buffer.data();
local_ptr_2 = const_cast<scalar *>(local->get_const_values());
} else {
local_ptr = const_cast<scalar *>(local->get_const_values());
}
}
communicate_values(exec_handler, diag_comm_pattern, host_A->get_diag(),
local_ptr);
@@ -352,6 +392,18 @@ public:
communicate_values(exec_handler, lower_comm_pattern,
host_A->get_lower(), local_ptr);
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " done comm local mtx \n";

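// copy the values staged in the host buffer over to the device buffer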
if (requires_host_buffer) {
auto host_buffer_view =
gko::array<scalar>::view(exec, nnz, local_ptr);
auto target_buffer_view =
gko::array<scalar>::view(device_exec, nnz, local_ptr_2);
target_buffer_view = host_buffer_view;
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " done copy to device \n";

// copy interface values
auto comm = *exec_handler.get_communicator().get();
@@ -364,6 +416,7 @@ public:
label tag = 0;
label comm_rank, comm_size;
scalar *recv_buffer_ptr;
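// optional host staging buffer for received interface values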
std::vector<scalar> host_recv_buffer;
label remain_host_interfaces = host_A->get_interface_size();
for (auto [is_local, comm_rank] : local_interfaces) {
label &ctr = (is_local) ? loc_ctr : nloc_ctr;
@@ -383,9 +436,18 @@
comm_size =
non_local_sparsity->interface_spans[ctr].length();
}
recv_buffer_ptr = const_cast<scalar *>(mtx->get_const_values());

if (requires_host_buffer) {
host_recv_buffer.resize(comm_size);
recv_buffer_ptr = host_recv_buffer.data();
} else {
recv_buffer_ptr = const_cast<scalar *>(mtx->get_const_values());
}

if (comm_rank != rank) {
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " comm_rank " << comm_rank << " rank " << rank << " \n";

comm.recv(exec, recv_buffer_ptr, comm_size, comm_rank, tag);
} else {
// if data is already on this rank
@@ -427,33 +489,47 @@ public:
}
}

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " reorder \n";
// reorder updated values
if (owner) {
// NOTE local sparsity size includes the interfaces
using Coo = gko::matrix::Coo<scalar, label>;
using dim_type = gko::dim<2>::dimension_type;
std::shared_ptr<const Coo> local =
gko::as<Coo>(gko::as<CombinationMatrix<scalar, label, Coo>>(
dist_A->get_local_matrix())
->get_combination()
->get_operators()[0]);
auto local_elements = local->get_num_stored_elements();
local_ptr = const_cast<scalar *>(local->get_const_values());
// TODO make sure this doesn't copy
// create a non owning dense matrix of local_values

auto row_collection = gko::share(gko::matrix::Dense<scalar>::create(
exec, gko::dim<2>{static_cast<dim_type>(local_elements), 1},
gko::array<scalar>::view(exec, local_elements, local_ptr), 1));

auto mapping_view = gko::array<label>::view(
exec, local_elements, local_sparsity->ldu_mapping.get_data());


// TODO this needs to copy ldu_mapping to the device
auto dense_vec = row_collection->clone();
dense_vec->row_gather(&mapping_view, row_collection.get());
}
if (owner) {
// NOTE local sparsity size includes the interfaces
using Coo = gko::matrix::Coo<scalar, label>;
using dim_type = gko::dim<2>::dimension_type;
std::shared_ptr<const Coo> local =
gko::as<Coo>(gko::as<CombinationMatrix<scalar, label, Coo>>(
dist_A->get_local_matrix())
->get_combination()
->get_operators()[0]);
auto local_elements = local->get_num_stored_elements();
local_ptr = const_cast<scalar *>(local->get_const_values());
// TODO make sure this doesn't copy
// create a non owning dense matrix of local_values

auto row_collection = gko::share(gko::matrix::Dense<scalar>::create(
device_exec, gko::dim<2>{static_cast<dim_type>(local_elements), 1},
gko::array<scalar>::view(device_exec, local_elements, local_ptr), 1));
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << " local_elements " << local_elements
<< " reorder \n";

auto mapping_view = gko::array<label>::view(
exec, local_elements, local_sparsity->ldu_mapping.get_data());
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " reorder \n";


// TODO this needs to copy ldu_mapping to the device
auto dense_vec = row_collection->clone();
//auto dense_vec = gko::share(gko::matrix::Dense<scalar>::create(exec, row_collection->get_size()));

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " reorder \n";
dense_vec->row_gather(&mapping_view, row_collection.get());
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " reorder \n";
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< "done reorder \n";
};

RepartDistMatrix(
10 changes: 10 additions & 0 deletions MatrixWrapper/SparsityPattern/SparsityPattern.H
@@ -47,6 +47,16 @@ struct SparsityPattern {
rank(std::vector<label>{})
{}

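// copy constructor taking a shared_ptr to an existing pattern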
SparsityPattern(std::shared_ptr<const SparsityPattern> other)
: size_(other->size_),
row_idxs(other->row_idxs),
col_idxs(other->col_idxs),
ldu_mapping(other->ldu_mapping),
dim(other->dim),
interface_spans(other->interface_spans),
rank(other->rank)
{}

SparsityPattern(std::shared_ptr<const gko::Executor> exec, label size)
: size_(size),
row_idxs{exec, static_cast<gko::size_type>(size_)},
2 changes: 1 addition & 1 deletion Preconditioner/Preconditioner.H
@@ -262,7 +262,7 @@ public:
auto smoother_gen = gko::share(
ir::build()
.with_solver(inner_solver_gen)
.with_relaxation_factor(0.9)
//.with_relaxation_factor(0.9)
.with_criteria(
gko::stop::Iteration::build().with_max_iters(2u).on(
device_exec))
35 changes: 33 additions & 2 deletions Repartitioner/Repartitioner.H
@@ -214,7 +214,8 @@ public:
** signals whether this is a new local interface (no communication), and the
** second entry (label) tracks the original rank of the interface
*/
std::tuple<std::shared_ptr<SparsityPattern>,
std::tuple<
std::shared_ptr<SparsityPattern>,
std::shared_ptr<SparsityPattern>,
std::vector<std::pair<bool, label>>>
repartition_sparsity(
@@ -238,19 +239,44 @@ public:
label rank = get_rank(exec_handler);
label owner_rank = get_owner_rank(exec_handler);
label ranks_per_gpu = ranks_per_gpu_;
// TODO don't copy
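// with a single rank per GPU nothing has to be repartitioned;
// return copies of the input patterns and keep the original interface ranks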
if (ranks_per_gpu == 1) {
std::vector<std::pair<bool, label>> ret;
for (auto comm_rank : src_non_local_pattern->rank) {
ret.emplace_back(false, rank);
}


return std::make_tuple<
std::shared_ptr<SparsityPattern>,
std::shared_ptr<SparsityPattern>,
std::vector<std::pair<bool, label>>>(
std::make_shared<SparsityPattern>(src_local_pattern),
std::make_shared<SparsityPattern>(src_non_local_pattern),
std::move(ret)
);
}

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
auto local_comm_pattern = compute_send_recv_counts(
exec_handler, ranks_per_gpu, src_local_pattern->size_);
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << " owner rank " << owner_rank << "\n";


label offset = orig_partition_->get_range_bounds()[rank] -
label offset = 0;
if (ranks_per_gpu != 1) {
offset = orig_partition_->get_range_bounds()[rank] -
orig_partition_->get_range_bounds()[owner_rank];
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";

auto gather_closure = [&](auto &comm_pattern, auto &data,
label offset) {
return gather_to_owner(exec_handler, comm_pattern, data.get_size(),
data.get_data(), offset);
};

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
SparsityPatternVector merged_local{
gather_closure(local_comm_pattern, src_local_pattern->row_idxs,
offset),
Expand All @@ -267,6 +293,7 @@ public:
make_ldu_mapping_consecutive(
local_comm_pattern, merged_local.mapping, rank, ranks_per_gpu);
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";

label rows =
(is_owner(exec_handler)) ? merged_local.rows.back() + 1 : 0;
Expand All @@ -284,6 +311,7 @@ public:
spans_begin.push_back(elem.begin);
spans_end.push_back(elem.end);
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";

// the non local cols are in local idx of other side
// thus we need the new offset of the other side
Expand All @@ -300,6 +328,7 @@ public:
std::transform(data, data + size, data,
[&](label idx) { return idx + local_offset; });
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";

SparsityPatternVector merged_non_local{
gather_closure(non_local_comm_pattern,
Expand Down Expand Up @@ -328,6 +357,7 @@ public:
// build vector with locality information
std::vector<std::pair<bool, label>> locality;
label ctr{0};
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";

if (is_owner(exec_handler)) {
auto recv_counts = std::get<1>(span_comm_pattern);
Expand All @@ -354,6 +384,7 @@ public:
gathered_non_local.cols[i] = i;
}

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
LOG_1(verbose_, "done repartition sparsity pattern")
if (is_owner(exec_handler)) {
auto new_local_spars_pattern = std::make_shared<SparsityPattern>(
Expand Down
