WIP fix intel
greole committed Jun 11, 2024
1 parent 5149568 commit 05a5d8c
Showing 5 changed files with 157 additions and 31 deletions.
9 changes: 9 additions & 0 deletions MatrixWrapper/CommunicationPattern/CommunicationPattern.C
@@ -103,6 +103,7 @@ compute_send_recv_counts(const ExecutorHandler &exec_handler,

label tot_recv_elements{0};
label comm_elements_buffer{0};
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
if (rank == owner_rank) {
// send and recv to itself
recv_offsets[owner_rank] = padding_before;
@@ -111,6 +112,7 @@ compute_send_recv_counts(const ExecutorHandler &exec_handler,
// the start of the next rank data
tot_recv_elements = padding_before + size + padding_after;

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
for (int i = 1; i < ranks_per_gpu; i++) {
// receive the recv counts
comm.recv(exec, &comm_elements_buffer, 1, rank + i, rank);
@@ -166,6 +168,13 @@ void communicate_values (
// send_buffer should be on the host
// recv_buffer should be on the device
// auto rank = comm.rank();
std::cout
<< __FILE__ << ":" << __LINE__
<< " send_counts " << send_counts
<< " recv_counts " << recv_counts
<< " send_offsets " << send_offsets
<< " recv_offsets " << recv_offsets
<< "\n";

comm.all_to_all_v(exec, send_buffer, send_counts.data(),
send_offsets.data(), recv_buffer, recv_counts.data(),
132 changes: 104 additions & 28 deletions MatrixWrapper/Distributed/Distributed.H
@@ -215,10 +215,30 @@ public:
local_sparsity_ = repart_loc_sparsity;
non_local_sparsity_ = repart_non_loc_sparsity;

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " build_localized_partition \n";
//<< " dim " << local_sparsity_->dim[0] << " send idxs size "
//<< dst_comm_pattern.send_idxs.size() << " target ids "
//<< dst_comm_pattern.target_ids << " target sizes "
//<< dst_comm_pattern.target_sizes << "\n";

auto localized_partition = local_part_type::build_from_blocked_recv(
exec, local_sparsity_->dim[0], dst_comm_pattern->send_idxs,
dst_comm_pattern->target_ids, dst_comm_pattern->target_sizes);

std::cout << __FILE__ << " rank " << rank << " local sparsity size "
<< local_sparsity_->size_ << " local sparsity dim ["
<< local_sparsity_->dim[0] << "x" << local_sparsity_->dim[1]
<< "] non_local sparsity size " << non_local_sparsity_->size_
<< " non local sparsity dim [" << non_local_sparsity_->dim[0]
<< "x" << non_local_sparsity_->dim[1] << "] target_ids "
<< dst_comm_pattern->target_ids << " target_sizes "
<< dst_comm_pattern->target_sizes << " target_send_idxs.size "
<< dst_comm_pattern->send_idxs.size()
<< " non_local_sparsity.size " << non_local_sparsity_->size_
<< " get_recv_indices "
<< localized_partition->get_recv_indices().get_num_elems()
<< " \n";

auto sparse_comm =
sparse_communicator::create(comm, localized_partition);
@@ -264,11 +284,15 @@ public:
non_local_sparsity_->row_idxs,
non_local_sparsity_->col_idxs, non_local_coeffs),
sparse_comm);
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " done read distributed \n";


update_impl(exec_handler, matrix_format, repartitioner, host_A, dist_A,
local_sparsity_, non_local_sparsity_, src_comm_pattern,
local_interfaces);
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " done update impl \n";

auto ret = std::make_shared<RepartDistMatrix>(
exec, comm, repartitioner.get_repart_dim(), dist_A->get_size(),
@@ -305,6 +329,8 @@ public:
auto exec = exec_handler.get_ref_exec();
auto device_exec = exec_handler.get_device_exec();
auto ranks_per_gpu = repartitioner.get_ranks_per_gpu();
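// if set, communicated values are first gathered into host buffers and
// copied to the device afterwards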
bool requires_host_buffer = exec_handler.get_gko_force_host_buffer();

label rank{repartitioner.get_rank(exec_handler)};
label owner_rank = repartitioner.get_owner_rank(exec_handler);
bool owner = repartitioner.is_owner(exec_handler);
@@ -314,29 +340,43 @@ public:
auto diag_comm_pattern = compute_send_recv_counts(
exec_handler, ranks_per_gpu, nrows, local_matrix_nnz,
local_matrix_nnz - nrows, 0);
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " diag comm pattern \n";


label upper_nnz = host_A->get_upper_nnz();
auto upper_comm_pattern = compute_send_recv_counts(
exec_handler, ranks_per_gpu, upper_nnz, local_matrix_nnz, 0,
local_matrix_nnz - upper_nnz);
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " upper comm pattern \n";
auto lower_comm_pattern =
compute_send_recv_counts(exec_handler, ranks_per_gpu, upper_nnz,
local_matrix_nnz, upper_nnz, nrows);

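// local_ptr points at the destination of the communicated values;
// local_ptr_2 keeps the device pointer when a host staging buffer is in use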
scalar *local_ptr;
scalar *local_ptr_2;
label nnz = 0;

// update main values
std::vector<scalar> loc_buffer;
if (owner) {
using Coo = gko::matrix::Coo<scalar, label>;
auto local_mtx = dist_A->get_local_matrix();


std::shared_ptr<const Coo> local =
gko::as<Coo>(gko::as<CombinationMatrix<scalar, label, Coo>>(
dist_A->get_local_matrix())
->get_combination()
->get_operators()[0]);
local_ptr = const_cast<scalar *>(local->get_const_values());
if (requires_host_buffer) {
loc_buffer.resize(local->get_num_stored_elements());
local_ptr = loc_buffer.data();
local_ptr_2 = const_cast<scalar *>(local->get_const_values());
} else {
local_ptr = const_cast<scalar *>(local->get_const_values());
}
}
communicate_values(exec_handler, diag_comm_pattern, host_A->get_diag(),
local_ptr);
@@ -352,6 +392,18 @@ public:
communicate_values(exec_handler, lower_comm_pattern,
host_A->get_lower(), local_ptr);
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " done comm local mtx \n";

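// copy the values staged in the host buffer over to the device buffer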
if (requires_host_buffer) {
auto host_buffer_view =
gko::array<scalar>::view(exec, nnz, local_ptr);
auto target_buffer_view =
gko::array<scalar>::view(device_exec, nnz, local_ptr_2);
target_buffer_view = host_buffer_view;
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " done copy to device \n";

// copy interface values
auto comm = *exec_handler.get_communicator().get();
@@ -364,6 +416,7 @@ public:
label tag = 0;
label comm_rank, comm_size;
scalar *recv_buffer_ptr;
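// optional host staging buffer for received interface values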
std::vector<scalar> host_recv_buffer;
label remain_host_interfaces = host_A->get_interface_size();
for (auto [is_local, comm_rank] : local_interfaces) {
label &ctr = (is_local) ? loc_ctr : nloc_ctr;
@@ -383,9 +436,18 @@
comm_size =
non_local_sparsity->interface_spans[ctr].length();
}
recv_buffer_ptr = const_cast<scalar *>(mtx->get_const_values());

if (requires_host_buffer) {
host_recv_buffer.resize(comm_size);
recv_buffer_ptr = host_recv_buffer.data();
} else {
recv_buffer_ptr = const_cast<scalar *>(mtx->get_const_values());
}

if (comm_rank != rank) {
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " comm_rank " << comm_rank << " rank " << rank << " \n";

comm.recv(exec, recv_buffer_ptr, comm_size, comm_rank, tag);
} else {
// if data is already on this rank
@@ -427,33 +489,47 @@ public:
}
}

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " reorder \n";
// reorder updated values
if (owner) {
// NOTE local sparsity size includes the interfaces
using Coo = gko::matrix::Coo<scalar, label>;
using dim_type = gko::dim<2>::dimension_type;
std::shared_ptr<const Coo> local =
gko::as<Coo>(gko::as<CombinationMatrix<scalar, label, Coo>>(
dist_A->get_local_matrix())
->get_combination()
->get_operators()[0]);
auto local_elements = local->get_num_stored_elements();
local_ptr = const_cast<scalar *>(local->get_const_values());
// TODO make sure this doesn't copy
// create a non owning dense matrix of local_values

auto row_collection = gko::share(gko::matrix::Dense<scalar>::create(
exec, gko::dim<2>{static_cast<dim_type>(local_elements), 1},
gko::array<scalar>::view(exec, local_elements, local_ptr), 1));

auto mapping_view = gko::array<label>::view(
exec, local_elements, local_sparsity->ldu_mapping.get_data());


// TODO this needs to copy ldu_mapping to the device
auto dense_vec = row_collection->clone();
dense_vec->row_gather(&mapping_view, row_collection.get());
}
if (owner) {
// NOTE local sparsity size includes the interfaces
using Coo = gko::matrix::Coo<scalar, label>;
using dim_type = gko::dim<2>::dimension_type;
std::shared_ptr<const Coo> local =
gko::as<Coo>(gko::as<CombinationMatrix<scalar, label, Coo>>(
dist_A->get_local_matrix())
->get_combination()
->get_operators()[0]);
auto local_elements = local->get_num_stored_elements();
local_ptr = const_cast<scalar *>(local->get_const_values());
// TODO make sure this doesn't copy
// create a non owning dense matrix of local_values

auto row_collection = gko::share(gko::matrix::Dense<scalar>::create(
device_exec, gko::dim<2>{static_cast<dim_type>(local_elements), 1},
gko::array<scalar>::view(device_exec, local_elements, local_ptr), 1));
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << " local_elements " << local_elements
<< " reorder \n";

auto mapping_view = gko::array<label>::view(
exec, local_elements, local_sparsity->ldu_mapping.get_data());
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " reorder \n";


// TODO this needs to copy ldu_mapping to the device
auto dense_vec = row_collection->clone();
//auto dense_vec = gko::share(gko::matrix::Dense<scalar>::create(exec, row_collection->get_size()));

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " reorder \n";
dense_vec->row_gather(&mapping_view, row_collection.get());
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< " reorder \n";
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank
<< "done reorder \n";
};

RepartDistMatrix(
10 changes: 10 additions & 0 deletions MatrixWrapper/SparsityPattern/SparsityPattern.H
@@ -47,6 +47,16 @@ struct SparsityPattern {
rank(std::vector<label>{})
{}

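// copy constructor taking a shared_ptr to an existing pattern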
SparsityPattern(std::shared_ptr<const SparsityPattern> other)
: size_(other->size_),
row_idxs(other->row_idxs),
col_idxs(other->col_idxs),
ldu_mapping(other->ldu_mapping),
dim(other->dim),
interface_spans(other->interface_spans),
rank(other->rank)
{}

SparsityPattern(std::shared_ptr<const gko::Executor> exec, label size)
: size_(size),
row_idxs{exec, static_cast<gko::size_type>(size_)},
2 changes: 1 addition & 1 deletion Preconditioner/Preconditioner.H
@@ -262,7 +262,7 @@ public:
auto smoother_gen = gko::share(
ir::build()
.with_solver(inner_solver_gen)
.with_relaxation_factor(0.9)
//.with_relaxation_factor(0.9)
.with_criteria(
gko::stop::Iteration::build().with_max_iters(2u).on(
device_exec))
35 changes: 33 additions & 2 deletions Repartitioner/Repartitioner.H
@@ -214,7 +214,8 @@ public:
** signals whether this is a new local interface (no communication), and the
** second entry (label) tracks the original rank of the interface
*/
std::tuple<std::shared_ptr<SparsityPattern>,
std::tuple<
std::shared_ptr<SparsityPattern>,
std::shared_ptr<SparsityPattern>,
std::vector<std::pair<bool, label>>>
repartition_sparsity(
@@ -238,19 +239,44 @@ public:
label rank = get_rank(exec_handler);
label owner_rank = get_owner_rank(exec_handler);
label ranks_per_gpu = ranks_per_gpu_;
// TODO don't copy
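// with a single rank per GPU nothing has to be repartitioned;
// return copies of the input patterns and keep the original interface ranks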
if (ranks_per_gpu == 1) {
std::vector<std::pair<bool, label>> ret;
for (auto comm_rank : src_non_local_pattern->rank) {
ret.emplace_back(false, rank);
}


return std::make_tuple<
std::shared_ptr<SparsityPattern>,
std::shared_ptr<SparsityPattern>,
std::vector<std::pair<bool, label>>>(
std::make_shared<SparsityPattern>(src_local_pattern),
std::make_shared<SparsityPattern>(src_non_local_pattern),
std::move(ret)
);
}

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
auto local_comm_pattern = compute_send_recv_counts(
exec_handler, ranks_per_gpu, src_local_pattern->size_);
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << " owner rank " << owner_rank << "\n";


label offset = orig_partition_->get_range_bounds()[rank] -
label offset = 0;
if (ranks_per_gpu != 1) {
offset = orig_partition_->get_range_bounds()[rank] -
orig_partition_->get_range_bounds()[owner_rank];
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";

auto gather_closure = [&](auto &comm_pattern, auto &data,
label offset) {
return gather_to_owner(exec_handler, comm_pattern, data.get_size(),
data.get_data(), offset);
};

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
SparsityPatternVector merged_local{
gather_closure(local_comm_pattern, src_local_pattern->row_idxs,
offset),
Expand All @@ -267,6 +293,7 @@ public:
make_ldu_mapping_consecutive(
local_comm_pattern, merged_local.mapping, rank, ranks_per_gpu);
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";

label rows =
(is_owner(exec_handler)) ? merged_local.rows.back() + 1 : 0;
Expand All @@ -284,6 +311,7 @@ public:
spans_begin.push_back(elem.begin);
spans_end.push_back(elem.end);
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";

// the non local cols are in local idx of other side
// thus we need the new offset of the other side
Expand All @@ -300,6 +328,7 @@ public:
std::transform(data, data + size, data,
[&](label idx) { return idx + local_offset; });
}
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";

SparsityPatternVector merged_non_local{
gather_closure(non_local_comm_pattern,
Expand Down Expand Up @@ -328,6 +357,7 @@ public:
// build vector with locality information
std::vector<std::pair<bool, label>> locality;
label ctr{0};
std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";

if (is_owner(exec_handler)) {
auto recv_counts = std::get<1>(span_comm_pattern);
Expand All @@ -354,6 +384,7 @@ public:
gathered_non_local.cols[i] = i;
}

std::cout << __FILE__ << ":" << __LINE__ << " rank " << rank << "\n";
LOG_1(verbose_, "done repartition sparsity pattern")
if (is_owner(exec_handler)) {
auto new_local_spars_pattern = std::make_shared<SparsityPattern>(
Expand Down
