Skip to content

Commit

Permalink
WIP: CArray type
Browse files Browse the repository at this point in the history
  • Loading branch information
Adrian-Diaz committed Nov 20, 2024
1 parent 3632dda commit 8d8a347
Show file tree
Hide file tree
Showing 5 changed files with 379 additions and 196 deletions.
16 changes: 8 additions & 8 deletions examples/ann_distributed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ std::vector <size_t> num_nodes_in_layer = {64000, 30000, 8000, 4000, 2000, 1000,
// array of ANN structs
struct ANNLayer_t{
//input map will store every global id in the vector for simplicity of row-vector products in this example
TpetraPartitionMap<long long int> output_partition_map; //map with all comms for row-vector product
TpetraPartitionMap<long long int> output_unique_map; //submap of uniquely decomposed indices
TpetraPartitionMap<> output_partition_map; //map with all comms for row-vector product
TpetraPartitionMap<> output_unique_map; //submap of uniquely decomposed indices
TpetraDFArray<real_t> distributed_outputs;
TpetraDFArray<real_t> distributed_weights;
TpetraDFArray<real_t> distributed_biases;
Expand Down Expand Up @@ -247,17 +247,17 @@ int main(int argc, char* argv[])
CArray <ANNLayer_t> ANNLayers(num_layers); // starts at 1 and goes to num_layers

// input and output values to ANN
TpetraPartitionMap<long long int> input_pmap, input_unique_pmap;
TpetraPartitionMap<> input_pmap, input_unique_pmap;
DCArrayKokkos<long long int> all_layer_indices(num_nodes_in_layer[0]);
FOR_ALL(i,0,num_nodes_in_layer[0], {
all_layer_indices(i) = i;
});
all_layer_indices.update_host(); // copy inputs to device
//map of all indices in this layer to be used for row-vector product (in practice, this would not include all indices in the layer)
input_pmap = TpetraPartitionMap<long long int>(all_layer_indices);
input_pmap = TpetraPartitionMap<>(all_layer_indices);

//map that decomposes indices of this onto set of processes uniquely (used to demonstrate comms for above)
input_unique_pmap = TpetraPartitionMap<long long int>(num_nodes_in_layer[0]);
input_unique_pmap = TpetraPartitionMap<>(num_nodes_in_layer[0]);
TpetraDFArray<real_t> inputs(input_pmap); //rows decomposed onto processes
//coming from subview requires both the original map and the submap to be composed of contiguous indices
inputs.own_comm_setup(input_unique_pmap); //tells the vector its communicating from a contiguous subset of its own data
Expand All @@ -275,8 +275,8 @@ int main(int argc, char* argv[])
all_current_layer_indices(i) = i;
});

ANNLayers(layer).output_partition_map = TpetraPartitionMap<long long int>(all_current_layer_indices);
ANNLayers(layer).output_unique_map = TpetraPartitionMap<long long int>(num_nodes_in_layer[layer+1]);
ANNLayers(layer).output_partition_map = TpetraPartitionMap<>(all_current_layer_indices);
ANNLayers(layer).output_unique_map = TpetraPartitionMap<>(num_nodes_in_layer[layer+1]);
ANNLayers(layer).distributed_outputs = TpetraDFArray<real_t> (ANNLayers(layer).output_partition_map);
//coming from subview requires both the original map and the submap to be composed of contiguous indices
ANNLayers(layer).distributed_outputs.own_comm_setup(ANNLayers(layer).output_unique_map);
Expand Down Expand Up @@ -422,7 +422,7 @@ int main(int argc, char* argv[])
output_grid.print();

//get repartitioned map to distribute new arrays with it
TpetraPartitionMap<long long int> partitioned_output_map = output_grid.pmap;
TpetraPartitionMap<> partitioned_output_map = output_grid.pmap;
TpetraDFArray<real_t> partitioned_output_values(partitioned_output_map, "partitioned output values");

//construct a unique source vector from ANN output using the subview constructor
Expand Down
46 changes: 23 additions & 23 deletions examples/ann_distributed_crs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,11 @@ std::vector <size_t> num_nodes_in_layer = {64000, 30000, 8000, 4000, 2000, 1000,
// array of ANN structs
struct ANNLayer_t{
//input map will store every global id in the vector for simplicity of row-vector products in this example
TpetraPartitionMap<long long int> output_partition_map; //map with all comms for row-vector product
TpetraPartitionMap<long long int> output_unique_map; //submap of uniquely decomposed indices
TpetraDFArray<real_t> distributed_outputs;
TpetraPartitionMap<> output_partition_map; //map with all comms for row-vector product
TpetraPartitionMap<> output_unique_map; //submap of uniquely decomposed indices
TpetraDCArray<real_t> distributed_outputs;
TpetraDCArray<real_t> distributed_weights;
TpetraDFArray<real_t> distributed_biases;
TpetraDCArray<real_t> distributed_biases;

}; // end struct

Expand All @@ -93,8 +93,8 @@ struct ANNLayer_t{
// functions
//
// =================================================================
void vec_mat_multiply(TpetraDFArray<real_t> &inputs,
TpetraDFArray<real_t> &outputs,
void vec_mat_multiply(TpetraDCArray<real_t> &inputs,
TpetraDCArray<real_t> &outputs,
TpetraDCArray<real_t> &matrix){

const size_t num_i = inputs.size();
Expand Down Expand Up @@ -144,10 +144,10 @@ float sigmoid_derivative(const float value){



void forward_propagate_layer(TpetraDFArray<real_t> &inputs,
TpetraDFArray<real_t> &outputs,
void forward_propagate_layer(TpetraDCArray<real_t> &inputs,
TpetraDCArray<real_t> &outputs,
TpetraDCArray<real_t> &weights,
const TpetraDFArray<real_t> &biases){
const TpetraDCArray<real_t> &biases){

const size_t num_i = inputs.size();
const size_t num_j = outputs.submap_size();
Expand Down Expand Up @@ -199,7 +199,7 @@ void forward_propagate_layer(TpetraDFArray<real_t> &inputs,
}; // end function


void set_biases(const TpetraDFArray<real_t> &biases){
void set_biases(const TpetraDCArray<real_t> &biases){
const size_t num_j = biases.size();

FOR_ALL(j,0,num_j, {
Expand Down Expand Up @@ -246,18 +246,18 @@ int main(int argc, char* argv[])
CArray <ANNLayer_t> ANNLayers(num_layers); // starts at 1 and goes to num_layers

// input and output values to ANN
TpetraPartitionMap<long long int> input_pmap, input_unique_pmap;
TpetraPartitionMap<> input_pmap, input_unique_pmap;
DCArrayKokkos<long long int> all_layer_indices(num_nodes_in_layer[0]);
FOR_ALL(i,0,num_nodes_in_layer[0], {
all_layer_indices(i) = i;
});
all_layer_indices.update_host(); // copy inputs to device
//map of all indices in this layer to be used for row-vector product (in practice, this would not include all indices in the layer)
input_pmap = TpetraPartitionMap<long long int>(all_layer_indices);
input_pmap = TpetraPartitionMap<>(all_layer_indices);

//map that decomposes indices of this onto set of processes uniquely (used to demonstrate comms for above)
input_unique_pmap = TpetraPartitionMap<long long int>(num_nodes_in_layer[0]);
TpetraDFArray<real_t> inputs(input_pmap); //rows decomposed onto processes
input_unique_pmap = TpetraPartitionMap<>(num_nodes_in_layer[0]);
TpetraDCArray<real_t> inputs(input_pmap); //rows decomposed onto processes
//coming from subview requires both the original map and the submap to be composed of contiguous indices
inputs.own_comm_setup(input_unique_pmap); //tells the vector its communicating from a contiguous subset of its own data

Expand All @@ -274,14 +274,14 @@ int main(int argc, char* argv[])
all_current_layer_indices(i) = i;
});

ANNLayers(layer).output_partition_map = TpetraPartitionMap<long long int>(all_current_layer_indices);
ANNLayers(layer).output_unique_map = TpetraPartitionMap<long long int>(num_nodes_in_layer[layer+1]);
ANNLayers(layer).distributed_outputs = TpetraDFArray<real_t> (ANNLayers(layer).output_partition_map);
ANNLayers(layer).output_partition_map = TpetraPartitionMap<>(all_current_layer_indices);
ANNLayers(layer).output_unique_map = TpetraPartitionMap<>(num_nodes_in_layer[layer+1]);
ANNLayers(layer).distributed_outputs = TpetraDCArray<real_t> (ANNLayers(layer).output_partition_map);
//coming from subview requires both the original map and the submap to be composed of contiguous indices
ANNLayers(layer).distributed_outputs.own_comm_setup(ANNLayers(layer).output_unique_map);
// allocate the weights in this layer
ANNLayers(layer).distributed_weights = TpetraDCArray<real_t> (num_j, num_i);
ANNLayers(layer).distributed_biases = TpetraDFArray<real_t> (num_j);
ANNLayers(layer).distributed_biases = TpetraDCArray<real_t> (num_j);

} // end for

Expand Down Expand Up @@ -392,7 +392,7 @@ int main(int argc, char* argv[])

//test repartition; assume a 10 by 10 grid of outputs from ANN
//assign coords to each grid point, find a partition of the grid, then repartition output layer using new map
TpetraDFArray<real_t> output_grid(100, 2); //array of 2D coordinates for 10 by 10 grid of points
TpetraDCArray<real_t> output_grid(100, 2); //array of 2D coordinates for 10 by 10 grid of points

//populate coords
FOR_ALL(i,0,output_grid.dims(0), {
Expand Down Expand Up @@ -423,16 +423,16 @@ int main(int argc, char* argv[])
output_grid.print();

//get repartitioned map to distribute new arrays with it
TpetraPartitionMap<long long int> partitioned_output_map = output_grid.pmap;
TpetraDFArray<real_t> partitioned_output_values(partitioned_output_map, "partitioned output values");
TpetraPartitionMap<> partitioned_output_map = output_grid.pmap;
TpetraDCArray<real_t> partitioned_output_values(partitioned_output_map, "partitioned output values");

//construct a unique source vector from ANN output using the subview constructor
//(for example's sake this is in fact a copy of the subview wrapped by the output as well)
TpetraDFArray<real_t> sub_output_values(ANNLayers(num_layers-1).distributed_outputs, ANNLayers(num_layers-1).distributed_outputs.comm_pmap,
TpetraDCArray<real_t> sub_output_values(ANNLayers(num_layers-1).distributed_outputs, ANNLayers(num_layers-1).distributed_outputs.comm_pmap,
ANNLayers(num_layers-1).distributed_outputs.comm_pmap.getMinGlobalIndex());

//general communication object between two vectors/arrays
TpetraCommunicationPlan<real_t> output_comms(partitioned_output_values, sub_output_values);
TpetraLRCommunicationPlan<real_t> output_comms(partitioned_output_values, sub_output_values);
output_comms.execute_comms();
partitioned_output_values.print();

Expand Down
7 changes: 1 addition & 6 deletions src/include/Tpetra_LRMultiVector_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,12 +304,7 @@ namespace { // (anonymous)
const size_t LDA = view.stride (1);
const size_t numRows = view.extent (0);

if (LDA == 0) {
return (numRows == 0) ? size_t (1) : numRows;
}
else {
return LDA;
}
return numRows;
}

template <class impl_scalar_type, class buffer_device_type>
Expand Down
99 changes: 99 additions & 0 deletions src/include/Tpetra_LR_WrappedDualView.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,15 @@
#include <Kokkos_DualView.hpp>
#include "Teuchos_TestForException.hpp"
#include "Tpetra_Details_ExecutionSpaces.hpp"
#include "Tpetra_Details_gathervPrint.hpp"
#include <sstream>

// #include "Tpetra_Details_WrappedDualView.hpp"
// #include "Kokkos_DualView.hpp"
// #include "Teuchos_TypeNameTraits.hpp"
// #include "Teuchos_Comm.hpp"
// #include "Teuchos_CommHelpers.hpp"

//#define DEBUG_UVM_REMOVAL // Works only with gcc > 4.8

#ifdef DEBUG_UVM_REMOVAL
Expand Down Expand Up @@ -625,6 +632,98 @@ class LRWrappedDualView {
mutable DualViewType dualView;
};

/// \brief Is the given Tpetra::Details::LRWrappedDualView valid?
///
/// An LRWrappedDualView is valid if both of its constituent Views
/// (device and host) are valid.
///
/// \param lclErrStrm [out] If nonnull and the check fails, a
///   diagnostic message is written here.
/// \param myMpiProcessRank [in] Rank used only to label the message.
/// \param dv [in] The LRWrappedDualView to check.
/// \return true if both the device and host Views are valid.
template<class DataType ,
           class... Args>
bool
checkLocalWrappedDualViewValidity
  (std::ostream* const lclErrStrm,
   const int myMpiProcessRank,
   const Tpetra::Details::LRWrappedDualView<Kokkos::DualView<DataType, Args...> >& dv)
  {
    const bool dev_good  = dv.is_valid_device();
    const bool host_good = dv.is_valid_host();
    const bool good = dev_good && host_good;
    if (! good && lclErrStrm != nullptr) {
      using Teuchos::TypeNameTraits;
      using std::endl;
      // Name the type actually being checked.  The original (upstream)
      // version of this function used WrappedDualView here, which would
      // make TypeNameTraits report the wrong type in the diagnostic.
      using dv_type =
        Tpetra::Details::LRWrappedDualView<Kokkos::DualView<DataType, Args...> >;

      const std::string dvName = TypeNameTraits<dv_type>::name ();
      *lclErrStrm << "Proc " << myMpiProcessRank << ": Tpetra::Details::LRWrappedDualView "
        "of type " << dvName << " has one or more invalid Views. See "
        "above error messages from this MPI process for details." << endl;
    }
    return good;
  }

/// \brief Is the given LRWrappedDualView valid on all processes in \c comm?
///
/// Runs checkLocalWrappedDualViewValidity on each process and reduces
/// the results (MIN) over \c comm, so every process gets the same
/// global answer.  On failure, writes a summary to \c gblErrStrm and,
/// if \c verbose, gathers and prints each process' local messages.
///
/// \param gblErrStrm [out] If nonnull and the check fails globally,
///   diagnostics are written here.
/// \param dv [in] The LRWrappedDualView to check on each process.
/// \param verbose [in] Whether to gather per-process error messages.
/// \param comm [in] Communicator; may be null, meaning "this process
///   only" (no reduction is performed).
/// \return true if the view is valid on every process.
template<class DataType ,
           class... Args>
bool
checkGlobalWrappedDualViewValidity
(std::ostream* const gblErrStrm,
 const Tpetra::Details::LRWrappedDualView<Kokkos::DualView<DataType, Args...> >& dv,
 const bool verbose,
 const Teuchos::Comm<int>* const comm)
  {
    using std::endl;
    const int myRank = comm == nullptr ? 0 : comm->getRank ();
    std::ostringstream lclErrStrm;
    int lclSuccess = 1;

    // Never let an exception escape the local check: convert it to a
    // local failure so the reduction below still completes on all ranks.
    try {
      const bool lclValid =
        checkLocalWrappedDualViewValidity (&lclErrStrm, myRank, dv);
      lclSuccess = lclValid ? 1 : 0;
    }
    catch (std::exception& e) {
      lclErrStrm << "Proc " << myRank << ": checkLocalWrappedDualViewValidity "
        "threw an exception: " << e.what () << endl;
      lclSuccess = 0;
    }
    catch (...) {
      lclErrStrm << "Proc " << myRank << ": checkLocalWrappedDualViewValidity "
        "threw an exception not a subclass of std::exception." << endl;
      lclSuccess = 0;
    }

    int gblSuccess = 0; // output argument
    if (comm == nullptr) {
      gblSuccess = lclSuccess;
    }
    else {
      using Teuchos::outArg;
      using Teuchos::REDUCE_MIN;
      using Teuchos::reduceAll;
      // MIN reduction: global success only if every process succeeded.
      reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
    }

    if (gblSuccess != 1 && gblErrStrm != nullptr) {
      *gblErrStrm << "On at least one (MPI) process, the "
        "Kokkos::DualView has "
        "either the device or host pointer in the "
        "DualView equal to null, but the DualView has a nonzero number of "
        "rows. For more detailed information, please rerun with the "
        "TPETRA_VERBOSE environment variable set to 1. ";
      if (verbose) {
        *gblErrStrm << " Here are error messages from all "
          "processes:" << endl;
        if (comm == nullptr) {
          *gblErrStrm << lclErrStrm.str ();
        }
        else {
          // Gather every rank's local messages onto the output stream.
          using Tpetra::Details::gathervPrint;
          gathervPrint (*gblErrStrm, lclErrStrm.str (), *comm);
        }
      }
      *gblErrStrm << endl;
    }
    return gblSuccess == 1;
  }

} // namespace Details

} // namespace Tpetra
Expand Down
Loading

0 comments on commit 8d8a347

Please sign in to comment.