From b5a7fd97707a4b3f2296c4d089fd1b74f0e45b6d Mon Sep 17 00:00:00 2001 From: Adrian-Diaz Date: Thu, 31 Oct 2024 00:06:34 -0600 Subject: [PATCH] WIP: tpetra wrappers + ENH: kokkos ann examples --- examples/CMakeLists.txt | 6 + examples/ann_distributed_crs.cpp | 449 +++++++++++++++++++++++++ examples/ann_kokkos.cpp | 9 +- examples/ann_kokkos_compare.cpp | 342 +++++++++++++++++++ src/include/tpetra_wrapper_types.h | 505 +++++++++++++++++++++++++++-- 5 files changed, 1285 insertions(+), 26 deletions(-) create mode 100644 examples/ann_distributed_crs.cpp create mode 100644 examples/ann_kokkos_compare.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 3a9bbb6f..514761b5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -110,9 +110,15 @@ if (KOKKOS) add_executable(annkokkos ann_kokkos.cpp) target_link_libraries(annkokkos ${LINKING_LIBRARIES}) + add_executable(annkokkos_compare ann_kokkos_compare.cpp) + target_link_libraries(annkokkos_compare ${LINKING_LIBRARIES}) + if (Matar_ENABLE_TRILINOS) add_executable(anndistributed ann_distributed.cpp) target_link_libraries(anndistributed ${LINKING_LIBRARIES}) + + add_executable(anndistributed_crs ann_distributed_crs.cpp) + target_link_libraries(anndistributed_crs ${LINKING_LIBRARIES}) endif() if (OPENMP) diff --git a/examples/ann_distributed_crs.cpp b/examples/ann_distributed_crs.cpp new file mode 100644 index 00000000..fc3bf4b6 --- /dev/null +++ b/examples/ann_distributed_crs.cpp @@ -0,0 +1,449 @@ +/********************************************************************************************** + © 2020. Triad National Security, LLC. All rights reserved. + This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos + National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. + Department of Energy/National Nuclear Security Administration. All rights in the program are + reserved by Triad National Security, LLC, and the U.S. 
Department of Energy/National Nuclear + Security Administration. The Government is granted for itself and others acting on its behalf a + nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare + derivative works, distribute copies to the public, perform publicly and display publicly, and + to permit others to do so. + This program is open source under the BSD-3 License. + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + 1. Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + 3. Neither the name of the copyright holder nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior + written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ **********************************************************************************************/ +#include +#include +#include +#include +#include + +#include "matar.h" + +using namespace mtr; // matar namespace + + + +// ================================================================= +// Artificial Neural Network (ANN) +// +// For a single layer, we have x_i inputs with weights_{ij}, +// creating y_j outputs. We have +// y_j = Fcn(b_j) = Fcn( Sum_i {x_i w_{ij}} ) +// where the activation function Fcn is applied to b_j, creating +// outputs y_j. For multiple layers, we have +// b_j^l = Sum_i (x_i^{l-1} w_{ij}^l) +// where l is a layer, and as before, an activation function is +// applied to b_j^l, creating outputs y_j^l. +// +// ================================================================= + + +// ================================================================= +// +// Number of nodes in each layer including inputs and outputs +// +// ================================================================= +std::vector num_nodes_in_layer = {64000, 30000, 8000, 4000, 2000, 1000, 100} ; +//std::vector num_nodes_in_layer = {50, 25} ; +// {9, 50, 100, 300, 200, 100, 20, 6} + + + +// ================================================================= +// +// data types and classes +// +// ================================================================= + +// array of ANN structs +struct ANNLayer_t{ + //input map will store every global id in the vector for simplicity of row-vector products in this example + TpetraPartitionMap output_partition_map; //map with all comms for row-vector product + TpetraPartitionMap output_unique_map; //submap of uniquely decomposed indices + TpetraMVArray distributed_outputs; + TpetraCRSMatrix distributed_weights; + TpetraMVArray distributed_biases; + +}; // end struct + + + +// ================================================================= +// +// functions +// +// ================================================================= +void 
vec_mat_multiply(TpetraMVArray &inputs, + TpetraMVArray &outputs, + TpetraCRSMatrix &matrix){ + + const size_t num_i = inputs.size(); + const size_t num_j = outputs.submap_size(); + + using team_t = typename Kokkos::TeamPolicy<>::member_type; + Kokkos::parallel_for ("MatVec", Kokkos::TeamPolicy<> (num_j, Kokkos::AUTO), + KOKKOS_LAMBDA (const team_t& team_h) { + + float sum = 0; + int j = team_h.league_rank(); + Kokkos::parallel_reduce (Kokkos::TeamThreadRange (team_h, num_i), + [&] (int i, float& lsum) { + lsum += inputs(i)*matrix(j,i); + }, sum); // end parallel reduce + int global_index = outputs.getSubMapGlobalIndex(j); + int local_index = outputs.getMapLocalIndex(global_index); + outputs(local_index) = sum; + + }); // end parallel for + + + FOR_ALL(j,0,num_j, { + int global_index = outputs.getSubMapGlobalIndex(j); + int local_index = outputs.getMapLocalIndex(global_index); + if(fabs(outputs(local_index) - num_i)>= 1e-15){ + printf("error in vec mat multiply test at row %d of %f\n", j, fabs(outputs(local_index) - num_i)); + } + }); + + return; + +}; // end function + +KOKKOS_INLINE_FUNCTION +float sigmoid(const float value){ + return 1.0/(1.0 + exp(-value)); // exp2f doesn't work with CUDA +}; // end function + + +KOKKOS_INLINE_FUNCTION +float sigmoid_derivative(const float value){ + float sigval = sigmoid(value); + return sigval*(1.0 - sigval); // exp2f doesn't work with CUDA +}; // end function + + + + +void forward_propagate_layer(TpetraMVArray &inputs, + TpetraMVArray &outputs, + TpetraCRSMatrix &weights, + const TpetraMVArray &biases){ + + const size_t num_i = inputs.size(); + const size_t num_j = outputs.submap_size(); + + //perform comms to get full input vector for row vector products on matrix + //VERY SIMPLE EXAMPLE OF COMMS; THIS IS A TERRIBLE WAY TO DECOMPOSE THE PROBLEM + +/* + FOR_ALL(j, 0, num_j,{ + + //printf("thread = %d \n", omp_get_thread_num()); + + float value = 0.0; + for(int i=0; i::member_type; + Kokkos::parallel_for ("MatVec", 
Kokkos::TeamPolicy<> (num_j, Kokkos::AUTO), + KOKKOS_LAMBDA (const team_t& team_h) { + + float sum = 0; + int j = team_h.league_rank(); + Kokkos::parallel_reduce (Kokkos::TeamThreadRange (team_h, num_i), + [&] (int i, float& lsum) { + lsum += inputs(i)*weights(j,i) + biases(j); + }, sum); // end parallel reduce + int global_index = outputs.getSubMapGlobalIndex(j); + int local_index = outputs.getMapLocalIndex(global_index); + outputs(local_index) = 1.0/(1.0 + exp(-sum)); + + }); // end parallel for + + + + return; + +}; // end function + + +void set_biases(const TpetraMVArray &biases){ + const size_t num_j = biases.size(); + + FOR_ALL(j,0,num_j, { + biases(j) = 0.0; + }); // end parallel for + +}; // end function + + +void set_weights(const TpetraCRSMatrix &weights){ + + const size_t num_i = weights.dim1(); + const size_t num_j = weights.dim1(); + + FOR_ALL(i,0,num_i, + j,0,num_j, { + + weights(i,j) = 1.0; + }); // end parallel for + +}; // end function + + +// ================================================================= +// +// Main function +// +// ================================================================= +int main(int argc, char* argv[]) +{ + MPI_Init(&argc, &argv); + int process_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &process_rank); + Kokkos::initialize(argc, argv); + { + + // ================================================================= + // allocate arrays + // ================================================================= + + // note: the num_nodes_in_layer has the inputs into the ANN, so subtract 1 for the layers + size_t num_layers = num_nodes_in_layer.size()-1; + + CArray ANNLayers(num_layers); // starts at 1 and goes to num_layers + + // input and output values to ANN + TpetraPartitionMap input_pmap, input_unique_pmap; + DCArrayKokkos all_layer_indices(num_nodes_in_layer[0]); + FOR_ALL(i,0,num_nodes_in_layer[0], { + all_layer_indices(i) = i; + }); + all_layer_indices.update_host(); // copy index values to host + //map of all indices in this
layer to be used for row-vector product (in practice, this would not include all indices in the layer) + input_pmap = TpetraPartitionMap(all_layer_indices); + + //map that decomposes indices of this onto set of processes uniquely (used to demonstrate comms for above) + input_unique_pmap = TpetraPartitionMap(num_nodes_in_layer[0]); + TpetraMVArray inputs(input_pmap); //rows decomposed onto processes + //coming from subview requires both the original map and the submap to be composed of contiguous indices + inputs.own_comm_setup(input_unique_pmap); //tells the vector it's communicating from a contiguous subset of its own data + + // set the strides + // layer 0 are the inputs to the ANN + // layer n-1 are the outputs from the ANN + for (size_t layer=0; layer all_current_layer_indices(num_nodes_in_layer[layer+1]); + FOR_ALL(i,0,num_nodes_in_layer[layer+1], { + all_current_layer_indices(i) = i; + }); + + ANNLayers(layer).output_partition_map = TpetraPartitionMap(all_current_layer_indices); + ANNLayers(layer).output_unique_map = TpetraPartitionMap(num_nodes_in_layer[layer+1]); + ANNLayers(layer).distributed_outputs = TpetraMVArray (ANNLayers(layer).output_partition_map); + //coming from subview requires both the original map and the submap to be composed of contiguous indices + ANNLayers(layer).distributed_outputs.own_comm_setup(ANNLayers(layer).output_unique_map); + // allocate the weights in this layer + ANNLayers(layer).distributed_weights = TpetraCRSMatrix (num_j, num_i); + ANNLayers(layer).distributed_biases = TpetraMVArray (num_j); + + } // end for + + + // ================================================================= + // set weights, biases, and inputs + // ================================================================= + + // inputs to ANN + size_t local_input_size = inputs.submap_size(); + //std::cout << "full_input_size " << input_pmap.num_global_ << "\n"; + for (size_t i=0; i fos; + // fos = Teuchos::fancyOStream(Teuchos::rcpFromRef(out)); + // 
inputs.tpetra_sub_vector->describe(*fos,Teuchos::VERB_EXTREME); + + inputs.update_device(); // copy inputs to device + inputs.perform_comms(); //distribute to full map for row-vector product + + // for (size_t i=0; i