diff --git a/CMakeLists.txt b/CMakeLists.txt index c76525c0..0d8cbabc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,13 +55,20 @@ if(Matar_ENABLE_KOKKOS) if("${Matar_KOKKOS_PACKAGE}" STREQUAL "Trilinos") find_package(Trilinos REQUIRED) add_definitions(-DTRILINOS_INTERFACE=1) + elseif(Matar_ENABLE_TRILINOS) + find_package(Trilinos REQUIRED) + add_definitions(-DTRILINOS_INTERFACE=1) else() find_package(Kokkos REQUIRED) endif() if (Matar_ENABLE_MPI) find_package(MPI REQUIRED) add_definitions(-DHAVE_MPI=1) - target_link_libraries(matar INTERFACE Kokkos::kokkos MPI::MPI_CXX) + if(Matar_ENABLE_TRILINOS) + target_link_libraries(matar INTERFACE Trilinos::all_selected_libs MPI::MPI_CXX) + else() + target_link_libraries(matar INTERFACE Kokkos::kokkos MPI::MPI_CXX) + endif() else() target_link_libraries(matar INTERFACE Kokkos::kokkos) endif() diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0f3e6aab..66b589a3 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -10,9 +10,9 @@ if (NOT TARGET distclean) INCLUDE(../cmake/Modules/TargetDistclean.cmake OPTIONAL) endif (NOT TARGET distclean) +find_package(Matar REQUIRED) set(LINKING_LIBRARIES matar) -find_package(Matar REQUIRED) if (MPI) find_package(MPI REQUIRED) add_definitions(-DHAVE_MPI=1) @@ -36,9 +36,43 @@ if (NOT KOKKOS) endif() if (KOKKOS) - find_package(Kokkos REQUIRED) #new + if (Matar_ENABLE_TRILINOS) + find_package(Trilinos REQUIRED) # Trilinos takes the place of the standalone Kokkos package in this branch + # Assume that if the Trilinos CXX compiler exists, the C and Fortran compilers do too. NOTE(review): the CMake docs say a compiler variable cannot be changed once it is set, so confirm that overriding CMAKE_CXX_COMPILER and friends here has any effect.
+ if (EXISTS ${Trilinos_CXX_COMPILER}) + set(CMAKE_CXX_COMPILER ${Trilinos_CXX_COMPILER}) + set(CMAKE_C_COMPILER ${Trilinos_C_COMPILER}) + set(CMAKE_Fortran_COMPILER ${Trilinos_Fortran_COMPILER}) + endif() + if(NOT DISTRIBUTION) + # Prepend the Trilinos compiler flags to the existing flags so the examples are built with the same flags as Trilinos + set(CMAKE_CXX_FLAGS "${Trilinos_CXX_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS "${Trilinos_C_COMPILER_FLAGS} ${CMAKE_C_FLAGS}") + set(CMAKE_Fortran_FLAGS "${Trilinos_Fortran_COMPILER_FLAGS} ${CMAKE_Fortran_FLAGS}") + endif() + + message("\nFound Trilinos! Here are the details: ") + message(" Trilinos_DIR = ${Trilinos_DIR}") + message(" Trilinos_VERSION = ${Trilinos_VERSION}") + message(" Trilinos_PACKAGE_LIST = ${Trilinos_PACKAGE_LIST}") + message(" Trilinos_LIBRARIES = ${Trilinos_LIBRARIES}") + message(" Trilinos_INCLUDE_DIRS = ${Trilinos_INCLUDE_DIRS}") + message(" Trilinos_LIBRARY_DIRS = ${Trilinos_LIBRARY_DIRS}") + message(" Trilinos_TPL_LIST = ${Trilinos_TPL_LIST}") + message(" Trilinos_TPL_INCLUDE_DIRS = ${Trilinos_TPL_INCLUDE_DIRS}") + message(" Trilinos_TPL_LIBRARIES = ${Trilinos_TPL_LIBRARIES}") + message(" Trilinos_TPL_LIBRARY_DIRS = ${Trilinos_TPL_LIBRARY_DIRS}") + message(" Trilinos_BUILD_SHARED_LIBS = ${Trilinos_BUILD_SHARED_LIBS}") + message("End of Trilinos details\n") + + include_directories(${Trilinos_INCLUDE_DIRS} ${Trilinos_TPL_INCLUDE_DIRS}) + list(APPEND LINKING_LIBRARIES Trilinos::all_selected_libs) + add_definitions(-DTRILINOS_INTERFACE=1) + else() + find_package(Kokkos REQUIRED) # standalone Kokkos package when Matar_ENABLE_TRILINOS is off + list(APPEND LINKING_LIBRARIES Kokkos::kokkos) + endif() - list(APPEND LINKING_LIBRARIES Kokkos::kokkos) add_definitions(-DHAVE_KOKKOS=1) @@ -76,11 +110,36 @@ if (KOKKOS) add_executable(annkokkos ann_kokkos.cpp) target_link_libraries(annkokkos ${LINKING_LIBRARIES}) + add_executable(annkokkos_compare ann_kokkos_compare.cpp) + target_link_libraries(annkokkos_compare ${LINKING_LIBRARIES}) + + if (Matar_ENABLE_TRILINOS) + add_executable(anndistributed 
ann_distributed.cpp) + target_link_libraries(anndistributed ${LINKING_LIBRARIES}) + + add_executable(anndistributed_crs ann_distributed_crs.cpp) + target_link_libraries(anndistributed_crs ${LINKING_LIBRARIES}) + + add_executable(test_tpetra_farray test_tpetra_farray.cpp) + target_link_libraries(test_tpetra_farray ${LINKING_LIBRARIES}) + + add_executable(test_tpetra_carray test_tpetra_carray.cpp) + target_link_libraries(test_tpetra_carray ${LINKING_LIBRARIES}) + + add_executable(test_tpetra_mesh test_tpetra_mesh.cpp) + target_link_libraries(test_tpetra_mesh ${LINKING_LIBRARIES}) + endif() + if (OPENMP) add_executable(parallel_hello_world parallel_hello_world.cpp) target_link_libraries(parallel_hello_world ${LINKING_LIBRARIES}) endif() + if (MPI) + include_directories(laplaceMPI) + add_subdirectory(laplaceMPI) + endif() + endif() ### HIP Linking error, will add back in after fixed @@ -114,11 +173,6 @@ add_subdirectory(sparsetests) include_directories(test_rocm) add_subdirectory(test_rocm) -if (MPI) - include_directories(laplaceMPI) - add_subdirectory(laplaceMPI) -endif() - #include_directories(phaseField/srcKokkosVerbose) #add_subdirectory(phaseField/srcKokkosVerbose) diff --git a/examples/ann_distributed.cpp b/examples/ann_distributed.cpp new file mode 100644 index 00000000..7f75d919 --- /dev/null +++ b/examples/ann_distributed.cpp @@ -0,0 +1,436 @@ +/********************************************************************************************** + © 2020. Triad National Security, LLC. All rights reserved. + This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos + National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. + Department of Energy/National Nuclear Security Administration. All rights in the program are + reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear + Security Administration. 
The Government is granted for itself and others acting on its behalf a + nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare + derivative works, distribute copies to the public, perform publicly and display publicly, and + to permit others to do so. + This program is open source under the BSD-3 License. + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + 1. Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + 3. Neither the name of the copyright holder nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior + written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ **********************************************************************************************/ +#include +#include +#include +#include +#include + +#include "matar.h" + +using namespace mtr; // matar namespace + + + +// ================================================================= +// Artificial Neural Network (ANN) +// +// For a single layer, we have x_i inputs with weights_{ij}, +// creating y_j outputs. We have +// y_j = Fcn(b_j) = Fcn( Sum_i {x_i w_{ij}} ) +// where the activation function Fcn is applied to b_j, creating +// outputs y_j. For multiple layers, we have +// b_j^l = Sum_i (x_i^{l-1} w_{ij}^l) +// where l is a layer, and as before, an activation function is +// applied to b_j^l, creating outputs y_j^l. +// +// ================================================================= + + +// ================================================================= +// +// Number of nodes in each layer including inputs and outputs +// +// ================================================================= +std::vector num_nodes_in_layer = {32000, 16000, 8000, 4000, 2000, 1000, 100} ; +//std::vector num_nodes_in_layer = {50, 25} ; +// {9, 50, 100, 300, 200, 100, 20, 6} + + + +// ================================================================= +// +// data types and classes +// +// ================================================================= + +// per-layer data of the ANN; main() stores one ANNLayer_t per layer in an array +struct ANNLayer_t{ + //input map will store every global id in the vector for simplicity of row-vector products in this example + TpetraPartitionMap<> output_partition_map; //map with all comms for row-vector product + TpetraPartitionMap<> output_unique_map; //submap of uniquely decomposed indices + TpetraDFArray distributed_output_row; //constructed on output_partition_map in main() + TpetraDFArray distributed_outputs; //constructed on output_unique_map in main() + TpetraDFArray distributed_weights; //constructed in main() from (num_j, num_i) + TpetraDFArray distributed_biases; //constructed in main() from num_j + TpetraCommunicationPlan output_comms; //plan constructed in main() from (distributed_output_row, distributed_outputs) + +}; // end struct + + + +// ================================================================= +// +// 
functions +// +// ================================================================= +void vec_mat_multiply(TpetraDFArray &inputs, + TpetraDFArray &outputs, + TpetraDFArray &matrix){ + + const size_t num_i = inputs.size(); + const size_t num_j = outputs.submap_size(); + + using team_t = typename Kokkos::TeamPolicy<>::member_type; + Kokkos::parallel_for ("MatVec", Kokkos::TeamPolicy<> (num_j, Kokkos::AUTO), + KOKKOS_LAMBDA (const team_t& team_h) { + + float sum = 0; + int j = team_h.league_rank(); + Kokkos::parallel_reduce (Kokkos::TeamThreadRange (team_h, num_i), + [&] (int i, float& lsum) { + lsum += inputs(i)*matrix(j,i); + }, sum); // end parallel reduce + outputs(j) = sum; + + }); // end parallel for + + + FOR_ALL(j,0,num_j, { + if(fabs(outputs(j) - num_i)>= 1e-15){ + printf("error in vec mat multiply test at row %d of %f\n", j, fabs(outputs(j) - num_i)); + } + }); + + return; + +}; // end function + +KOKKOS_INLINE_FUNCTION +float sigmoid(const float value){ + return 1.0/(1.0 + exp(-value)); // exp2f doesn't work with CUDA +}; // end function + + +KOKKOS_INLINE_FUNCTION +float sigmoid_derivative(const float value){ + float sigval = sigmoid(value); + return sigval*(1.0 - sigval); // exp2f doesn't work with CUDA +}; // end function + + + + +void forward_propagate_layer(TpetraDFArray &inputs, + TpetraDFArray &outputs, + TpetraDFArray &weights, + const TpetraDFArray &biases){ + + const size_t num_i = inputs.size(); + const size_t num_j = outputs.size(); + //inputs.print(); + //perform comms to get full input vector for row vector products on matrix + //VERY SIMPLE EXAMPLE OF COMMS; THIS IS A NONIDEAL WAY TO DECOMPOSE THE PROBLEM + + FOR_ALL(j, 0, num_j,{ + + //printf("thread = %d \n", omp_get_thread_num()); + + float value = 0.0; + for(int i=0; i &biases){ + const size_t num_j = biases.size(); + + FOR_ALL(j,0,num_j, { + biases(j) = 0.0; + }); // end parallel for + +}; // end function + + +void set_weights(const TpetraDFArray &weights){ + + const size_t num_i = 
weights.dims(0); + const size_t num_j = weights.dims(1); + + FOR_ALL(i,0,num_i, + j,0,num_j, { + + weights(i,j) = 1.0; + }); // end parallel for + +}; // end function + + +// ================================================================= +// +// Main function +// +// ================================================================= +int main(int argc, char* argv[]) +{ + MPI_Init(&argc, &argv); + int process_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &process_rank); + Kokkos::initialize(argc, argv); + { + + // ================================================================= + // allocate arrays + // ================================================================= + + // note: the num_nodes_in_layer has the inputs into the ANN, so subtract 1 for the layers + size_t num_layers = num_nodes_in_layer.size()-1; + + CArray ANNLayers(num_layers); // one entry per layer; the setup loop below indexes it starting at layer=0 + + // input and output values to ANN + TpetraPartitionMap<> input_pmap, input_unique_pmap; + DCArrayKokkos all_layer_indices(num_nodes_in_layer[0]); + FOR_ALL(i,0,num_nodes_in_layer[0], { + all_layer_indices(i) = i; + }); + all_layer_indices.update_host(); // update_host() after the FOR_ALL fill above; presumably syncs the host copy (NOTE(review): this comment used to say 'copy inputs to device' - confirm) + //map of all indices in this layer to be used for row-vector product (in practice, this would not include all indices in the layer) + input_pmap = TpetraPartitionMap<>(all_layer_indices); + + //map that decomposes indices of this onto set of processes uniquely (used to demonstrate comms for above) + input_unique_pmap = TpetraPartitionMap<>(num_nodes_in_layer[0]); + TpetraDFArray inputs_row(input_pmap); //rows decomposed onto processes + long long int min_index = input_pmap.getLocalIndex(input_unique_pmap.getMinGlobalIndex()); + TpetraDFArray inputs(input_unique_pmap); //rows decomposed onto processes + //comms from a subview require both the original map and the submap to be composed of contiguous indices + + // set the strides + // layer 0 are the inputs to the ANN + // layer n-1 are the outputs from the ANN + for 
(size_t layer=0; layer all_current_layer_indices(num_nodes_in_layer[layer+1]); + FOR_ALL(i,0,num_nodes_in_layer[layer+1], { + all_current_layer_indices(i) = i; + }); + + ANNLayers(layer).output_partition_map = TpetraPartitionMap<>(all_current_layer_indices); + ANNLayers(layer).output_unique_map = TpetraPartitionMap<>(num_nodes_in_layer[layer+1]); + ANNLayers(layer).distributed_output_row = TpetraDFArray (ANNLayers(layer).output_partition_map); + ANNLayers(layer).distributed_outputs = TpetraDFArray (ANNLayers(layer).output_unique_map); + //comm object between unique mapped output and full output row view + ANNLayers(layer).output_comms = TpetraCommunicationPlan(ANNLayers(layer).distributed_output_row, ANNLayers(layer).distributed_outputs); + + // allocate the weights in this layer + ANNLayers(layer).distributed_weights = TpetraDFArray (num_j, num_i); + ANNLayers(layer).distributed_biases = TpetraDFArray (num_j); + + } // end for + + + // ================================================================= + // set weights, biases, and inputs + // ================================================================= + + // inputs to ANN + size_t local_input_size = inputs.size(); + //std::cout << "full_input_size " << input_pmap.num_global_ << "\n"; + for (size_t i=0; i fos; + // fos = Teuchos::fancyOStream(Teuchos::rcpFromRef(out)); + // inputs.tpetra_sub_vector->describe(*fos,Teuchos::VERB_EXTREME); + + inputs.update_device(); // copy inputs to device + TpetraCommunicationPlan input_comms(inputs_row, inputs); + input_comms.execute_comms(); //distribute to full map for row-vector product + //inputs.print(); + + // for (size_t i=0; i