From b5a7fd97707a4b3f2296c4d089fd1b74f0e45b6d Mon Sep 17 00:00:00 2001 From: Adrian-Diaz Date: Thu, 31 Oct 2024 00:06:34 -0600 Subject: [PATCH] WIP: tpetra wrappers + ENH: kokkos ann examples --- examples/CMakeLists.txt | 6 + examples/ann_distributed_crs.cpp | 449 +++++++++++++++++++++++++ examples/ann_kokkos.cpp | 9 +- examples/ann_kokkos_compare.cpp | 342 +++++++++++++++++++ src/include/tpetra_wrapper_types.h | 505 +++++++++++++++++++++++++++-- 5 files changed, 1285 insertions(+), 26 deletions(-) create mode 100644 examples/ann_distributed_crs.cpp create mode 100644 examples/ann_kokkos_compare.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 3a9bbb6f..514761b5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -110,9 +110,15 @@ if (KOKKOS) add_executable(annkokkos ann_kokkos.cpp) target_link_libraries(annkokkos ${LINKING_LIBRARIES}) + add_executable(annkokkos_compare ann_kokkos_compare.cpp) + target_link_libraries(annkokkos_compare ${LINKING_LIBRARIES}) + if (Matar_ENABLE_TRILINOS) add_executable(anndistributed ann_distributed.cpp) target_link_libraries(anndistributed ${LINKING_LIBRARIES}) + + add_executable(anndistributed_crs ann_distributed_crs.cpp) + target_link_libraries(anndistributed_crs ${LINKING_LIBRARIES}) endif() if (OPENMP) diff --git a/examples/ann_distributed_crs.cpp b/examples/ann_distributed_crs.cpp new file mode 100644 index 00000000..fc3bf4b6 --- /dev/null +++ b/examples/ann_distributed_crs.cpp @@ -0,0 +1,449 @@ +/********************************************************************************************** + © 2020. Triad National Security, LLC. All rights reserved. + This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos + National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. + Department of Energy/National Nuclear Security Administration. All rights in the program are + reserved by Triad National Security, LLC, and the U.S. 
Department of Energy/National Nuclear + Security Administration. The Government is granted for itself and others acting on its behalf a + nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare + derivative works, distribute copies to the public, perform publicly and display publicly, and + to permit others to do so. + This program is open source under the BSD-3 License. + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + 1. Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + 3. Neither the name of the copyright holder nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior + written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ **********************************************************************************************/ +#include +#include +#include +#include +#include + +#include "matar.h" + +using namespace mtr; // matar namespace + + + +// ================================================================= +// Artificial Neural Network (ANN) +// +// For a single layer, we have x_i inputs with weights_{ij}, +// creating y_j outputs. We have +// y_j = Fcn(b_j) = Fcn( Sum_i {x_i w_{ij}} ) +// where the activation function Fcn is applied to b_j, creating +// outputs y_j. For multiple layers, we have +// b_j^l = Sum_i (x_i^{l-1} w_{ij}^l) +// where l is a layer, and as before, an activation function is +// applied to b_j^l, creating outputs y_j^l. +// +// ================================================================= + + +// ================================================================= +// +// Number of nodes in each layer including inputs and outputs +// +// ================================================================= +std::vector num_nodes_in_layer = {64000, 30000, 8000, 4000, 2000, 1000, 100} ; +//std::vector num_nodes_in_layer = {50, 25} ; +// {9, 50, 100, 300, 200, 100, 20, 6} + + + +// ================================================================= +// +// data types and classes +// +// ================================================================= + +// array of ANN structs +struct ANNLayer_t{ + //input map will store every global id in the vector for simplicity of row-vector products in this example + TpetraPartitionMap output_partition_map; //map with all comms for row-vector product + TpetraPartitionMap output_unique_map; //submap of uniquely decomposed indices + TpetraMVArray distributed_outputs; + TpetraCRSMatrix distributed_weights; + TpetraMVArray distributed_biases; + +}; // end struct + + + +// ================================================================= +// +// functions +// +// ================================================================= +void 
vec_mat_multiply(TpetraMVArray &inputs, + TpetraMVArray &outputs, + TpetraCRSMatrix &matrix){ + + const size_t num_i = inputs.size(); + const size_t num_j = outputs.submap_size(); + + using team_t = typename Kokkos::TeamPolicy<>::member_type; + Kokkos::parallel_for ("MatVec", Kokkos::TeamPolicy<> (num_j, Kokkos::AUTO), + KOKKOS_LAMBDA (const team_t& team_h) { + + float sum = 0; + int j = team_h.league_rank(); + Kokkos::parallel_reduce (Kokkos::TeamThreadRange (team_h, num_i), + [&] (int i, float& lsum) { + lsum += inputs(i)*matrix(j,i); + }, sum); // end parallel reduce + int global_index = outputs.getSubMapGlobalIndex(j); + int local_index = outputs.getMapLocalIndex(global_index); + outputs(local_index) = sum; + + }); // end parallel for + + + FOR_ALL(j,0,num_j, { + int global_index = outputs.getSubMapGlobalIndex(j); + int local_index = outputs.getMapLocalIndex(global_index); + if(fabs(outputs(local_index) - num_i)>= 1e-15){ + printf("error in vec mat multiply test at row %d of %f\n", j, fabs(outputs(local_index) - num_i)); + } + }); + + return; + +}; // end function + +KOKKOS_INLINE_FUNCTION +float sigmoid(const float value){ + return 1.0/(1.0 + exp(-value)); // exp2f doesn't work with CUDA +}; // end function + + +KOKKOS_INLINE_FUNCTION +float sigmoid_derivative(const float value){ + float sigval = sigmoid(value); + return sigval*(1.0 - sigval); // exp2f doesn't work with CUDA +}; // end function + + + + +void forward_propagate_layer(TpetraMVArray &inputs, + TpetraMVArray &outputs, + TpetraCRSMatrix &weights, + const TpetraMVArray &biases){ + + const size_t num_i = inputs.size(); + const size_t num_j = outputs.submap_size(); + + //perform comms to get full input vector for row vector products on matrix + //VERY SIMPLE EXAMPLE OF COMMS; THIS IS A TERRIBLE WAY TO DECOMPOSE THE PROBLEM + +/* + FOR_ALL(j, 0, num_j,{ + + //printf("thread = %d \n", omp_get_thread_num()); + + float value = 0.0; + for(int i=0; i::member_type; + Kokkos::parallel_for ("MatVec", 
Kokkos::TeamPolicy<> (num_j, Kokkos::AUTO), + KOKKOS_LAMBDA (const team_t& team_h) { + + float sum = 0; + int j = team_h.league_rank(); + Kokkos::parallel_reduce (Kokkos::TeamThreadRange (team_h, num_i), + [&] (int i, float& lsum) { + lsum += inputs(i)*weights(j,i) + biases(j); + }, sum); // end parallel reduce + int global_index = outputs.getSubMapGlobalIndex(j); + int local_index = outputs.getMapLocalIndex(global_index); + outputs(local_index) = 1.0/(1.0 + exp(-sum)); + + }); // end parallel for + + + + return; + +}; // end function + + +void set_biases(const TpetraMVArray &biases){ + const size_t num_j = biases.size(); + + FOR_ALL(j,0,num_j, { + biases(j) = 0.0; + }); // end parallel for + +}; // end function + + +void set_weights(const TpetraCRSMatrix &weights){ + + const size_t num_i = weights.dim1(); + const size_t num_j = weights.dim1(); + + FOR_ALL(i,0,num_i, + j,0,num_j, { + + weights(i,j) = 1.0; + }); // end parallel for + +}; // end function + + +// ================================================================= +// +// Main function +// +// ================================================================= +int main(int argc, char* argv[]) +{ + MPI_Init(&argc, &argv); + int process_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &process_rank); + Kokkos::initialize(argc, argv); + { + + // ================================================================= + // allocate arrays + // ================================================================= + + // note: the num_nodes_in_layer has the inputs into the ANN, so subtract 1 for the layers + size_t num_layers = num_nodes_in_layer.size()-1; + + CArray ANNLayers(num_layers); // starts at 1 and goes to num_layers + + // input and output values to ANN + TpetraPartitionMap input_pmap, input_unique_pmap; + DCArrayKokkos all_layer_indices(num_nodes_in_layer[0]); + FOR_ALL(i,0,num_nodes_in_layer[0], { + all_layer_indices(i) = i; + }); + all_layer_indices.update_host(); // copy index values to host + //map of all indices in this
layer to be used for row-vector product (in practice, this would not include all indices in the layer) + input_pmap = TpetraPartitionMap(all_layer_indices); + + //map that decomposes indices of this onto set of processes uniquely (used to demonstrate comms for above) + input_unique_pmap = TpetraPartitionMap(num_nodes_in_layer[0]); + TpetraMVArray inputs(input_pmap); //rows decomposed onto processes + //coming from subview requires both the original map and the submap to be composed of contiguous indices + inputs.own_comm_setup(input_unique_pmap); //tells the vector it's communicating from a contiguous subset of its own data + + // set the strides + // layer 0 are the inputs to the ANN + // layer n-1 are the outputs from the ANN + for (size_t layer=0; layer all_current_layer_indices(num_nodes_in_layer[layer+1]); + FOR_ALL(i,0,num_nodes_in_layer[layer+1], { + all_current_layer_indices(i) = i; + }); + + ANNLayers(layer).output_partition_map = TpetraPartitionMap(all_current_layer_indices); + ANNLayers(layer).output_unique_map = TpetraPartitionMap(num_nodes_in_layer[layer+1]); + ANNLayers(layer).distributed_outputs = TpetraMVArray (ANNLayers(layer).output_partition_map); + //coming from subview requires both the original map and the submap to be composed of contiguous indices + ANNLayers(layer).distributed_outputs.own_comm_setup(ANNLayers(layer).output_unique_map); + // allocate the weights in this layer + ANNLayers(layer).distributed_weights = TpetraCRSMatrix (num_j, num_i); + ANNLayers(layer).distributed_biases = TpetraMVArray (num_j); + + } // end for + + + // ================================================================= + // set weights, biases, and inputs + // ================================================================= + + // inputs to ANN + size_t local_input_size = inputs.submap_size(); + //std::cout << "full_input_size " << input_pmap.num_global_ << "\n"; + for (size_t i=0; i fos; + // fos = Teuchos::fancyOStream(Teuchos::rcpFromRef(out)); + // 
inputs.tpetra_sub_vector->describe(*fos,Teuchos::VERB_EXTREME); + + inputs.update_device(); // copy inputs to device + inputs.perform_comms(); //distribute to full map for row-vector product + + // for (size_t i=0; i