Add grid gather with slices
streeve committed Jan 13, 2021
1 parent 552abb0 commit 47fe0a8
Showing 3 changed files with 504 additions and 244 deletions.
274 changes: 181 additions & 93 deletions core/src/Cabana_Halo.hpp
@@ -203,18 +203,20 @@ struct is_halo : public is_halo_impl<typename std::remove_cv<T>::type>::type
namespace Impl
{

//---------------------------------------------------------------------------//
// Check that the AoSoA/slice is the right size.
template <class Halo_t, class Container_t>
void checkSize( const Halo_t& halo, Container_t& container )
{
// Check that the AoSoA/slice is the right size.
if ( container.size() != halo.numLocal() + halo.numGhost() )
throw std::runtime_error( "AoSoA/slice is the wrong size for gather!" );
}
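As a usage illustration (not part of this diff), the size contract checked above means the container handed to a gather must hold the halo.numLocal() owned elements first and the halo.numGhost() ghosts after them. A minimal sketch, assuming a halo has already been built and using illustrative member types and device:

    #include <Cabana_Core.hpp>

    // Illustrative sketch only: size the container as numLocal() + numGhost()
    // owned-then-ghost elements so checkSize() passes, then gather a slice.
    template <class HaloType, class DeviceType>
    void gather_positions_example( const HaloType& halo )
    {
        using DataTypes = Cabana::MemberTypes<double[3], int>;
        Cabana::AoSoA<DataTypes, DeviceType> particles(
            "particles", halo.numLocal() + halo.numGhost() );
        auto positions = Cabana::slice<0>( particles );
        Cabana::gather( halo, positions );
    }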

//---------------------------------------------------------------------------//
// Fill the data to send only. AoSoA variant.
template <class Halo_t, class AoSoA_t, class View_t>
void sendBuffer( const Halo_t& halo, AoSoA_t& aosoa, View_t& send_buffer )
{

// Get the steering vector for the sends.
auto steering = halo.getExportSteering();

@@ -230,6 +232,7 @@ void sendBuffer( const Halo_t& halo, AoSoA_t& aosoa, View_t& send_buffer )
Kokkos::fence();
}

// Fill the data to send with modifications. AoSoA variant.
template <class Halo_t, class AoSoA_t, class View_t, class Modify_t>
void sendBuffer( const Halo_t& halo, AoSoA_t& aosoa, View_t& send_buffer,
const Modify_t& modify_functor )
@@ -251,6 +254,7 @@ void sendBuffer( const Halo_t& halo, AoSoA_t& aosoa, View_t& send_buffer,
Kokkos::fence();
}

// Extract the received data. AoSoA variant.
template <class Halo_t, class AoSoA_t, class View_t>
void recvBuffer( const Halo_t& halo, AoSoA_t& aosoa, const View_t& send_buffer )
{
@@ -321,6 +325,128 @@ void recvBuffer( const Halo_t& halo, AoSoA_t& aosoa, const View_t& send_buffer )
MPI_Barrier( halo.comm() );
}

//---------------------------------------------------------------------------//
// Fill the data to send only. Slice variant.
template <class Halo_t, class Slice_t, class View_t>
void sendBuffer( const Halo_t& halo, Slice_t& slice, View_t& send_buffer,
std::size_t num_comp )
{
// Get the raw slice data.
auto slice_data = slice.data();

// Get the steering vector for the sends.
auto steering = halo.getExportSteering();

// Gather from the local data into a tuple-contiguous send buffer.
auto gather_send_buffer_func = KOKKOS_LAMBDA( const std::size_t i )
{
auto s = Slice_t::index_type::s( steering( i ) );
auto a = Slice_t::index_type::a( steering( i ) );
std::size_t slice_offset = s * slice.stride( 0 ) + a;
for ( std::size_t n = 0; n < num_comp; ++n )
send_buffer( i, n ) =
slice_data[slice_offset + n * Slice_t::vector_length];
};
Kokkos::RangePolicy<typename Halo_t::execution_space>
gather_send_buffer_policy( 0, halo.totalNumExport() );
Kokkos::parallel_for( "Cabana::gather::gather_send_buffer",
gather_send_buffer_policy, gather_send_buffer_func );
Kokkos::fence();
}
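The buffer fill above relies on the AoSoA index mapping: exported particle steering(i) is split into a struct index s and an intra-struct array index a, and component n of that particle sits at s * stride(0) + a + n * vector_length in the flat slice data. A small worked sketch (illustrative numbers only; it assumes a 3-component slice in a single-member AoSoA with vector length 16, so stride(0) is 3 * 16):

    #include <cstddef>
    #include <iostream>

    int main()
    {
        // Assumed layout: single-member AoSoA, 3 components, vector length 16.
        constexpr std::size_t vector_length = 16;
        constexpr std::size_t num_comp = 3;
        constexpr std::size_t stride0 = num_comp * vector_length; // slice.stride( 0 )

        std::size_t p = 37;                // particle index from the steering vector
        std::size_t s = p / vector_length; // struct index
        std::size_t a = p % vector_length; // array index within the struct

        for ( std::size_t n = 0; n < num_comp; ++n )
            std::cout << "component " << n << " -> flat offset "
                      << s * stride0 + a + n * vector_length << "\n";
        return 0;
    }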

// Fill the data to send with modifications. Slice variant.
template <class Halo_t, class Slice_t, class View_t, class Modify_t>
void sendBuffer( const Halo_t& halo, Slice_t& slice, View_t& send_buffer,
std::size_t num_comp, const Modify_t& modify_functor )
{
// Get the steering vector for the sends.
auto steering = halo.getExportSteering();

// Gather from the local data into a tuple-contiguous send buffer.
auto gather_send_buffer_func = KOKKOS_LAMBDA( const std::size_t i )
{
for ( std::size_t n = 0; n < num_comp; ++n )
{
send_buffer( i, n ) = slice( i, n );
modify_functor( send_buffer, i, n );
}
};
Kokkos::RangePolicy<typename Halo_t::execution_space>
gather_send_buffer_policy( 0, halo.totalNumExport() );
Kokkos::parallel_for( "Cabana::gather::gather_send_buffer",
gather_send_buffer_policy, gather_send_buffer_func );
Kokkos::fence();
}
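The modify variant above calls modify_functor( send_buffer, i, n ) on every packed entry before the exchange. One hypothetical functor shape that would satisfy this interface (the name and the periodic-shift use case are illustrative assumptions, not part of this commit):

    #include <Kokkos_Core.hpp>

    // Hypothetical buffer-modify functor: shifts each packed coordinate
    // component before it is sent, e.g. across a periodic boundary.
    struct PeriodicShift
    {
        double shift[3]; // assumed per-component shift, e.g. +/- the domain length

        template <class BufferType>
        KOKKOS_INLINE_FUNCTION void operator()( const BufferType& send_buffer,
                                                const std::size_t i,
                                                const std::size_t n ) const
        {
            send_buffer( i, n ) += shift[n];
        }
    };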

template <class Halo_t, class Slice_t, class View_t>
void recvBuffer( const Halo_t& halo, Slice_t& slice, const View_t& send_buffer,
std::size_t num_comp )
{
// Get the raw slice data.
auto slice_data = slice.data();

// Allocate a receive buffer. Note this one is layout right so the
// components are consecutive.
Kokkos::View<typename Slice_t::value_type**, Kokkos::LayoutRight,
typename Halo_t::memory_space>
recv_buffer(
Kokkos::ViewAllocateWithoutInitializing( "halo_recv_buffer" ),
halo.totalNumImport(), num_comp );

// The halo has its own communication space so choose any mpi tag.
const int mpi_tag = 2345;

// Post non-blocking receives.
int num_n = halo.numNeighbor();
std::vector<MPI_Request> requests( num_n );
std::pair<std::size_t, std::size_t> recv_range = { 0, 0 };
for ( int n = 0; n < num_n; ++n )
{
recv_range.second = recv_range.first + halo.numImport( n );

auto recv_subview =
Kokkos::subview( recv_buffer, recv_range, Kokkos::ALL );

MPI_Irecv( recv_subview.data(),
recv_subview.size() * sizeof( typename Slice_t::value_type ),
MPI_BYTE, halo.neighborRank( n ), mpi_tag, halo.comm(),
&( requests[n] ) );

recv_range.first = recv_range.second;
}

// Do blocking sends.
std::pair<std::size_t, std::size_t> send_range = { 0, 0 };
for ( int n = 0; n < num_n; ++n )
{
send_range.second = send_range.first + halo.numExport( n );

auto send_subview =
Kokkos::subview( send_buffer, send_range, Kokkos::ALL );

MPI_Send( send_subview.data(),
send_subview.size() * sizeof( typename Slice_t::value_type ),
MPI_BYTE, halo.neighborRank( n ), mpi_tag, halo.comm() );

send_range.first = send_range.second;
}

// Wait on non-blocking receives.
std::vector<MPI_Status> status( num_n );
const int ec =
MPI_Waitall( requests.size(), requests.data(), status.data() );
if ( MPI_SUCCESS != ec )
throw std::logic_error( "Failed MPI Communication" );

// Extract the receive buffer into the ghosted elements.
std::size_t num_local = halo.numLocal();
auto extract_recv_buffer_func = KOKKOS_LAMBDA( const std::size_t i )
{
std::size_t ghost_idx = i + num_local;
auto s = Slice_t::index_type::s( ghost_idx );
auto a = Slice_t::index_type::a( ghost_idx );
std::size_t slice_offset = s * slice.stride( 0 ) + a;
for ( std::size_t n = 0; n < num_comp; ++n )
slice_data[slice_offset + Slice_t::vector_length * n] =
recv_buffer( i, n );
};
Kokkos::RangePolicy<typename Halo_t::execution_space>
extract_recv_buffer_policy( 0, halo.totalNumImport() );
Kokkos::parallel_for( "Cabana::gather::extract_recv_buffer",
extract_recv_buffer_policy,
extract_recv_buffer_func );
Kokkos::fence();

// Barrier before completing to ensure synchronization.
MPI_Barrier( halo.comm() );
}

} // namespace Impl

//---------------------------------------------------------------------------//
@@ -413,7 +539,7 @@ void gather( const Halo_t& halo, AoSoA_t& aosoa,
Impl::recvBuffer( halo, aosoa, send_buffer );
}
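For the AoSoA path above, the entire container can be gathered in one call (an illustrative sketch, not part of this diff; the halo and particle container are assumed to exist as in the earlier sketch):

    // Illustrative: gather every member of the AoSoA into the ghosts at once.
    template <class HaloType, class AoSoAType>
    void gather_aosoa_example( const HaloType& halo, AoSoAType& particles )
    {
        Cabana::gather( halo, particles );
    }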

//---------------------------------------------------------------------------//
// ---------------------------------------------------------------------------//
/*!
\brief Synchronously gather data from the local decomposition to the ghosts
using the halo forward communication plan. Slice version. This is a
@@ -427,11 +553,11 @@ void gather( const Halo_t& halo, AoSoA_t& aosoa,
\tparam Halo_t Halo type - must be a Halo.
\tparam Slice_t Slice type - must be a Slice.
\tparam Slice_t Slice type - must be a slice.
\param halo The halo to use for the gather.
\param slice The Slice on which to perform the gather. The Slice should have
\param slice The slice on which to perform the gather. The slice should have
a size equivalent to halo.numGhost() + halo.numLocal(). The locally owned
elements are expected to appear first (i.e. in the first halo.numLocal()
elements) and the ghosted elements are expected to appear second (i.e. in
@@ -443,17 +569,15 @@ void gather( const Halo_t& halo, Slice_t& slice,
is_slice<Slice_t>::value ),
int>::type* = 0 )
{
// Check that the Slice is the right size.
// Check that the slice is the right size.
Impl::checkSize( halo, slice );

// Get the number of components in the slice.
// Get the number of components in the slice. Here the slice is unrolled,
// including the underlying SoA.
std::size_t num_comp = 1;
for ( std::size_t d = 2; d < slice.rank(); ++d )
num_comp *= slice.extent( d );

// Get the raw slice data.
auto slice_data = slice.data();

// Allocate a send buffer. Note this one is layout right so the components
// are consecutive.
Kokkos::View<typename Slice_t::value_type**, Kokkos::LayoutRight,
@@ -462,99 +586,63 @@ void gather( const Halo_t& halo, Slice_t& slice,
Kokkos::ViewAllocateWithoutInitializing( "halo_send_buffer" ),
halo.totalNumExport(), num_comp );

// Get the steering vector for the sends.
auto steering = halo.getExportSteering();

// Gather from the local data into a tuple-contiguous send buffer.
auto gather_send_buffer_func = KOKKOS_LAMBDA( const std::size_t i )
{
auto s = Slice_t::index_type::s( steering( i ) );
auto a = Slice_t::index_type::a( steering( i ) );
std::size_t slice_offset = s * slice.stride( 0 ) + a;
for ( std::size_t n = 0; n < num_comp; ++n )
send_buffer( i, n ) =
slice_data[slice_offset + n * Slice_t::vector_length];
};
Kokkos::RangePolicy<typename Halo_t::execution_space>
gather_send_buffer_policy( 0, halo.totalNumExport() );
Kokkos::parallel_for( "Cabana::gather::gather_send_buffer",
gather_send_buffer_policy, gather_send_buffer_func );
Kokkos::fence();

// Allocate a receive buffer. Note this one is layout right so the
// components are consecutive.
Kokkos::View<typename Slice_t::value_type**, Kokkos::LayoutRight,
typename Halo_t::memory_space>
recv_buffer(
Kokkos::ViewAllocateWithoutInitializing( "halo_recv_buffer" ),
halo.totalNumImport(), num_comp );

// The halo has its own communication space so choose any mpi tag.
const int mpi_tag = 2345;
// Post non-blocking receives.
int num_n = halo.numNeighbor();
std::vector<MPI_Request> requests( num_n );
std::pair<std::size_t, std::size_t> recv_range = { 0, 0 };
for ( int n = 0; n < num_n; ++n )
{
recv_range.second = recv_range.first + halo.numImport( n );

auto recv_subview =
Kokkos::subview( recv_buffer, recv_range, Kokkos::ALL );

MPI_Irecv( recv_subview.data(),
recv_subview.size() * sizeof( typename Slice_t::value_type ),
MPI_BYTE, halo.neighborRank( n ), mpi_tag, halo.comm(),
&( requests[n] ) );

recv_range.first = recv_range.second;
}

// Do blocking sends.
std::pair<std::size_t, std::size_t> send_range = { 0, 0 };
for ( int n = 0; n < num_n; ++n )
{
send_range.second = send_range.first + halo.numExport( n );

auto send_subview =
Kokkos::subview( send_buffer, send_range, Kokkos::ALL );

MPI_Send( send_subview.data(),
send_subview.size() * sizeof( typename Slice_t::value_type ),
MPI_BYTE, halo.neighborRank( n ), mpi_tag, halo.comm() );

send_range.first = send_range.second;
}

// Wait on non-blocking receives.
std::vector<MPI_Status> status( num_n );
const int ec =
MPI_Waitall( requests.size(), requests.data(), status.data() );
if ( MPI_SUCCESS != ec )
throw std::logic_error( "Failed MPI Communication" );

// Extract the receive buffer into the ghosted elements.
std::size_t num_local = halo.numLocal();
auto extract_recv_buffer_func = KOKKOS_LAMBDA( const std::size_t i )
{
std::size_t ghost_idx = i + num_local;
auto s = Slice_t::index_type::s( ghost_idx );
auto a = Slice_t::index_type::a( ghost_idx );
std::size_t slice_offset = s * slice.stride( 0 ) + a;
for ( std::size_t n = 0; n < num_comp; ++n )
slice_data[slice_offset + Slice_t::vector_length * n] =
recv_buffer( i, n );
};
Kokkos::RangePolicy<typename Halo_t::execution_space>
extract_recv_buffer_policy( 0, halo.totalNumImport() );
Kokkos::parallel_for( "Cabana::gather::extract_recv_buffer",
extract_recv_buffer_policy,
extract_recv_buffer_func );
Kokkos::fence();

// Barrier before completing to ensure synchronization.
MPI_Barrier( halo.comm() );

Impl::sendBuffer( halo, slice, send_buffer, num_comp );
Impl::recvBuffer( halo, slice, send_buffer, num_comp );
}

// ---------------------------------------------------------------------------//
/*!
\brief Synchronously gather data from the local decomposition to the ghosts
using the halo forward communication plan. Slice version, where the buffer is
modified before being sent. This is a uniquely-owned to multiply-owned
communication.

A gather sends data from locally owned elements to one or many ranks on
which they exist as ghosts. A locally owned element may be sent to as many
ranks as desired to be used as a ghost on those ranks. The value of the
element in the locally owned decomposition will be the value assigned to the
element in the ghosted decomposition.

\tparam Halo_t Halo type - must be a Halo.

\tparam Slice_t Slice type - must be a slice.

\tparam Modify_t Buffer modification type.

\param halo The halo to use for the gather.

\param slice The slice on which to perform the gather. The slice should have
a size equivalent to halo.numGhost() + halo.numLocal(). The locally owned
elements are expected to appear first (i.e. in the first halo.numLocal()
elements) and the ghosted elements are expected to appear second (i.e. in
the next halo.numGhost() elements()).

\param modify_functor Class containing functor to modify the send buffer
before being sent (e.g. for periodic coordinate update).
*/
template <class Halo_t, class Slice_t, class Modify_t>
void gather( const Halo_t& halo, Slice_t& slice, const Modify_t& modify_functor,
typename std::enable_if<( is_halo<Halo_t>::value &&
is_slice<Slice_t>::value ),
int>::type* = 0 )
{
// Check that the slice is the right size.
Impl::checkSize( halo, slice );

// Get the number of components in the slice. The slice is not unrolled to
// enable indexing during buffer modification.
std::size_t num_comp = slice.extent( slice.rank() - 1 );

// Allocate a send buffer. Note this one is layout right so the components
// are consecutive.
Kokkos::View<typename Slice_t::value_type**, Kokkos::LayoutRight,
typename Halo_t::memory_space>
send_buffer(
Kokkos::ViewAllocateWithoutInitializing( "halo_send_buffer" ),
halo.totalNumExport(), num_comp );

Impl::sendBuffer( halo, slice, send_buffer, num_comp, modify_functor );
Impl::recvBuffer( halo, slice, send_buffer, num_comp );
}
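Put together, the two new slice overloads can be driven as below. This is a hedged usage sketch, not part of the commit; the halo, the position slice, and the PeriodicShift functor from the earlier sketch are assumed to already exist:

    // Illustrative call pattern for the slice gathers added in this commit.
    template <class HaloType, class SliceType>
    void gather_examples( const HaloType& halo, SliceType& positions )
    {
        // Plain gather: ghosts receive the owned values unchanged.
        Cabana::gather( halo, positions );

        // Gather with a send-buffer modification, e.g. a periodic shift.
        Cabana::gather( halo, positions, PeriodicShift{ { 1.0, 0.0, 0.0 } } );
    }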

//---------------------------------------------------------------------------//
@@ -709,6 +797,6 @@ void scatter( const Halo_t& halo, Slice_t& slice,

//---------------------------------------------------------------------------//

} // end namespace Cabana
} // namespace Cabana

#endif // end CABANA_HALO_HPP