Add grid gather with slices
streeve committed Jan 13, 2021
1 parent 552abb0 commit 47fe0a8
Showing 3 changed files with 504 additions and 244 deletions.
274 changes: 181 additions & 93 deletions core/src/Cabana_Halo.hpp
@@ -203,18 +203,20 @@ struct is_halo : public is_halo_impl<typename std::remove_cv<T>::type>::type
namespace Impl
{

//---------------------------------------------------------------------------//
// Check that the AoSoA/slice is the right size.
template <class Halo_t, class Container_t>
void checkSize( const Halo_t& halo, Container_t& container )
{
// Check that the AoSoA/slice is the right size.
if ( container.size() != halo.numLocal() + halo.numGhost() )
throw std::runtime_error( "AoSoA/slice is the wrong size for gather!" );
}
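As a usage illustration (not part of this diff), the size contract checked above means the container handed to a gather must hold the halo.numLocal() owned elements first and the halo.numGhost() ghosts after them. A minimal sketch, assuming a halo has already been built and using illustrative member types and device:

    #include <Cabana_Core.hpp>

    // Illustrative sketch only: size the container as numLocal() + numGhost()
    // owned-then-ghost elements so checkSize() passes, then gather a slice.
    template <class HaloType, class DeviceType>
    void gather_positions_example( const HaloType& halo )
    {
        using DataTypes = Cabana::MemberTypes<double[3], int>;
        Cabana::AoSoA<DataTypes, DeviceType> particles(
            "particles", halo.numLocal() + halo.numGhost() );
        auto positions = Cabana::slice<0>( particles );
        Cabana::gather( halo, positions );
    }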

//---------------------------------------------------------------------------//
// Fill the data to send only. AoSoA variant.
template <class Halo_t, class AoSoA_t, class View_t>
void sendBuffer( const Halo_t& halo, AoSoA_t& aosoa, View_t& send_buffer )
{

// Get the steering vector for the sends.
auto steering = halo.getExportSteering();

@@ -230,6 +232,7 @@ void sendBuffer( const Halo_t& halo, AoSoA_t& aosoa, View_t& send_buffer )
Kokkos::fence();
}

// Fill the data to send with modifications. AoSoA variant.
template <class Halo_t, class AoSoA_t, class View_t, class Modify_t>
void sendBuffer( const Halo_t& halo, AoSoA_t& aosoa, View_t& send_buffer,
const Modify_t& modify_functor )
@@ -251,6 +254,7 @@ void sendBuffer( const Halo_t& halo, AoSoA_t& aosoa, View_t& send_buffer,
Kokkos::fence();
}

// Extract the received data. AoSoA variant.
template <class Halo_t, class AoSoA_t, class View_t>
void recvBuffer( const Halo_t& halo, AoSoA_t& aosoa, const View_t& send_buffer )
{
@@ -321,6 +325,128 @@ void recvBuffer( const Halo_t& halo, AoSoA_t& aosoa, const View_t& send_buffer )
MPI_Barrier( halo.comm() );
}

//---------------------------------------------------------------------------//
// Fill the data to send only. Slice variant.
template <class Halo_t, class Slice_t, class View_t>
void sendBuffer( const Halo_t& halo, Slice_t& slice, View_t& send_buffer,
std::size_t num_comp )
{
// Get the raw slice data.
auto slice_data = slice.data();

// Get the steering vector for the sends.
auto steering = halo.getExportSteering();

// Gather from the local data into a tuple-contiguous send buffer.
auto gather_send_buffer_func = KOKKOS_LAMBDA( const std::size_t i )
{
auto s = Slice_t::index_type::s( steering( i ) );
auto a = Slice_t::index_type::a( steering( i ) );
std::size_t slice_offset = s * slice.stride( 0 ) + a;
for ( std::size_t n = 0; n < num_comp; ++n )
send_buffer( i, n ) =
slice_data[slice_offset + n * Slice_t::vector_length];
};
Kokkos::RangePolicy<typename Halo_t::execution_space>
gather_send_buffer_policy( 0, halo.totalNumExport() );
Kokkos::parallel_for( "Cabana::gather::gather_send_buffer",
gather_send_buffer_policy, gather_send_buffer_func );
Kokkos::fence();
}
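The buffer fill above relies on the AoSoA index mapping: exported particle steering(i) is split into a struct index s and an intra-struct array index a, and component n of that particle sits at s * stride(0) + a + n * vector_length in the flat slice data. A small worked sketch (illustrative numbers only; it assumes a 3-component slice in a single-member AoSoA with vector length 16, so stride(0) is 3 * 16):

    #include <cstddef>
    #include <iostream>

    int main()
    {
        // Assumed layout: single-member AoSoA, 3 components, vector length 16.
        constexpr std::size_t vector_length = 16;
        constexpr std::size_t num_comp = 3;
        constexpr std::size_t stride0 = num_comp * vector_length; // slice.stride( 0 )

        std::size_t p = 37;                // particle index from the steering vector
        std::size_t s = p / vector_length; // struct index
        std::size_t a = p % vector_length; // array index within the struct

        for ( std::size_t n = 0; n < num_comp; ++n )
            std::cout << "component " << n << " -> flat offset "
                      << s * stride0 + a + n * vector_length << "\n";
        return 0;
    }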

// Fill the data to send with modifications. Slice variant.
template <class Halo_t, class Slice_t, class View_t, class Modify_t>
void sendBuffer( const Halo_t& halo, Slice_t& slice, View_t& send_buffer,
std::size_t num_comp, const Modify_t& modify_functor )
{
// Get the steering vector for the sends.
auto steering = halo.getExportSteering();

// Gather from the local data into a tuple-contiguous send buffer.
auto gather_send_buffer_func = KOKKOS_LAMBDA( const std::size_t i )
{
for ( std::size_t n = 0; n < num_comp; ++n )
{
send_buffer( i, n ) = slice( i, n );
modify_functor( send_buffer, i, n );
}
};
Kokkos::RangePolicy<typename Halo_t::execution_space>
gather_send_buffer_policy( 0, halo.totalNumExport() );
Kokkos::parallel_for( "Cabana::gather::gather_send_buffer",
gather_send_buffer_policy, gather_send_buffer_func );
Kokkos::fence();
}
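The modify variant above calls modify_functor( send_buffer, i, n ) on every packed entry before the exchange. One hypothetical functor shape that would satisfy this interface (the name and the periodic-shift use case are illustrative assumptions, not part of this commit):

    #include <Kokkos_Core.hpp>

    // Hypothetical buffer-modify functor: shifts each packed coordinate
    // component before it is sent, e.g. across a periodic boundary.
    struct PeriodicShift
    {
        double shift[3]; // assumed per-component shift, e.g. +/- the domain length

        template <class BufferType>
        KOKKOS_INLINE_FUNCTION void operator()( const BufferType& send_buffer,
                                                const std::size_t i,
                                                const std::size_t n ) const
        {
            send_buffer( i, n ) += shift[n];
        }
    };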

template <class Halo_t, class Slice_t, class View_t>
void recvBuffer( const Halo_t& halo, Slice_t& slice, const View_t& send_buffer,
std::size_t num_comp )
{
// Get the raw slice data.
auto slice_data = slice.data();

// Allocate a receive buffer. Note this one is layout right so the
// components are consecutive.
Kokkos::View<typename Slice_t::value_type**, Kokkos::LayoutRight,
typename Halo_t::memory_space>
recv_buffer(
Kokkos::ViewAllocateWithoutInitializing( "halo_recv_buffer" ),
halo.totalNumImport(), num_comp );

// The halo has its own communication space so choose any mpi tag.
const int mpi_tag = 2345;

// Post non-blocking receives.
int num_n = halo.numNeighbor();
std::vector<MPI_Request> requests( num_n );
std::pair<std::size_t, std::size_t> recv_range = { 0, 0 };
for ( int n = 0; n < num_n; ++n )
{
recv_range.second = recv_range.first + halo.numImport( n );

auto recv_subview =
Kokkos::subview( recv_buffer, recv_range, Kokkos::ALL );

MPI_Irecv( recv_subview.data(),
recv_subview.size() * sizeof( typename Slice_t::value_type ),
MPI_BYTE, halo.neighborRank( n ), mpi_tag, halo.comm(),
&( requests[n] ) );

recv_range.first = recv_range.second;
}

// Do blocking sends.
std::pair<std::size_t, std::size_t> send_range = { 0, 0 };
for ( int n = 0; n < num_n; ++n )
{
send_range.second = send_range.first + halo.numExport( n );

auto send_subview =
Kokkos::subview( send_buffer, send_range, Kokkos::ALL );

MPI_Send( send_subview.data(),
send_subview.size() * sizeof( typename Slice_t::value_type ),
MPI_BYTE, halo.neighborRank( n ), mpi_tag, halo.comm() );

send_range.first = send_range.second;
}

// Wait on non-blocking receives.
std::vector<MPI_Status> status( num_n );
const int ec =
MPI_Waitall( requests.size(), requests.data(), status.data() );
if ( MPI_SUCCESS != ec )
throw std::logic_error( "Failed MPI Communication" );

// Extract the receive buffer into the ghosted elements.
std::size_t num_local = halo.numLocal();
auto extract_recv_buffer_func = KOKKOS_LAMBDA( const std::size_t i )
{
std::size_t ghost_idx = i + num_local;
auto s = Slice_t::index_type::s( ghost_idx );
auto a = Slice_t::index_type::a( ghost_idx );
std::size_t slice_offset = s * slice.stride( 0 ) + a;
for ( std::size_t n = 0; n < num_comp; ++n )
slice_data[slice_offset + Slice_t::vector_length * n] =
recv_buffer( i, n );
};
Kokkos::RangePolicy<typename Halo_t::execution_space>
extract_recv_buffer_policy( 0, halo.totalNumImport() );
Kokkos::parallel_for( "Cabana::gather::extract_recv_buffer",
extract_recv_buffer_policy,
extract_recv_buffer_func );
Kokkos::fence();

// Barrier before completing to ensure synchronization.
MPI_Barrier( halo.comm() );
}

} // namespace Impl

//---------------------------------------------------------------------------//
@@ -413,7 +539,7 @@ void gather( const Halo_t& halo, AoSoA_t& aosoa,
Impl::recvBuffer( halo, aosoa, send_buffer );
}
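For the AoSoA path above, the entire container can be gathered in one call (an illustrative sketch, not part of this diff; the halo and particle container are assumed to exist as in the earlier sketch):

    // Illustrative: gather every member of the AoSoA into the ghosts at once.
    template <class HaloType, class AoSoAType>
    void gather_aosoa_example( const HaloType& halo, AoSoAType& particles )
    {
        Cabana::gather( halo, particles );
    }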

//---------------------------------------------------------------------------//
// ---------------------------------------------------------------------------//
/*!
\brief Synchronously gather data from the local decomposition to the ghosts
using the halo forward communication plan. Slice version. This is a
@@ -427,11 +553,11 @@ void gather( const Halo_t& halo, AoSoA_t& aosoa,
\tparam Halo_t Halo type - must be a Halo.
\tparam Slice_t Slice type - must be a Slice.
\tparam Slice_t Slice type - must be a slice.
\param halo The halo to use for the gather.
\param slice The Slice on which to perform the gather. The Slice should have
\param slice The slice on which to perform the gather. The slice should have
a size equivalent to halo.numGhost() + halo.numLocal(). The locally owned
elements are expected to appear first (i.e. in the first halo.numLocal()
elements) and the ghosted elements are expected to appear second (i.e. in
@@ -443,17 +569,15 @@ void gather( const Halo_t& halo, Slice_t& slice,
is_slice<Slice_t>::value ),
int>::type* = 0 )
{
// Check that the Slice is the right size.
// Check that the slice is the right size.
Impl::checkSize( halo, slice );

// Get the number of components in the slice.
// Get the number of components in the slice. Here the slice is unrolled,
// including the underlying SoA.
std::size_t num_comp = 1;
for ( std::size_t d = 2; d < slice.rank(); ++d )
num_comp *= slice.extent( d );

// Get the raw slice data.
auto slice_data = slice.data();

// Allocate a send buffer. Note this one is layout right so the components
// are consecutive.
Kokkos::View<typename Slice_t::value_type**, Kokkos::LayoutRight,
@@ -462,99 +586,63 @@ void gather( const Halo_t& halo, Slice_t& slice,
Kokkos::ViewAllocateWithoutInitializing( "halo_send_buffer" ),
halo.totalNumExport(), num_comp );

// Get the steering vector for the sends.
auto steering = halo.getExportSteering();

// Gather from the local data into a tuple-contiguous send buffer.
auto gather_send_buffer_func = KOKKOS_LAMBDA( const std::size_t i )
{
auto s = Slice_t::index_type::s( steering( i ) );
auto a = Slice_t::index_type::a( steering( i ) );
std::size_t slice_offset = s * slice.stride( 0 ) + a;
for ( std::size_t n = 0; n < num_comp; ++n )
send_buffer( i, n ) =
slice_data[slice_offset + n * Slice_t::vector_length];
};
Kokkos::RangePolicy<typename Halo_t::execution_space>
gather_send_buffer_policy( 0, halo.totalNumExport() );
Kokkos::parallel_for( "Cabana::gather::gather_send_buffer",
gather_send_buffer_policy, gather_send_buffer_func );
Kokkos::fence();

// Allocate a receive buffer. Note this one is layout right so the
// components are consecutive.
Kokkos::View<typename Slice_t::value_type**, Kokkos::LayoutRight,
typename Halo_t::memory_space>
recv_buffer(
Kokkos::ViewAllocateWithoutInitializing( "halo_recv_buffer" ),
halo.totalNumImport(), num_comp );

// The halo has its own communication space so choose any mpi tag.
const int mpi_tag = 2345;
// Post non-blocking receives.
int num_n = halo.numNeighbor();
std::vector<MPI_Request> requests( num_n );
std::pair<std::size_t, std::size_t> recv_range = { 0, 0 };
for ( int n = 0; n < num_n; ++n )
{
recv_range.second = recv_range.first + halo.numImport( n );

auto recv_subview =
Kokkos::subview( recv_buffer, recv_range, Kokkos::ALL );

MPI_Irecv( recv_subview.data(),
recv_subview.size() * sizeof( typename Slice_t::value_type ),
MPI_BYTE, halo.neighborRank( n ), mpi_tag, halo.comm(),
&( requests[n] ) );

recv_range.first = recv_range.second;
}

// Do blocking sends.
std::pair<std::size_t, std::size_t> send_range = { 0, 0 };
for ( int n = 0; n < num_n; ++n )
{
send_range.second = send_range.first + halo.numExport( n );

auto send_subview =
Kokkos::subview( send_buffer, send_range, Kokkos::ALL );

MPI_Send( send_subview.data(),
send_subview.size() * sizeof( typename Slice_t::value_type ),
MPI_BYTE, halo.neighborRank( n ), mpi_tag, halo.comm() );

send_range.first = send_range.second;
}

// Wait on non-blocking receives.
std::vector<MPI_Status> status( num_n );
const int ec =
MPI_Waitall( requests.size(), requests.data(), status.data() );
if ( MPI_SUCCESS != ec )
throw std::logic_error( "Failed MPI Communication" );

// Extract the receive buffer into the ghosted elements.
std::size_t num_local = halo.numLocal();
auto extract_recv_buffer_func = KOKKOS_LAMBDA( const std::size_t i )
{
std::size_t ghost_idx = i + num_local;
auto s = Slice_t::index_type::s( ghost_idx );
auto a = Slice_t::index_type::a( ghost_idx );
std::size_t slice_offset = s * slice.stride( 0 ) + a;
for ( std::size_t n = 0; n < num_comp; ++n )
slice_data[slice_offset + Slice_t::vector_length * n] =
recv_buffer( i, n );
};
Kokkos::RangePolicy<typename Halo_t::execution_space>
extract_recv_buffer_policy( 0, halo.totalNumImport() );
Kokkos::parallel_for( "Cabana::gather::extract_recv_buffer",
extract_recv_buffer_policy,
extract_recv_buffer_func );
Kokkos::fence();

// Barrier before completing to ensure synchronization.
MPI_Barrier( halo.comm() );

Impl::sendBuffer( halo, slice, send_buffer, num_comp );
Impl::recvBuffer( halo, slice, send_buffer, num_comp );
}

// ---------------------------------------------------------------------------//
/*!
\brief Synchronously gather data from the local decomposition to the ghosts
using the halo forward communication plan. Slice version, where the buffer is
modified before being sent. This is a uniquely-owned to multiply-owned
communication.

A gather sends data from locally owned elements to one or many ranks on
which they exist as ghosts. A locally owned element may be sent to as many
ranks as desired to be used as a ghost on those ranks. The value of the
element in the locally owned decomposition will be the value assigned to the
element in the ghosted decomposition.

\tparam Halo_t Halo type - must be a Halo.

\tparam Slice_t Slice type - must be a slice.

\tparam Modify_t Buffer modification type.

\param halo The halo to use for the gather.

\param slice The slice on which to perform the gather. The slice should have
a size equivalent to halo.numGhost() + halo.numLocal(). The locally owned
elements are expected to appear first (i.e. in the first halo.numLocal()
elements) and the ghosted elements are expected to appear second (i.e. in
the next halo.numGhost() elements()).

\param modify_functor Class containing functor to modify the send buffer
before being sent (e.g. for periodic coordinate update).
*/
template <class Halo_t, class Slice_t, class Modify_t>
void gather( const Halo_t& halo, Slice_t& slice, const Modify_t& modify_functor,
typename std::enable_if<( is_halo<Halo_t>::value &&
is_slice<Slice_t>::value ),
int>::type* = 0 )
{
// Check that the slice is the right size.
Impl::checkSize( halo, slice );

// Get the number of components in the slice. The slice is not unrolled to
// enable indexing during buffer modification.
std::size_t num_comp = slice.extent( slice.rank() - 1 );

// Allocate a send buffer. Note this one is layout right so the components
// are consecutive.
Kokkos::View<typename Slice_t::value_type**, Kokkos::LayoutRight,
typename Halo_t::memory_space>
send_buffer(
Kokkos::ViewAllocateWithoutInitializing( "halo_send_buffer" ),
halo.totalNumExport(), num_comp );

Impl::sendBuffer( halo, slice, send_buffer, num_comp, modify_functor );
Impl::recvBuffer( halo, slice, send_buffer, num_comp );
}
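Put together, the two new slice overloads can be driven as below. This is a hedged usage sketch, not part of the commit; the halo, the position slice, and the PeriodicShift functor from the earlier sketch are assumed to already exist:

    // Illustrative call pattern for the slice gathers added in this commit.
    template <class HaloType, class SliceType>
    void gather_examples( const HaloType& halo, SliceType& positions )
    {
        // Plain gather: ghosts receive the owned values unchanged.
        Cabana::gather( halo, positions );

        // Gather with a send-buffer modification, e.g. a periodic shift.
        Cabana::gather( halo, positions, PeriodicShift{ { 1.0, 0.0, 0.0 } } );
    }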

//---------------------------------------------------------------------------//
@@ -709,6 +797,6 @@ void scatter( const Halo_t& halo, Slice_t& slice,

//---------------------------------------------------------------------------//

} // end namespace Cabana
} // namespace Cabana

#endif // end CABANA_HALO_HPP