From 7a5fa78e98c18e3e62552e763fabfe2f64444d88 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Mon, 13 Jun 2022 16:15:22 -0700 Subject: [PATCH 01/42] Rename SparseDimPartitioner into DynamicPartitioner --- ... Cajita_DynamicPartitionerPerformance.cpp} | 6 +++--- cajita/src/CMakeLists.txt | 2 +- cajita/src/Cajita.hpp | 2 +- ...oner.hpp => Cajita_DynamicPartitioner.hpp} | 20 +++++++++---------- cajita/unit_test/CMakeLists.txt | 2 +- ...titioner.hpp => tstDynamicPartitioner.hpp} | 6 +++--- cajita/unit_test/tstGlobalGrid.hpp | 4 ++-- cajita/unit_test/tstPartitioner.hpp | 2 +- 8 files changed, 22 insertions(+), 22 deletions(-) rename benchmark/cajita/{Cajita_SparsePartitionerPerformance.cpp => Cajita_DynamicPartitionerPerformance.cpp} (98%) rename cajita/src/{Cajita_SparseDimPartitioner.hpp => Cajita_DynamicPartitioner.hpp} (98%) rename cajita/unit_test/{tstSparseDimPartitioner.hpp => tstDynamicPartitioner.hpp} (99%) diff --git a/benchmark/cajita/Cajita_SparsePartitionerPerformance.cpp b/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp similarity index 98% rename from benchmark/cajita/Cajita_SparsePartitionerPerformance.cpp rename to benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp index 4706bdf4e..72f19b4af 100644 --- a/benchmark/cajita/Cajita_SparsePartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp @@ -12,7 +12,7 @@ #include "../Cabana_BenchmarkUtils.hpp" #include "Cabana_ParticleInit.hpp" -#include +#include #include #include @@ -142,7 +142,7 @@ void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, int num_tiles_per_dim = num_cells_per_dim[c] >> cell_bits_per_tile_dim; // set up partitioner - Cajita::SparseDimPartitioner partitioner( + Cajita::DynamicPartitioner partitioner( comm, max_workload_coeff, max_par_num, num_step_rebalance, global_num_cell, max_optimize_iteration ); auto ranks_per_dim = @@ -273,7 +273,7 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, // set up partitioner auto total_num = num_tiles_per_dim * num_tiles_per_dim * num_tiles_per_dim; - Cajita::SparseDimPartitioner partitioner( + Cajita::DynamicPartitioner partitioner( comm, max_workload_coeff, total_num, num_step_rebalance, global_num_cell, max_optimize_iteration ); auto ranks_per_dim = diff --git a/cajita/src/CMakeLists.txt b/cajita/src/CMakeLists.txt index 6c9b5c6d1..9addb651d 100644 --- a/cajita/src/CMakeLists.txt +++ b/cajita/src/CMakeLists.txt @@ -35,7 +35,7 @@ set(HEADERS_PUBLIC Cajita_Splines.hpp Cajita_Types.hpp Cajita_UniformDimPartitioner.hpp - Cajita_SparseDimPartitioner.hpp + Cajita_DynamicPartitioner.hpp Cajita_SparseArray.hpp ) diff --git a/cajita/src/Cajita.hpp b/cajita/src/Cajita.hpp index e0424120c..2b39bf147 100644 --- a/cajita/src/Cajita.hpp +++ b/cajita/src/Cajita.hpp @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/cajita/src/Cajita_SparseDimPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp similarity index 98% rename from cajita/src/Cajita_SparseDimPartitioner.hpp rename to cajita/src/Cajita_DynamicPartitioner.hpp index 1e6e8360d..bdb362c34 100644 --- a/cajita/src/Cajita_SparseDimPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -10,11 +10,11 @@ ****************************************************************************/ /*! 
- \file Cajita_SparseDimPartitioner.hpp - \brief Multi-node sparse grid partitioner + \file Cajita_DynamicPartitioner.hpp + \brief Multi-node dynamic grid partitioner */ -#ifndef CAJITA_SPARSEDIMPARTITIONER_HPP -#define CAJITA_SPARSEDIMPARTITIONER_HPP +#ifndef CAJITA_DYNAMICPARTITIONER_HPP +#define CAJITA_DYNAMICPARTITIONER_HPP #include #include @@ -29,14 +29,14 @@ namespace Cajita { //---------------------------------------------------------------------------// /*! - Sparse mesh block partitioner. (Current Version: Support 3D only) + Dynamic mesh block partitioner. (Current Version: Support 3D only) \tparam Device Kokkos device type. \tparam CellPerTileDim Cells per tile per dimension. \tparam NumSpaceDim Dimemsion (The current version support 3D only) */ template -class SparseDimPartitioner : public BlockPartitioner +class DynamicPartitioner : public BlockPartitioner { public: //! dimension @@ -82,7 +82,7 @@ class SparseDimPartitioner : public BlockPartitioner \param global_cells_per_dim 3D array, global cells in each dimension \param max_optimize_iteration max iteration number to run the optimization */ - SparseDimPartitioner( + DynamicPartitioner( MPI_Comm comm, float max_workload_coeff, int workload_num, int num_step_rebalance, const std::array& global_cells_per_dim, @@ -111,7 +111,7 @@ class SparseDimPartitioner : public BlockPartitioner \param global_cells_per_dim 3D array, global cells in each dimension \param max_optimize_iteration max iteration number to run the optimization */ - SparseDimPartitioner( + DynamicPartitioner( MPI_Comm comm, float max_workload_coeff, int workload_num, int num_step_rebalance, const std::array& ranks_per_dim, @@ -171,7 +171,7 @@ class SparseDimPartitioner : public BlockPartitioner nrank *= _ranks_per_dim[d]; if ( comm_size != nrank ) throw std::runtime_error( - "SparsePartitioner ranks do not match comm size" ); + "DynamicPartitioner ranks do not match comm size" ); return ranks_per_dim; } @@ -905,4 +905,4 @@ class SparseDimPartitioner : public BlockPartitioner }; } // end namespace Cajita -#endif // end CAJITA_SPARSEDIMPARTITIONER_HPP +#endif // end CAJITA_DYNAMICPARTITIONER_HPP diff --git a/cajita/unit_test/CMakeLists.txt b/cajita/unit_test/CMakeLists.txt index 6693944dc..535f52a0a 100644 --- a/cajita/unit_test/CMakeLists.txt +++ b/cajita/unit_test/CMakeLists.txt @@ -37,7 +37,7 @@ set(MPI_TESTS Interpolation2d BovWriter Parallel - SparseDimPartitioner + DynamicPartitioner Partitioner SparseArray ) diff --git a/cajita/unit_test/tstSparseDimPartitioner.hpp b/cajita/unit_test/tstDynamicPartitioner.hpp similarity index 99% rename from cajita/unit_test/tstSparseDimPartitioner.hpp rename to cajita/unit_test/tstDynamicPartitioner.hpp index 9eb6c185b..db1bfde46 100644 --- a/cajita/unit_test/tstSparseDimPartitioner.hpp +++ b/cajita/unit_test/tstDynamicPartitioner.hpp @@ -9,7 +9,7 @@ * SPDX-License-Identifier: BSD-3-Clause * ****************************************************************************/ -#include +#include #include #include @@ -47,7 +47,7 @@ void uniform_distribution_automatic_rank() size_tile_per_dim * cell_per_tile_dim }; // partitioner - SparseDimPartitioner partitioner( + DynamicPartitioner partitioner( MPI_COMM_WORLD, max_workload_coeff, workload_num, num_step_rebalance, global_cells_per_dim, max_optimize_iteration ); @@ -309,7 +309,7 @@ void random_distribution_automatic_rank( int occupy_num_per_rank, size_per_dim }; // partitioner - SparseDimPartitioner partitioner( + DynamicPartitioner partitioner( MPI_COMM_WORLD, max_workload_coeff, 
particle_num, num_step_rebalance, global_cells_per_dim, max_optimize_iteration ); diff --git a/cajita/unit_test/tstGlobalGrid.hpp b/cajita/unit_test/tstGlobalGrid.hpp index bdf0966e2..bf1926601 100644 --- a/cajita/unit_test/tstGlobalGrid.hpp +++ b/cajita/unit_test/tstGlobalGrid.hpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include @@ -431,7 +431,7 @@ void sparseGridTest3d() int num_step_rebalance = 100; int max_optimize_iteration = 10; - SparseDimPartitioner partitioner( + DynamicPartitioner partitioner( MPI_COMM_WORLD, max_workload_coeff, workload_num, num_step_rebalance, global_num_cell, max_optimize_iteration ); diff --git a/cajita/unit_test/tstPartitioner.hpp b/cajita/unit_test/tstPartitioner.hpp index c001da746..3fb6470bc 100644 --- a/cajita/unit_test/tstPartitioner.hpp +++ b/cajita/unit_test/tstPartitioner.hpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include From 8c8e975c9f29a4a0a9d10aa4ab6f96130eea9886 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Mon, 13 Jun 2022 16:19:04 -0700 Subject: [PATCH 02/42] Clean up optimizePartitionAlongDim --- cajita/src/Cajita_DynamicPartitioner.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index bdb362c34..145de21be 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -521,7 +521,7 @@ class DynamicPartitioner : public BlockPartitioner random_dim_id = std::rand() % num_space_dim; bool is_dim_changed = false; // record changes in current dim - optimizePartition( is_dim_changed, random_dim_id ); + optimizePartitionAlongDim( random_dim_id, is_dim_changed ); // update control info is_changed = is_changed || is_dim_changed; @@ -560,7 +560,7 @@ class DynamicPartitioner : public BlockPartitioner random_dim_id = std::rand() % num_space_dim; bool is_dim_changed = false; // record changes in current dim - optimizePartition( is_dim_changed, random_dim_id ); + optimizePartitionAlongDim( random_dim_id, is_dim_changed ); // update control info is_changed = is_changed || is_dim_changed; @@ -575,11 +575,11 @@ class DynamicPartitioner : public BlockPartitioner /*! 
\brief optimize the partition in three dimensions seperately - \param is_changed label if the partition is changed after the optimization \param iter_seed seed number to choose the starting dimension of the optimization + \param is_changed label if the partition is changed after the optimization */ - void optimizePartition( bool& is_changed, int iter_seed ) + void optimizePartitionAlongDim( int iter_seed, bool& is_changed ) { is_changed = false; // loop over three dimensions, optimize the partition in dimension di From ae62b97a1e9c681e88d5f389fb4be9f8ace4196c Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Mon, 13 Jun 2022 16:52:36 -0700 Subject: [PATCH 03/42] Rename computeLocalWorkLoad into setLocalWorkloadByParticles & setLocalWorkloadBySparseMap --- .../Cajita_DynamicPartitionerPerformance.cpp | 17 ++--- cajita/src/Cajita_DynamicPartitioner.hpp | 68 ++++--------------- cajita/unit_test/tstDynamicPartitioner.hpp | 13 ++-- 3 files changed, 29 insertions(+), 69 deletions(-) diff --git a/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp index 72f19b4af..f5a1dcd93 100644 --- a/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp @@ -189,9 +189,10 @@ void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, // compute local workload local_workload_timer.start( p ); - partitioner.computeLocalWorkLoad( pos_view, par_num, - global_low_corner, - 1.0f / num_cells_per_dim[c] ); + partitioner.setLocalWorkloadByParticles( pos_view, par_num, + global_low_corner, + 1.0f / num_cells_per_dim[c], + comm ); local_workload_timer.stop( p ); // compute prefix sum matrix @@ -205,8 +206,8 @@ void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, total_optimize_timer.start( p ); for ( int i = 0; i < max_optimize_iteration; ++i ) { - partitioner.optimizePartition( is_changed, - std::rand() % 3 ); + partitioner.optimizePartitionAlongDim( is_changed, + std::rand() % 3 ); if ( !is_changed ) break; } @@ -326,7 +327,7 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, // compute local workload local_workload_timer.start( frac ); - partitioner.computeLocalWorkLoad( sis ); + partitioner.setLocalWorkloadBySparseMap( sis, comm ); local_workload_timer.stop( frac ); // compute prefix sum matrix @@ -340,8 +341,8 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, total_optimize_timer.start( frac ); for ( int i = 0; i < max_optimize_iteration; ++i ) { - partitioner.optimizePartition( is_changed, - std::rand() % 3 ); + partitioner.optimizePartitionAlongDim( is_changed, + std::rand() % 3 ); if ( !is_changed ) break; } diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index 145de21be..703d5bfcf 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -343,12 +343,13 @@ class DynamicPartitioner : public BlockPartitioner \param global_lower_corner the coordinate of the domain global lower corner \param dx cell dx size + \param comm MPI communicator used for workload reduction */ template - void computeLocalWorkLoad( const ParticlePosViewType& view, - int particle_num, - const ArrayType& global_lower_corner, - const CellUnit dx ) + void setLocalWorkloadByParticles( const ParticlePosViewType& view, + int particle_num, + const ArrayType& global_lower_corner, + const CellUnit dx, MPI_Comm comm ) { resetWorkload(); 
// make a local copy @@ -375,15 +376,18 @@ class DynamicPartitioner : public BlockPartitioner Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tz + 1 ) ); } ); Kokkos::fence(); + // Wait for other ranks' workload to be ready + MPI_Barrier( comm ); } /*! \brief compute the workload in the current MPI rank from sparseMap (the workload of a tile is 1 if the tile is occupied, 0 otherwise) \param sparseMap sparseMap in the current rank + \param comm MPI communicator used for workload reduction */ template - void computeLocalWorkLoad( const SparseMapType& sparseMap ) + void setLocalWorkloadBySparseMap( const SparseMapType& sparseMap, MPI_Comm comm ) { resetWorkload(); // make a local copy @@ -402,6 +406,8 @@ class DynamicPartitioner : public BlockPartitioner } } ); Kokkos::fence(); + // Wait for other ranks' workload to be ready + MPI_Barrier( comm ); } /*! @@ -489,22 +495,11 @@ class DynamicPartitioner : public BlockPartitioner /*! \brief iteratively optimize the partition - \param view particle positions view - \param particle_num total particle number - \param global_lower_corner the coordinate of the domain global lower - corner - \param dx cell dx size \param comm MPI communicator used for workload reduction \return iteration number */ - template - int optimizePartition( const ParticlePosViewType& view, int particle_num, - const ArrayType& global_lower_corner, - const CellUnit dx, MPI_Comm comm ) + int optimizePartition( MPI_Comm comm ) { - computeLocalWorkLoad( view, particle_num, global_lower_corner, dx ); - MPI_Barrier( comm ); - computeFullPrefixSum( comm ); MPI_Barrier( comm ); @@ -534,45 +529,6 @@ class DynamicPartitioner : public BlockPartitioner return _max_optimize_iteration; } - /*! - \brief iteratively optimize the partition - \param sparseMap sparseMap in the current rank - \param comm MPI communicator used for workload reduction - \return iteration number - */ - template - int optimizePartition( const SparseMapType& sparseMap, MPI_Comm comm ) - { - computeLocalWorkLoad( sparseMap ); - MPI_Barrier( comm ); - - computeFullPrefixSum( comm ); - MPI_Barrier( comm ); - - for ( int i = 0; i < _max_optimize_iteration; ++i ) - { - bool is_changed = false; // record changes in current iteration - bool dim_covered[3] = { false, false, false }; - for ( int d = 0; d < 3; ++d ) - { - int random_dim_id = std::rand() % num_space_dim; - while ( dim_covered[random_dim_id] ) - random_dim_id = std::rand() % num_space_dim; - - bool is_dim_changed = false; // record changes in current dim - optimizePartitionAlongDim( random_dim_id, is_dim_changed ); - - // update control info - is_changed = is_changed || is_dim_changed; - dim_covered[random_dim_id] = true; - } - // return if the current partition is optimal - if ( !is_changed ) - return i; - } - return _max_optimize_iteration; - } - /*! 
\brief optimize the partition in three dimensions seperately \param iter_seed seed number to choose the starting dimension of the diff --git a/cajita/unit_test/tstDynamicPartitioner.hpp b/cajita/unit_test/tstDynamicPartitioner.hpp index db1bfde46..b27db6108 100644 --- a/cajita/unit_test/tstDynamicPartitioner.hpp +++ b/cajita/unit_test/tstDynamicPartitioner.hpp @@ -147,7 +147,8 @@ void uniform_distribution_automatic_rank() Kokkos::fence(); // compute workload and do partition optimization - partitioner.optimizePartition( sis, MPI_COMM_WORLD ); + partitioner.setLocalWorkloadBySparseMap( sis, MPI_COMM_WORLD ); + partitioner.optimizePartition( MPI_COMM_WORLD ); // check results (should be the same as the average partition) owned_cells_per_dim = partitioner.ownedCellsPerDimension( cart_comm ); @@ -428,7 +429,8 @@ void random_distribution_automatic_rank( int occupy_num_per_rank, Kokkos::fence(); // compute workload from a sparseMap and do partition optimization - partitioner.optimizePartition( sis, MPI_COMM_WORLD ); + partitioner.setLocalWorkloadBySparseMap( sis, MPI_COMM_WORLD ); + partitioner.optimizePartition( MPI_COMM_WORLD ); } // use particle positions to compute teh workload on MPI ranks else @@ -438,9 +440,10 @@ void random_distribution_automatic_rank( int occupy_num_per_rank, gt_partition, cart_rank, occupy_num_per_rank, global_low_corner, cell_size, cell_per_tile_dim ); // compute workload from a particle view and do partition optimization - partitioner.optimizePartition( particle_view, occupy_num_per_rank, - global_low_corner, cell_size, - MPI_COMM_WORLD ); + partitioner.setLocalWorkloadByParticles( particle_view, occupy_num_per_rank, + global_low_corner, cell_size, + MPI_COMM_WORLD ); + partitioner.optimizePartition( MPI_COMM_WORLD ); } // check results (should be the same as the gt_partition) From 777f9424db43467aee5b913ce1af635004a449e6 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Mon, 13 Jun 2022 17:27:37 -0700 Subject: [PATCH 04/42] Simplify optimizePartitionAlongDim --- cajita/src/Cajita_DynamicPartitioner.hpp | 37 +++++------------------- 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index 703d5bfcf..985d32eb6 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -586,51 +586,28 @@ class DynamicPartitioner : public BlockPartitioner // last_point: the opimized position for the lask partition int last_point = 0; // current_workload: the workload between [last_point, point_i) - Kokkos::View current_workload( - "current_workload", _ranks_per_dim[dj] * _ranks_per_dim[dk] ); for ( int current_rank = 1; current_rank < rank; current_rank++ ) { int last_diff = __INT_MAX__; while ( true ) { - // compute current workload between [last_point, point_i) - Kokkos::parallel_for( - "compute_current_workload", + int diff; + Kokkos::parallel_reduce( + "diff_reduce", Kokkos::RangePolicy( 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ), - KOKKOS_LAMBDA( uint32_t jnk ) { + KOKKOS_LAMBDA( const int jnk, int& update ) { int j = static_cast( jnk / rank_k ); int k = static_cast( jnk % rank_k ); - current_workload( jnk ) = compute_sub_workload( + int current_workload = compute_sub_workload( di, last_point, point_i, dj, j, dk, k ); - } ); - Kokkos::fence(); - - // compute the (w_jk^ave - w_jk^{last_point:point_i}) - Kokkos::parallel_for( - "compute_diff", - Kokkos::RangePolicy( - 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ), - KOKKOS_LAMBDA( uint32_t jnk ) { 
auto wl = - current_workload( jnk ) - ave_workload( jnk ); + current_workload - ave_workload( jnk ); // compute absolute diff (rather than squares to // avoid potential overflow) // TODO: update when Kokkos::abs() available wl = wl > 0 ? wl : -wl; - current_workload( jnk ) = wl; - } ); - Kokkos::fence(); - - // compute the sum of the difference in all rank_j*rank_k - // regions - int diff; - Kokkos::parallel_reduce( - "diff_reduce", - Kokkos::RangePolicy( - 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ), - KOKKOS_LAMBDA( const int idx, int& update ) { - update += current_workload( idx ); + update += wl; }, diff ); Kokkos::fence(); From 416b3583627afafe95158b73fa50f8059d90106d Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Tue, 14 Jun 2022 12:09:48 -0700 Subject: [PATCH 05/42] Fix cmake inside benchmark/cajita --- benchmark/cajita/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/cajita/CMakeLists.txt b/benchmark/cajita/CMakeLists.txt index 236eb1bfb..608621e29 100644 --- a/benchmark/cajita/CMakeLists.txt +++ b/benchmark/cajita/CMakeLists.txt @@ -12,8 +12,8 @@ add_executable(SparseMapPerformance Cajita_SparseMapPerformance.cpp) target_link_libraries(SparseMapPerformance Cajita) -add_executable(SparsePartitionerPerformance Cajita_SparsePartitionerPerformance.cpp) -target_link_libraries(SparsePartitionerPerformance Cajita) +add_executable(DynamicPartitionerPerformance Cajita_DynamicPartitionerPerformance.cpp) +target_link_libraries(DynamicPartitionerPerformance Cajita) add_executable(HaloPerformance Cajita_HaloPerformance.cpp) target_link_libraries(HaloPerformance Cajita) @@ -29,7 +29,7 @@ endif() if(Cabana_ENABLE_TESTING) add_test(NAME Cajita_SparseMapPerformance COMMAND ${NONMPI_PRECOMMAND} SparseMapPerformance sparsemap_output.txt) - add_test(NAME Cajita_SparsePartitionerPerformance COMMAND ${NONMPI_PRECOMMAND} SparsePartitionerPerformance sparsepartitioner_output.txt) + add_test(NAME Cajita_DynamicPartitionerPerformance COMMAND ${NONMPI_PRECOMMAND} DynamicPartitionerPerformance dynamicpartitioner_output.txt) add_test(NAME Cajita_HaloPerformance COMMAND ${NONMPI_PRECOMMAND} HaloPerformance halo_output.txt) From cfcbeb71701c3f5d5e28167ee7c194b755ea7cd3 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Mon, 20 Jun 2022 12:27:06 -0700 Subject: [PATCH 06/42] Fix compile error --- benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp index f5a1dcd93..a4456f209 100644 --- a/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp @@ -206,8 +206,8 @@ void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, total_optimize_timer.start( p ); for ( int i = 0; i < max_optimize_iteration; ++i ) { - partitioner.optimizePartitionAlongDim( is_changed, - std::rand() % 3 ); + partitioner.optimizePartitionAlongDim( std::rand() % 3, + is_changed ); if ( !is_changed ) break; } @@ -341,8 +341,8 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, total_optimize_timer.start( frac ); for ( int i = 0; i < max_optimize_iteration; ++i ) { - partitioner.optimizePartitionAlongDim( is_changed, - std::rand() % 3 ); + partitioner.optimizePartitionAlongDim( std::rand() % 3, + is_changed ); if ( !is_changed ) break; } From 6c80d6b76245543d3df17587a2903ed0e6e89be9 Mon Sep 17 00:00:00 
2001 From: Yu Fang Date: Mon, 20 Jun 2022 15:57:42 -0700 Subject: [PATCH 07/42] Format --- cajita/unit_test/tstGlobalGrid.hpp | 2 +- cajita/unit_test/tstPartitioner.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cajita/unit_test/tstGlobalGrid.hpp b/cajita/unit_test/tstGlobalGrid.hpp index bf1926601..03efceeaf 100644 --- a/cajita/unit_test/tstGlobalGrid.hpp +++ b/cajita/unit_test/tstGlobalGrid.hpp @@ -9,10 +9,10 @@ * SPDX-License-Identifier: BSD-3-Clause * ****************************************************************************/ +#include #include #include #include -#include #include #include diff --git a/cajita/unit_test/tstPartitioner.hpp b/cajita/unit_test/tstPartitioner.hpp index 3fb6470bc..0a34fd322 100644 --- a/cajita/unit_test/tstPartitioner.hpp +++ b/cajita/unit_test/tstPartitioner.hpp @@ -9,10 +9,10 @@ * SPDX-License-Identifier: BSD-3-Clause * ****************************************************************************/ +#include #include #include #include -#include #include #include From 588a5249a621e248c0f3ed40dcfff06a2f2df6d4 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Mon, 20 Jun 2022 16:07:08 -0700 Subject: [PATCH 08/42] Add SparseMapDynamicPartitioner and ParticleDynamicPartitioner --- .../Cajita_DynamicPartitionerPerformance.cpp | 21 +- cajita/src/Cajita.hpp | 2 +- cajita/src/Cajita_DynamicPartitioner.hpp | 307 +++++++++++++----- cajita/unit_test/tstDynamicPartitioner.hpp | 36 +- 4 files changed, 260 insertions(+), 106 deletions(-) diff --git a/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp index a4456f209..7be6e347c 100644 --- a/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp @@ -142,9 +142,10 @@ void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, int num_tiles_per_dim = num_cells_per_dim[c] >> cell_bits_per_tile_dim; // set up partitioner - Cajita::DynamicPartitioner partitioner( - comm, max_workload_coeff, max_par_num, num_step_rebalance, - global_num_cell, max_optimize_iteration ); + Cajita::ParticleDynamicPartitioner + partitioner( comm, max_workload_coeff, max_par_num, + num_step_rebalance, global_num_cell, + max_optimize_iteration ); auto ranks_per_dim = partitioner.ranksPerDimension( comm, global_num_cell ); auto ave_partition = @@ -189,10 +190,9 @@ void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, // compute local workload local_workload_timer.start( p ); - partitioner.setLocalWorkloadByParticles( pos_view, par_num, - global_low_corner, - 1.0f / num_cells_per_dim[c], - comm ); + partitioner.setLocalWorkloadByParticles( + pos_view, par_num, global_low_corner, + 1.0f / num_cells_per_dim[c], comm ); local_workload_timer.stop( p ); // compute prefix sum matrix @@ -274,9 +274,10 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, // set up partitioner auto total_num = num_tiles_per_dim * num_tiles_per_dim * num_tiles_per_dim; - Cajita::DynamicPartitioner partitioner( - comm, max_workload_coeff, total_num, num_step_rebalance, - global_num_cell, max_optimize_iteration ); + Cajita::SparseMapDynamicPartitioner + partitioner( comm, max_workload_coeff, total_num, + num_step_rebalance, global_num_cell, + max_optimize_iteration ); auto ranks_per_dim = partitioner.ranksPerDimension( comm, global_num_cell ); auto ave_partition = diff --git a/cajita/src/Cajita.hpp b/cajita/src/Cajita.hpp index 2b39bf147..b825eb236 100644 --- 
a/cajita/src/Cajita.hpp +++ b/cajita/src/Cajita.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -34,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index 985d32eb6..e93ad8124 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -29,10 +29,11 @@ namespace Cajita { //---------------------------------------------------------------------------// /*! - Dynamic mesh block partitioner. (Current Version: Support 3D only) - \tparam Device Kokkos device type. - \tparam CellPerTileDim Cells per tile per dimension. - \tparam NumSpaceDim Dimemsion (The current version support 3D only) + Dynamic mesh block partitioner. (Current Version: Support 3D only) There + should be no instantiation for this class without implementing any workload + computation. \tparam Device Kokkos device type. \tparam CellPerTileDim Cells + per tile per dimension. \tparam NumSpaceDim Dimemsion (The current version + support 3D only) */ template @@ -335,81 +336,6 @@ class DynamicPartitioner : public BlockPartitioner Kokkos::deep_copy( _workload_prefix_sum, 0 ); } - /*! - \brief compute the workload in the current MPI rank from particle - positions (each particle count for 1 workload value) - \param view particle positions view - \param particle_num total particle number - \param global_lower_corner the coordinate of the domain global lower - corner - \param dx cell dx size - \param comm MPI communicator used for workload reduction - */ - template - void setLocalWorkloadByParticles( const ParticlePosViewType& view, - int particle_num, - const ArrayType& global_lower_corner, - const CellUnit dx, MPI_Comm comm ) - { - resetWorkload(); - // make a local copy - auto workload = _workload_per_tile; - Kokkos::Array lower_corner; - for ( std::size_t d = 0; d < num_space_dim; ++d ) - { - lower_corner[d] = global_lower_corner[d]; - } - - Kokkos::parallel_for( - "compute_local_workload_parpos", - Kokkos::RangePolicy( 0, particle_num ), - KOKKOS_LAMBDA( const int i ) { - int ti = static_cast( - ( view( i, 0 ) - lower_corner[0] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - int tj = static_cast( - ( view( i, 1 ) - lower_corner[1] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - int tz = static_cast( - ( view( i, 2 ) - lower_corner[2] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tz + 1 ) ); - } ); - Kokkos::fence(); - // Wait for other ranks' workload to be ready - MPI_Barrier( comm ); - } - - /*! - \brief compute the workload in the current MPI rank from sparseMap - (the workload of a tile is 1 if the tile is occupied, 0 otherwise) - \param sparseMap sparseMap in the current rank - \param comm MPI communicator used for workload reduction - */ - template - void setLocalWorkloadBySparseMap( const SparseMapType& sparseMap, MPI_Comm comm ) - { - resetWorkload(); - // make a local copy - auto workload = _workload_per_tile; - Kokkos::parallel_for( - "compute_local_workload_sparsmap", - Kokkos::RangePolicy( 0, sparseMap.capacity() ), - KOKKOS_LAMBDA( uint32_t i ) { - if ( sparseMap.valid_at( i ) ) - { - auto key = sparseMap.key_at( i ); - int ti, tj, tk; - sparseMap.key2ijk( key, ti, tj, tk ); - Kokkos::atomic_increment( - &workload( ti + 1, tj + 1, tk + 1 ) ); - } - } ); - Kokkos::fence(); - // Wait for other ranks' workload to be ready - MPI_Barrier( comm ); - } - /*! \brief 1. 
reduce the total workload in all MPI ranks; 2. compute the workload prefix sum matrix for all MPI ranks @@ -601,8 +527,7 @@ class DynamicPartitioner : public BlockPartitioner int k = static_cast( jnk % rank_k ); int current_workload = compute_sub_workload( di, last_point, point_i, dj, j, dk, k ); - auto wl = - current_workload - ave_workload( jnk ); + auto wl = current_workload - ave_workload( jnk ); // compute absolute diff (rather than squares to // avoid potential overflow) // TODO: update when Kokkos::abs() available @@ -806,6 +731,7 @@ class DynamicPartitioner : public BlockPartitioner // max_optimize iterations int _max_optimize_iteration; + protected: // represent the rectangle partition in each dimension // with form [0, p_1, ..., p_n, cell_num], n = rank num in current // dimension, partition in this dimension would be [0, p_1), [p_1, p_2) ... @@ -836,6 +762,225 @@ class DynamicPartitioner : public BlockPartitioner ( global_cells_per_dim[2] >> cell_bits_per_tile_dim ) + 1 ); } }; + +/*! + Dynamic mesh block partitioner. (Current Version: Support 3D only) Workload + are computed from particle distribution. + + \tparam Device Kokkos device type. + \tparam CellPerTileDim Cells per tile per dimension. + \tparam NumSpaceDim Dimemsion (The current version support 3D only) +*/ +template +class ParticleDynamicPartitioner + : public DynamicPartitioner +{ + using base = DynamicPartitioner; + + protected: + using base::_workload_per_tile; + + public: + using base::cell_bits_per_tile_dim; + using base::num_space_dim; + using typename base::execution_space; + + /*! + \brief Constructor - automatically compute ranks_per_dim from MPI + communicator + \param comm MPI communicator to decide the rank nums in each dimension + \param max_workload_coeff threshold factor for re-partition + \param workload_num total workload(particle/tile) number, used to compute + workload_threshold + \param num_step_rebalance the simulation step number after which one + should check if repartition is needed + \param global_cells_per_dim 3D array, global cells in each dimension + \param max_optimize_iteration max iteration number to run the optimization + */ + ParticleDynamicPartitioner( + MPI_Comm comm, float max_workload_coeff, int workload_num, + int num_step_rebalance, + const std::array& global_cells_per_dim, + int max_optimize_iteration = 10 ) + : base( comm, max_workload_coeff, workload_num, num_step_rebalance, + global_cells_per_dim, max_optimize_iteration ) + { + } + + /*! + \brief Constructor - user-defined ranks_per_dim + communicator + \param comm MPI communicator to decide the rank nums in each dimension + \param max_workload_coeff threshold factor for re-partition + \param workload_num total workload(particle/tile) number, used to compute + workload_threshold + \param num_step_rebalance the simulation step number after which one + should check if repartition is needed + \param ranks_per_dim 3D array, user-defined MPI rank constrains in per + dimension + \param global_cells_per_dim 3D array, global cells in each dimension + \param max_optimize_iteration max iteration number to run the optimization + */ + ParticleDynamicPartitioner( + MPI_Comm comm, float max_workload_coeff, int workload_num, + int num_step_rebalance, + const std::array& ranks_per_dim, + const std::array& global_cells_per_dim, + int max_optimize_iteration = 10 ) + : base( comm, max_workload_coeff, workload_num, num_step_rebalance, + max_optimize_iteration ) + { + } + + /*! 
+ \brief compute the workload in the current MPI rank from particle + positions (each particle count for 1 workload value). This function must + be called before running optimizePartition() \param view particle + positions view \param particle_num total particle number \param + global_lower_corner the coordinate of the domain global lower corner + \param dx cell dx size + \param comm MPI communicator used for workload reduction + */ + template + void setLocalWorkloadByParticles( const ParticlePosViewType& view, + int particle_num, + const ArrayType& global_lower_corner, + const CellUnit dx, MPI_Comm comm ) + { + base::resetWorkload(); + // make a local copy + auto workload = _workload_per_tile; + Kokkos::Array lower_corner; + for ( std::size_t d = 0; d < num_space_dim; ++d ) + { + lower_corner[d] = global_lower_corner[d]; + } + + Kokkos::parallel_for( + "compute_local_workload_parpos", + Kokkos::RangePolicy( 0, particle_num ), + KOKKOS_LAMBDA( const int i ) { + int ti = static_cast( + ( view( i, 0 ) - lower_corner[0] ) / dx - 0.5 ) >> + cell_bits_per_tile_dim; + int tj = static_cast( + ( view( i, 1 ) - lower_corner[1] ) / dx - 0.5 ) >> + cell_bits_per_tile_dim; + int tz = static_cast( + ( view( i, 2 ) - lower_corner[2] ) / dx - 0.5 ) >> + cell_bits_per_tile_dim; + Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tz + 1 ) ); + } ); + Kokkos::fence(); + // Wait for other ranks' workload to be ready + MPI_Barrier( comm ); + } +}; + +/*! + Dynamic mesh block partitioner. (Current Version: Support 3D only) Workload + are computed from sparse map occupancy. + + \tparam Device Kokkos device type. + \tparam CellPerTileDim Cells per tile per dimension. + \tparam NumSpaceDim Dimemsion (The current version support 3D only) +*/ +template +class SparseMapDynamicPartitioner + : public DynamicPartitioner +{ + using base = DynamicPartitioner; + + protected: + using base::_workload_per_tile; + + public: + using base::cell_bits_per_tile_dim; + using base::num_space_dim; + using typename base::execution_space; + + /*! + \brief Constructor - automatically compute ranks_per_dim from MPI + communicator + \param comm MPI communicator to decide the rank nums in each dimension + \param max_workload_coeff threshold factor for re-partition + \param workload_num total workload(particle/tile) number, used to compute + workload_threshold + \param num_step_rebalance the simulation step number after which one + should check if repartition is needed + \param global_cells_per_dim 3D array, global cells in each dimension + \param max_optimize_iteration max iteration number to run the optimization + */ + SparseMapDynamicPartitioner( + MPI_Comm comm, float max_workload_coeff, int workload_num, + int num_step_rebalance, + const std::array& global_cells_per_dim, + int max_optimize_iteration = 10 ) + : base( comm, max_workload_coeff, workload_num, num_step_rebalance, + global_cells_per_dim, max_optimize_iteration ) + { + } + + /*! 
+ \brief Constructor - user-defined ranks_per_dim + communicator + \param comm MPI communicator to decide the rank nums in each dimension + \param max_workload_coeff threshold factor for re-partition + \param workload_num total workload(particle/tile) number, used to compute + workload_threshold + \param num_step_rebalance the simulation step number after which one + should check if repartition is needed + \param ranks_per_dim 3D array, user-defined MPI rank constrains in per + dimension + \param global_cells_per_dim 3D array, global cells in each dimension + \param max_optimize_iteration max iteration number to run the optimization + */ + SparseMapDynamicPartitioner( + MPI_Comm comm, float max_workload_coeff, int workload_num, + int num_step_rebalance, + const std::array& ranks_per_dim, + const std::array& global_cells_per_dim, + int max_optimize_iteration = 10 ) + : base( comm, max_workload_coeff, workload_num, num_step_rebalance, + max_optimize_iteration ) + { + } + + /*! + \brief compute the workload in the current MPI rank from sparseMap + (the workload of a tile is 1 if the tile is occupied, 0 otherwise). This + function must be called before running optimizePartition() \param + sparseMap sparseMap in the current rank \param comm MPI communicator used + for workload reduction + */ + template + void setLocalWorkloadBySparseMap( const SparseMapType& sparseMap, + MPI_Comm comm ) + { + base::resetWorkload(); + // make a local copy + auto workload = _workload_per_tile; + Kokkos::parallel_for( + "compute_local_workload_sparsmap", + Kokkos::RangePolicy( 0, sparseMap.capacity() ), + KOKKOS_LAMBDA( uint32_t i ) { + if ( sparseMap.valid_at( i ) ) + { + auto key = sparseMap.key_at( i ); + int ti, tj, tk; + sparseMap.key2ijk( key, ti, tj, tk ); + Kokkos::atomic_increment( + &workload( ti + 1, tj + 1, tk + 1 ) ); + } + } ); + Kokkos::fence(); + // Wait for other ranks' workload to be ready + MPI_Barrier( comm ); + } +}; + } // end namespace Cajita #endif // end CAJITA_DYNAMICPARTITIONER_HPP diff --git a/cajita/unit_test/tstDynamicPartitioner.hpp b/cajita/unit_test/tstDynamicPartitioner.hpp index b27db6108..818fc05e7 100644 --- a/cajita/unit_test/tstDynamicPartitioner.hpp +++ b/cajita/unit_test/tstDynamicPartitioner.hpp @@ -47,7 +47,7 @@ void uniform_distribution_automatic_rank() size_tile_per_dim * cell_per_tile_dim }; // partitioner - DynamicPartitioner partitioner( + SparseMapDynamicPartitioner partitioner( MPI_COMM_WORLD, max_workload_coeff, workload_num, num_step_rebalance, global_cells_per_dim, max_optimize_iteration ); @@ -287,11 +287,9 @@ auto generate_random_particles( truth partition ) \param occupy_num_per_rank the tile number that will be registered on each MPI rank - \param use_tile2workload indicate the source to compute the workload on MPI - ranks, true if using tile occupation while false if using particle positions */ -void random_distribution_automatic_rank( int occupy_num_per_rank, - bool use_tile2workload = true ) +template +void random_distribution_automatic_rank( int occupy_num_per_rank ) { // define the domain size constexpr int size_tile_per_dim = 32; @@ -310,9 +308,13 @@ void random_distribution_automatic_rank( int occupy_num_per_rank, size_per_dim }; // partitioner - DynamicPartitioner partitioner( - MPI_COMM_WORLD, max_workload_coeff, particle_num, num_step_rebalance, - global_cells_per_dim, max_optimize_iteration ); + typename std::conditional< + use_tile2workload, + SparseMapDynamicPartitioner, + ParticleDynamicPartitioner>::type + partitioner( MPI_COMM_WORLD, 
max_workload_coeff, particle_num, + num_step_rebalance, global_cells_per_dim, + max_optimize_iteration ); // check the value of some pre-computed constants auto cbptd = partitioner.cell_bits_per_tile_dim; @@ -429,7 +431,10 @@ void random_distribution_automatic_rank( int occupy_num_per_rank, Kokkos::fence(); // compute workload from a sparseMap and do partition optimization - partitioner.setLocalWorkloadBySparseMap( sis, MPI_COMM_WORLD ); + reinterpret_cast< + SparseMapDynamicPartitioner&>( + partitioner ) + .setLocalWorkloadBySparseMap( sis, MPI_COMM_WORLD ); partitioner.optimizePartition( MPI_COMM_WORLD ); } // use particle positions to compute teh workload on MPI ranks @@ -440,9 +445,12 @@ void random_distribution_automatic_rank( int occupy_num_per_rank, gt_partition, cart_rank, occupy_num_per_rank, global_low_corner, cell_size, cell_per_tile_dim ); // compute workload from a particle view and do partition optimization - partitioner.setLocalWorkloadByParticles( particle_view, occupy_num_per_rank, - global_low_corner, cell_size, - MPI_COMM_WORLD ); + reinterpret_cast< + ParticleDynamicPartitioner&>( + partitioner ) + .setLocalWorkloadByParticles( particle_view, occupy_num_per_rank, + global_low_corner, cell_size, + MPI_COMM_WORLD ); partitioner.optimizePartition( MPI_COMM_WORLD ); } @@ -467,11 +475,11 @@ TEST( sparse_dim_partitioner, sparse_dim_partitioner_uniform_test ) } TEST( sparse_dim_partitioner, sparse_dim_partitioner_random_tile_test ) { - random_distribution_automatic_rank( 32, true ); + random_distribution_automatic_rank( 32 ); } TEST( sparse_dim_partitioner, sparse_dim_partitioner_random_par_test ) { - random_distribution_automatic_rank( 50, false ); + random_distribution_automatic_rank( 50 ); } //---------------------------------------------------------------------------// } // end namespace Test From 4273392cee9b75d13eaf56a2fc0b95785c2fed45 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Tue, 21 Jun 2022 09:20:52 -0700 Subject: [PATCH 09/42] Fix compile --- cajita/src/Cajita_DynamicPartitioner.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index e93ad8124..f6df34225 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -829,7 +829,7 @@ class ParticleDynamicPartitioner const std::array& global_cells_per_dim, int max_optimize_iteration = 10 ) : base( comm, max_workload_coeff, workload_num, num_step_rebalance, - max_optimize_iteration ) + ranks_per_dim, global_cells_per_dim, max_optimize_iteration ) { } @@ -944,7 +944,7 @@ class SparseMapDynamicPartitioner const std::array& global_cells_per_dim, int max_optimize_iteration = 10 ) : base( comm, max_workload_coeff, workload_num, num_step_rebalance, - max_optimize_iteration ) + ranks_per_dim, global_cells_per_dim, max_optimize_iteration ) { } From 562547fb99c48cd6a626e0d19047ded6c3af7a8f Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Thu, 23 Jun 2022 09:07:31 -0700 Subject: [PATCH 10/42] Fix tstDynamicPartitioner --- cajita/unit_test/tstDynamicPartitioner.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cajita/unit_test/tstDynamicPartitioner.hpp b/cajita/unit_test/tstDynamicPartitioner.hpp index 818fc05e7..05605322e 100644 --- a/cajita/unit_test/tstDynamicPartitioner.hpp +++ b/cajita/unit_test/tstDynamicPartitioner.hpp @@ -431,10 +431,10 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) Kokkos::fence(); // compute workload 
from a sparseMap and do partition optimization - reinterpret_cast< - SparseMapDynamicPartitioner&>( - partitioner ) - .setLocalWorkloadBySparseMap( sis, MPI_COMM_WORLD ); + dynamic_cast< + SparseMapDynamicPartitioner*>( + &partitioner ) + ->setLocalWorkloadBySparseMap( sis, MPI_COMM_WORLD ); partitioner.optimizePartition( MPI_COMM_WORLD ); } // use particle positions to compute teh workload on MPI ranks @@ -445,10 +445,10 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) gt_partition, cart_rank, occupy_num_per_rank, global_low_corner, cell_size, cell_per_tile_dim ); // compute workload from a particle view and do partition optimization - reinterpret_cast< - ParticleDynamicPartitioner&>( - partitioner ) - .setLocalWorkloadByParticles( particle_view, occupy_num_per_rank, + dynamic_cast< + ParticleDynamicPartitioner*>( + &partitioner ) + ->setLocalWorkloadByParticles( particle_view, occupy_num_per_rank, global_low_corner, cell_size, MPI_COMM_WORLD ); partitioner.optimizePartition( MPI_COMM_WORLD ); From 04f615e0d41f0946bd6f93140e361cd4098b40c8 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Mon, 27 Jun 2022 22:02:40 -0700 Subject: [PATCH 11/42] Use base::base --- cajita/src/Cajita_DynamicPartitioner.hpp | 96 +----------------------- 1 file changed, 2 insertions(+), 94 deletions(-) diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index f6df34225..0509a9fc9 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -777,6 +777,7 @@ class ParticleDynamicPartitioner : public DynamicPartitioner { using base = DynamicPartitioner; + using base::base; protected: using base::_workload_per_tile; @@ -786,53 +787,6 @@ class ParticleDynamicPartitioner using base::num_space_dim; using typename base::execution_space; - /*! - \brief Constructor - automatically compute ranks_per_dim from MPI - communicator - \param comm MPI communicator to decide the rank nums in each dimension - \param max_workload_coeff threshold factor for re-partition - \param workload_num total workload(particle/tile) number, used to compute - workload_threshold - \param num_step_rebalance the simulation step number after which one - should check if repartition is needed - \param global_cells_per_dim 3D array, global cells in each dimension - \param max_optimize_iteration max iteration number to run the optimization - */ - ParticleDynamicPartitioner( - MPI_Comm comm, float max_workload_coeff, int workload_num, - int num_step_rebalance, - const std::array& global_cells_per_dim, - int max_optimize_iteration = 10 ) - : base( comm, max_workload_coeff, workload_num, num_step_rebalance, - global_cells_per_dim, max_optimize_iteration ) - { - } - - /*! 
- \brief Constructor - user-defined ranks_per_dim - communicator - \param comm MPI communicator to decide the rank nums in each dimension - \param max_workload_coeff threshold factor for re-partition - \param workload_num total workload(particle/tile) number, used to compute - workload_threshold - \param num_step_rebalance the simulation step number after which one - should check if repartition is needed - \param ranks_per_dim 3D array, user-defined MPI rank constrains in per - dimension - \param global_cells_per_dim 3D array, global cells in each dimension - \param max_optimize_iteration max iteration number to run the optimization - */ - ParticleDynamicPartitioner( - MPI_Comm comm, float max_workload_coeff, int workload_num, - int num_step_rebalance, - const std::array& ranks_per_dim, - const std::array& global_cells_per_dim, - int max_optimize_iteration = 10 ) - : base( comm, max_workload_coeff, workload_num, num_step_rebalance, - ranks_per_dim, global_cells_per_dim, max_optimize_iteration ) - { - } - /*! \brief compute the workload in the current MPI rank from particle positions (each particle count for 1 workload value). This function must @@ -892,6 +846,7 @@ class SparseMapDynamicPartitioner : public DynamicPartitioner { using base = DynamicPartitioner; + using base::base; protected: using base::_workload_per_tile; @@ -901,53 +856,6 @@ class SparseMapDynamicPartitioner using base::num_space_dim; using typename base::execution_space; - /*! - \brief Constructor - automatically compute ranks_per_dim from MPI - communicator - \param comm MPI communicator to decide the rank nums in each dimension - \param max_workload_coeff threshold factor for re-partition - \param workload_num total workload(particle/tile) number, used to compute - workload_threshold - \param num_step_rebalance the simulation step number after which one - should check if repartition is needed - \param global_cells_per_dim 3D array, global cells in each dimension - \param max_optimize_iteration max iteration number to run the optimization - */ - SparseMapDynamicPartitioner( - MPI_Comm comm, float max_workload_coeff, int workload_num, - int num_step_rebalance, - const std::array& global_cells_per_dim, - int max_optimize_iteration = 10 ) - : base( comm, max_workload_coeff, workload_num, num_step_rebalance, - global_cells_per_dim, max_optimize_iteration ) - { - } - - /*! - \brief Constructor - user-defined ranks_per_dim - communicator - \param comm MPI communicator to decide the rank nums in each dimension - \param max_workload_coeff threshold factor for re-partition - \param workload_num total workload(particle/tile) number, used to compute - workload_threshold - \param num_step_rebalance the simulation step number after which one - should check if repartition is needed - \param ranks_per_dim 3D array, user-defined MPI rank constrains in per - dimension - \param global_cells_per_dim 3D array, global cells in each dimension - \param max_optimize_iteration max iteration number to run the optimization - */ - SparseMapDynamicPartitioner( - MPI_Comm comm, float max_workload_coeff, int workload_num, - int num_step_rebalance, - const std::array& ranks_per_dim, - const std::array& global_cells_per_dim, - int max_optimize_iteration = 10 ) - : base( comm, max_workload_coeff, workload_num, num_step_rebalance, - ranks_per_dim, global_cells_per_dim, max_optimize_iteration ) - { - } - /*! \brief compute the workload in the current MPI rank from sparseMap (the workload of a tile is 1 if the tile is occupied, 0 otherwise). 
This From d7ed9551589780d2ab6ca2314059c9aee4a95a4e Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Tue, 28 Jun 2022 10:16:20 -0700 Subject: [PATCH 12/42] Separate test files --- cajita/unit_test/CMakeLists.txt | 3 +- .../tstParticleDynamicPartitioner.hpp | 238 ++++++++++++++++++ ...hpp => tstSparseMapDynamicPartitioner.hpp} | 143 ++--------- 3 files changed, 266 insertions(+), 118 deletions(-) create mode 100644 cajita/unit_test/tstParticleDynamicPartitioner.hpp rename cajita/unit_test/{tstDynamicPartitioner.hpp => tstSparseMapDynamicPartitioner.hpp} (74%) diff --git a/cajita/unit_test/CMakeLists.txt b/cajita/unit_test/CMakeLists.txt index 535f52a0a..a8baab468 100644 --- a/cajita/unit_test/CMakeLists.txt +++ b/cajita/unit_test/CMakeLists.txt @@ -37,7 +37,8 @@ set(MPI_TESTS Interpolation2d BovWriter Parallel - DynamicPartitioner + ParticleDynamicPartitioner + SparseMapDynamicPartitioner Partitioner SparseArray ) diff --git a/cajita/unit_test/tstParticleDynamicPartitioner.hpp b/cajita/unit_test/tstParticleDynamicPartitioner.hpp new file mode 100644 index 000000000..0f8e71f41 --- /dev/null +++ b/cajita/unit_test/tstParticleDynamicPartitioner.hpp @@ -0,0 +1,238 @@ +/**************************************************************************** + * Copyright (c) 2018-2022 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include +#include + +#include +#include +#include + +#include + +using namespace Cajita; + +namespace Test +{ + +auto generate_random_particles( + const std::array, 3>& gt_partition, + const Kokkos::Array& cart_rank, int occupy_par_num_per_rank, + const std::array global_low_corner, double dx, + int cell_num_per_tile_dim ) -> Kokkos::View +{ + std::set> par_set; + + double start[3], size[3]; + for ( int d = 0; d < 3; ++d ) + { + start[d] = + ( gt_partition[d][cart_rank[d]] * cell_num_per_tile_dim + 0.5 ) * + dx + + global_low_corner[d]; + + size[d] = + ( ( gt_partition[d][cart_rank[d] + 1] * cell_num_per_tile_dim ) - + ( gt_partition[d][cart_rank[d]] * cell_num_per_tile_dim ) ) * + dx; + } + // insert the corner tiles to the set, to ensure the uniqueness of the + // ground truth partition + par_set.insert( + { start[0] + 0.01 * dx, start[1] + 0.01 * dx, start[2] + 0.01 * dx } ); + par_set.insert( { + start[0] + size[0] - dx - 0.01 * dx, + start[1] + size[1] - dx - 0.01 * dx, + start[2] + size[2] - dx - 0.01 * dx, + } ); + + // insert random tiles to the set + while ( static_cast( par_set.size() ) < occupy_par_num_per_rank ) + { + double rand_offset[3]; + for ( int d = 0; d < 3; ++d ) + rand_offset[d] = (double)std::rand() / RAND_MAX; + par_set.insert( { start[0] + rand_offset[0] * ( size[0] - dx ), + start[1] + rand_offset[1] * ( size[1] - dx ), + start[2] + rand_offset[2] * ( size[2] - dx ) } ); + } + + // particle_set => particle view (host) + typedef typename TEST_EXECSPACE::array_layout layout; + Kokkos::View par_view_host( + "particle_view_host", par_set.size() ); + int i = 0; + for ( auto it = par_set.begin(); it != par_set.end(); ++it ) + { + for ( int d = 0; d < 3; ++d ) + par_view_host( i, d ) = ( *it )[d]; + i++; + } + + // create tiles view on device + Kokkos::View par_view = + Kokkos::create_mirror_view_and_copy( TEST_MEMSPACE(), 
par_view_host ); + return par_view; +} + +/*! + \brief In this test, the ground truth partition is first randomly chosen, then + a given number of tiles are regiestered on each rank (the most bottom-left and + top-right tiles are always registered to ensure the uniqueness of the ground + truth partition ) + \param occupy_num_per_rank the tile number that will be registered on each MPI + rank +*/ +void random_distribution_automatic_rank( int occupy_num_per_rank ) +{ + // define the domain size + constexpr int size_tile_per_dim = 32; + constexpr int cell_per_tile_dim = 4; + constexpr int size_per_dim = size_tile_per_dim * cell_per_tile_dim; + constexpr int total_size = size_per_dim * size_per_dim * size_per_dim; + srand( time( 0 ) ); + + // some settings for partitioner + float max_workload_coeff = 1.5; + int particle_num = total_size; + int num_step_rebalance = 100; + int max_optimize_iteration = 10; + + std::array global_cells_per_dim = { size_per_dim, size_per_dim, + size_per_dim }; + + // partitioner + ParticleDynamicPartitioner partitioner( + MPI_COMM_WORLD, max_workload_coeff, particle_num, num_step_rebalance, + global_cells_per_dim, max_optimize_iteration ); + + // check the value of some pre-computed constants + auto cbptd = partitioner.cell_bits_per_tile_dim; + EXPECT_EQ( cbptd, 2 ); + + auto cnptd = partitioner.cell_num_per_tile_dim; + EXPECT_EQ( cnptd, 4 ); + + // ranks per dim test + auto ranks_per_dim = + partitioner.ranksPerDimension( MPI_COMM_WORLD, global_cells_per_dim ); + + EXPECT_EQ( ranks_per_dim[0] >= 1, true ); + EXPECT_EQ( ranks_per_dim[1] >= 1, true ); + EXPECT_EQ( ranks_per_dim[2] >= 1, true ); + + // compute the rank ID + Kokkos::Array cart_rank; + std::array periodic_dims = { 0, 0, 0 }; + int reordered_cart_ranks = 1; + MPI_Comm cart_comm; + MPI_Cart_create( MPI_COMM_WORLD, 3, ranks_per_dim.data(), + periodic_dims.data(), reordered_cart_ranks, &cart_comm ); + int linear_rank; + MPI_Comm_rank( cart_comm, &linear_rank ); + MPI_Cart_coords( cart_comm, linear_rank, 3, cart_rank.data() ); + + // generate random ground truth partition on the root rank + std::array, 3> gt_partition_set; + std::array, 3> gt_partition; + int world_rank, world_size; + MPI_Comm_rank( MPI_COMM_WORLD, &world_rank ); + MPI_Comm_size( MPI_COMM_WORLD, &world_size ); + for ( int d = 0; d < 3; ++d ) + { + gt_partition[d].resize( ranks_per_dim[d] + 1 ); + } + + if ( world_rank == 0 ) + { + for ( int d = 0; d < 3; ++d ) + { + gt_partition_set[d].insert( 0 ); + while ( static_cast( gt_partition_set[d].size() ) < + ranks_per_dim[d] ) + { + int rand_num = std::rand() % size_tile_per_dim; + gt_partition_set[d].insert( rand_num ); + } + gt_partition_set[d].insert( size_tile_per_dim ); + int i = 0; + for ( auto it = gt_partition_set[d].begin(); + it != gt_partition_set[d].end(); ++it ) + { + gt_partition[d][i++] = *it; + } + } + } + + // broadcast the ground truth partition to all ranks + for ( int d = 0; d < 3; ++d ) + { + MPI_Barrier( MPI_COMM_WORLD ); + MPI_Bcast( gt_partition[d].data(), gt_partition[d].size(), MPI_INT, 0, + MPI_COMM_WORLD ); + MPI_Barrier( MPI_COMM_WORLD ); + } + + // init partitions (average partition) + std::array, 3> rec_partitions; + for ( int d = 0; d < 3; ++d ) + { + int ele = size_tile_per_dim / ranks_per_dim[d]; + int part = 0; + for ( int i = 0; i < ranks_per_dim[d]; ++i ) + { + rec_partitions[d].push_back( part ); + part += ele; + } + rec_partitions[d].push_back( size_tile_per_dim ); + } + + partitioner.initializeRecPartition( rec_partitions[0], rec_partitions[1], + 
rec_partitions[2] ); + + // basic settings for domain size and position + double cell_size = 0.1; + std::array global_low_corner = { 1.2, 3.3, -2.8 }; + + // randomly generate a fixed number of particles on each MPI rank + auto particle_view = generate_random_particles( + gt_partition, cart_rank, occupy_num_per_rank, global_low_corner, + cell_size, cell_per_tile_dim ); + // compute workload from a particle view and do partition optimization + dynamic_cast*>( + &partitioner ) + ->setLocalWorkloadByParticles( particle_view, occupy_num_per_rank, + global_low_corner, cell_size, + MPI_COMM_WORLD ); + partitioner.optimizePartition( MPI_COMM_WORLD ); + + // check results (should be the same as the gt_partition) + auto part = partitioner.getCurrentPartition(); + for ( int d = 0; d < 3; ++d ) + { + for ( int id = 0; id < ranks_per_dim[d] + 1; id++ ) + EXPECT_EQ( part[d][id], gt_partition[d][id] ); + } + + auto imbalance_factor = partitioner.computeImbalanceFactor( cart_comm ); + EXPECT_FLOAT_EQ( imbalance_factor, 1.0f ); +} + +//---------------------------------------------------------------------------// +// RUN TESTS +//---------------------------------------------------------------------------// +TEST( sparse_dim_partitioner, sparse_dim_partitioner_random_par_test ) +{ + random_distribution_automatic_rank( 50 ); +} +//---------------------------------------------------------------------------// +} // end namespace Test diff --git a/cajita/unit_test/tstDynamicPartitioner.hpp b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp similarity index 74% rename from cajita/unit_test/tstDynamicPartitioner.hpp rename to cajita/unit_test/tstSparseMapDynamicPartitioner.hpp index 05605322e..063ae8e95 100644 --- a/cajita/unit_test/tstDynamicPartitioner.hpp +++ b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp @@ -220,66 +220,6 @@ auto generate_random_tiles( const std::array, 3>& gt_partition, return tiles_view; } -auto generate_random_particles( - const std::array, 3>& gt_partition, - const Kokkos::Array& cart_rank, int occupy_par_num_per_rank, - const std::array global_low_corner, double dx, - int cell_num_per_tile_dim ) -> Kokkos::View -{ - std::set> par_set; - - double start[3], size[3]; - for ( int d = 0; d < 3; ++d ) - { - start[d] = - ( gt_partition[d][cart_rank[d]] * cell_num_per_tile_dim + 0.5 ) * - dx + - global_low_corner[d]; - - size[d] = - ( ( gt_partition[d][cart_rank[d] + 1] * cell_num_per_tile_dim ) - - ( gt_partition[d][cart_rank[d]] * cell_num_per_tile_dim ) ) * - dx; - } - // insert the corner tiles to the set, to ensure the uniqueness of the - // ground truth partition - par_set.insert( - { start[0] + 0.01 * dx, start[1] + 0.01 * dx, start[2] + 0.01 * dx } ); - par_set.insert( { - start[0] + size[0] - dx - 0.01 * dx, - start[1] + size[1] - dx - 0.01 * dx, - start[2] + size[2] - dx - 0.01 * dx, - } ); - - // insert random tiles to the set - while ( static_cast( par_set.size() ) < occupy_par_num_per_rank ) - { - double rand_offset[3]; - for ( int d = 0; d < 3; ++d ) - rand_offset[d] = (double)std::rand() / RAND_MAX; - par_set.insert( { start[0] + rand_offset[0] * ( size[0] - dx ), - start[1] + rand_offset[1] * ( size[1] - dx ), - start[2] + rand_offset[2] * ( size[2] - dx ) } ); - } - - // particle_set => particle view (host) - typedef typename TEST_EXECSPACE::array_layout layout; - Kokkos::View par_view_host( - "particle_view_host", par_set.size() ); - int i = 0; - for ( auto it = par_set.begin(); it != par_set.end(); ++it ) - { - for ( int d = 0; d < 3; ++d ) - par_view_host( i, d ) = 
( *it )[d]; - i++; - } - - // create tiles view on device - Kokkos::View par_view = - Kokkos::create_mirror_view_and_copy( TEST_MEMSPACE(), par_view_host ); - return par_view; -} - /*! \brief In this test, the ground truth partition is first randomly chosen, then a given number of tiles are regiestered on each rank (the most bottom-left and @@ -288,7 +228,6 @@ auto generate_random_particles( \param occupy_num_per_rank the tile number that will be registered on each MPI rank */ -template void random_distribution_automatic_rank( int occupy_num_per_rank ) { // define the domain size @@ -308,13 +247,9 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) size_per_dim }; // partitioner - typename std::conditional< - use_tile2workload, - SparseMapDynamicPartitioner, - ParticleDynamicPartitioner>::type - partitioner( MPI_COMM_WORLD, max_workload_coeff, particle_num, - num_step_rebalance, global_cells_per_dim, - max_optimize_iteration ); + SparseMapDynamicPartitioner partitioner( + MPI_COMM_WORLD, max_workload_coeff, particle_num, num_step_rebalance, + global_cells_per_dim, max_optimize_iteration ); // check the value of some pre-computed constants auto cbptd = partitioner.cell_bits_per_tile_dim; @@ -409,50 +344,28 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) global_low_corner[1] + cell_size * global_cells_per_dim[1], global_low_corner[2] + cell_size * global_cells_per_dim[2] }; - // use tile occupization info to compute the workload on MPI ranks - if ( use_tile2workload ) - { - // randomly generate a fixed number of tiles on every MPI rank - auto tiles_view = generate_random_tiles( - gt_partition, cart_rank, size_tile_per_dim, occupy_num_per_rank ); - // create a new sparseMap - auto global_mesh = createSparseGlobalMesh( - global_low_corner, global_high_corner, global_cells_per_dim ); - auto sis = - createSparseMap( global_mesh, pre_alloc_size ); - // register selected tiles to the sparseMap - Kokkos::parallel_for( - "insert_tile_to_sparse_map", - Kokkos::RangePolicy( 0, tiles_view.extent( 0 ) ), - KOKKOS_LAMBDA( int id ) { - sis.insertTile( tiles_view( id, 0 ), tiles_view( id, 1 ), - tiles_view( id, 2 ) ); - } ); - Kokkos::fence(); - - // compute workload from a sparseMap and do partition optimization - dynamic_cast< - SparseMapDynamicPartitioner*>( - &partitioner ) - ->setLocalWorkloadBySparseMap( sis, MPI_COMM_WORLD ); - partitioner.optimizePartition( MPI_COMM_WORLD ); - } - // use particle positions to compute teh workload on MPI ranks - else - { - // randomly generate a fixed number of particles on each MPI rank - auto particle_view = generate_random_particles( - gt_partition, cart_rank, occupy_num_per_rank, global_low_corner, - cell_size, cell_per_tile_dim ); - // compute workload from a particle view and do partition optimization - dynamic_cast< - ParticleDynamicPartitioner*>( - &partitioner ) - ->setLocalWorkloadByParticles( particle_view, occupy_num_per_rank, - global_low_corner, cell_size, - MPI_COMM_WORLD ); - partitioner.optimizePartition( MPI_COMM_WORLD ); - } + // randomly generate a fixed number of tiles on every MPI rank + auto tiles_view = generate_random_tiles( + gt_partition, cart_rank, size_tile_per_dim, occupy_num_per_rank ); + // create a new sparseMap + auto global_mesh = createSparseGlobalMesh( + global_low_corner, global_high_corner, global_cells_per_dim ); + auto sis = createSparseMap( global_mesh, pre_alloc_size ); + // register selected tiles to the sparseMap + Kokkos::parallel_for( + "insert_tile_to_sparse_map", + 
Kokkos::RangePolicy( 0, tiles_view.extent( 0 ) ), + KOKKOS_LAMBDA( int id ) { + sis.insertTile( tiles_view( id, 0 ), tiles_view( id, 1 ), + tiles_view( id, 2 ) ); + } ); + Kokkos::fence(); + + // compute workload from a sparseMap and do partition optimization + dynamic_cast*>( + &partitioner ) + ->setLocalWorkloadBySparseMap( sis, MPI_COMM_WORLD ); + partitioner.optimizePartition( MPI_COMM_WORLD ); // check results (should be the same as the gt_partition) auto part = partitioner.getCurrentPartition(); @@ -475,11 +388,7 @@ TEST( sparse_dim_partitioner, sparse_dim_partitioner_uniform_test ) } TEST( sparse_dim_partitioner, sparse_dim_partitioner_random_tile_test ) { - random_distribution_automatic_rank( 32 ); -} -TEST( sparse_dim_partitioner, sparse_dim_partitioner_random_par_test ) -{ - random_distribution_automatic_rank( 50 ); + random_distribution_automatic_rank( 32 ); } //---------------------------------------------------------------------------// } // end namespace Test From eca5b0d5f292f92603e79e035e86b2f9d40103f4 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Tue, 28 Jun 2022 10:25:14 -0700 Subject: [PATCH 13/42] Separate class files --- cajita/src/Cajita_DynamicPartitioner.hpp | 126 -------------- .../src/Cajita_ParticleDynamicPartitioner.hpp | 159 ++++++++++++++++++ .../Cajita_SparseMapDynamicPartitioner.hpp | 90 ++++++++++ .../tstParticleDynamicPartitioner.hpp | 2 +- .../tstSparseMapDynamicPartitioner.hpp | 2 +- 5 files changed, 251 insertions(+), 128 deletions(-) create mode 100644 cajita/src/Cajita_ParticleDynamicPartitioner.hpp create mode 100644 cajita/src/Cajita_SparseMapDynamicPartitioner.hpp diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index 0509a9fc9..0519ff0ed 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -763,132 +763,6 @@ class DynamicPartitioner : public BlockPartitioner } }; -/*! - Dynamic mesh block partitioner. (Current Version: Support 3D only) Workload - are computed from particle distribution. - - \tparam Device Kokkos device type. - \tparam CellPerTileDim Cells per tile per dimension. - \tparam NumSpaceDim Dimemsion (The current version support 3D only) -*/ -template -class ParticleDynamicPartitioner - : public DynamicPartitioner -{ - using base = DynamicPartitioner; - using base::base; - - protected: - using base::_workload_per_tile; - - public: - using base::cell_bits_per_tile_dim; - using base::num_space_dim; - using typename base::execution_space; - - /*! - \brief compute the workload in the current MPI rank from particle - positions (each particle count for 1 workload value). 
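// A minimal usage sketch (illustration only, not part of the patch): assuming
// a Kokkos view `positions` of shape (n, 3) living in the partitioner's
// memory space, the particle-based workload path is driven as
//
//   Cajita::ParticleDynamicPartitioner<Device> partitioner(
//       comm, max_workload_coeff, workload_num, num_step_rebalance,
//       global_cells_per_dim, max_optimize_iteration );
//   partitioner.setLocalWorkloadByParticles( positions, n, global_low_corner,
//                                            cell_size, comm );
//   partitioner.optimizePartition( comm );
//   auto partition = partitioner.getCurrentPartition();
//
// The constructor arguments follow the unit tests; all other names here are
// placeholders.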
This function must - be called before running optimizePartition() \param view particle - positions view \param particle_num total particle number \param - global_lower_corner the coordinate of the domain global lower corner - \param dx cell dx size - \param comm MPI communicator used for workload reduction - */ - template - void setLocalWorkloadByParticles( const ParticlePosViewType& view, - int particle_num, - const ArrayType& global_lower_corner, - const CellUnit dx, MPI_Comm comm ) - { - base::resetWorkload(); - // make a local copy - auto workload = _workload_per_tile; - Kokkos::Array lower_corner; - for ( std::size_t d = 0; d < num_space_dim; ++d ) - { - lower_corner[d] = global_lower_corner[d]; - } - - Kokkos::parallel_for( - "compute_local_workload_parpos", - Kokkos::RangePolicy( 0, particle_num ), - KOKKOS_LAMBDA( const int i ) { - int ti = static_cast( - ( view( i, 0 ) - lower_corner[0] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - int tj = static_cast( - ( view( i, 1 ) - lower_corner[1] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - int tz = static_cast( - ( view( i, 2 ) - lower_corner[2] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tz + 1 ) ); - } ); - Kokkos::fence(); - // Wait for other ranks' workload to be ready - MPI_Barrier( comm ); - } -}; - -/*! - Dynamic mesh block partitioner. (Current Version: Support 3D only) Workload - are computed from sparse map occupancy. - - \tparam Device Kokkos device type. - \tparam CellPerTileDim Cells per tile per dimension. - \tparam NumSpaceDim Dimemsion (The current version support 3D only) -*/ -template -class SparseMapDynamicPartitioner - : public DynamicPartitioner -{ - using base = DynamicPartitioner; - using base::base; - - protected: - using base::_workload_per_tile; - - public: - using base::cell_bits_per_tile_dim; - using base::num_space_dim; - using typename base::execution_space; - - /*! - \brief compute the workload in the current MPI rank from sparseMap - (the workload of a tile is 1 if the tile is occupied, 0 otherwise). This - function must be called before running optimizePartition() \param - sparseMap sparseMap in the current rank \param comm MPI communicator used - for workload reduction - */ - template - void setLocalWorkloadBySparseMap( const SparseMapType& sparseMap, - MPI_Comm comm ) - { - base::resetWorkload(); - // make a local copy - auto workload = _workload_per_tile; - Kokkos::parallel_for( - "compute_local_workload_sparsmap", - Kokkos::RangePolicy( 0, sparseMap.capacity() ), - KOKKOS_LAMBDA( uint32_t i ) { - if ( sparseMap.valid_at( i ) ) - { - auto key = sparseMap.key_at( i ); - int ti, tj, tk; - sparseMap.key2ijk( key, ti, tj, tk ); - Kokkos::atomic_increment( - &workload( ti + 1, tj + 1, tk + 1 ) ); - } - } ); - Kokkos::fence(); - // Wait for other ranks' workload to be ready - MPI_Barrier( comm ); - } -}; - } // end namespace Cajita #endif // end CAJITA_DYNAMICPARTITIONER_HPP diff --git a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp new file mode 100644 index 000000000..ae40895b6 --- /dev/null +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -0,0 +1,159 @@ +/**************************************************************************** + * Copyright (c) 2018-2022 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. 
For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +/*! + \file Cajita_ParticleDynamicPartitioner.hpp + \brief Multi-node particle based dynamic grid partitioner +*/ +#ifndef CAJITA_PARTICLEDYNAMICPARTITIONER_HPP +#define CAJITA_PARTICLEDYNAMICPARTITIONER_HPP + +#include +#include +#include + +#include +#include + +#include + +namespace Cajita +{ +//---------------------------------------------------------------------------// +/*! + Dynamic mesh block partitioner. (Current Version: Support 3D only) Workload + are computed from particle distribution. + + \tparam Device Kokkos device type. + \tparam CellPerTileDim Cells per tile per dimension. + \tparam NumSpaceDim Dimemsion (The current version support 3D only) +*/ +template +class ParticleDynamicPartitioner + : public DynamicPartitioner +{ + using base = DynamicPartitioner; + using base::base; + + protected: + using base::_workload_per_tile; + + public: + using base::cell_bits_per_tile_dim; + using base::num_space_dim; + using typename base::execution_space; + + /*! + \brief compute the workload in the current MPI rank from particle + positions (each particle count for 1 workload value). This function must + be called before running optimizePartition() \param view particle + positions view \param particle_num total particle number \param + global_lower_corner the coordinate of the domain global lower corner + \param dx cell dx size + \param comm MPI communicator used for workload reduction + */ + template + void setLocalWorkloadByParticles( const ParticlePosViewType& view, + int particle_num, + const ArrayType& global_lower_corner, + const CellUnit dx, MPI_Comm comm ) + { + base::resetWorkload(); + // make a local copy + auto workload = _workload_per_tile; + Kokkos::Array lower_corner; + for ( std::size_t d = 0; d < num_space_dim; ++d ) + { + lower_corner[d] = global_lower_corner[d]; + } + + Kokkos::parallel_for( + "compute_local_workload_parpos", + Kokkos::RangePolicy( 0, particle_num ), + KOKKOS_LAMBDA( const int i ) { + int ti = static_cast( + ( view( i, 0 ) - lower_corner[0] ) / dx - 0.5 ) >> + cell_bits_per_tile_dim; + int tj = static_cast( + ( view( i, 1 ) - lower_corner[1] ) / dx - 0.5 ) >> + cell_bits_per_tile_dim; + int tz = static_cast( + ( view( i, 2 ) - lower_corner[2] ) / dx - 0.5 ) >> + cell_bits_per_tile_dim; + Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tz + 1 ) ); + } ); + Kokkos::fence(); + // Wait for other ranks' workload to be ready + MPI_Barrier( comm ); + } +}; + +/*! + Dynamic mesh block partitioner. (Current Version: Support 3D only) Workload + are computed from sparse map occupancy. + + \tparam Device Kokkos device type. + \tparam CellPerTileDim Cells per tile per dimension. + \tparam NumSpaceDim Dimemsion (The current version support 3D only) +*/ +template +class SparseMapDynamicPartitioner + : public DynamicPartitioner +{ + using base = DynamicPartitioner; + using base::base; + + protected: + using base::_workload_per_tile; + + public: + using base::cell_bits_per_tile_dim; + using base::num_space_dim; + using typename base::execution_space; + + /*! + \brief compute the workload in the current MPI rank from sparseMap + (the workload of a tile is 1 if the tile is occupied, 0 otherwise). 
This + function must be called before running optimizePartition() \param + sparseMap sparseMap in the current rank \param comm MPI communicator used + for workload reduction + */ + template + void setLocalWorkloadBySparseMap( const SparseMapType& sparseMap, + MPI_Comm comm ) + { + base::resetWorkload(); + // make a local copy + auto workload = _workload_per_tile; + Kokkos::parallel_for( + "compute_local_workload_sparsmap", + Kokkos::RangePolicy( 0, sparseMap.capacity() ), + KOKKOS_LAMBDA( uint32_t i ) { + if ( sparseMap.valid_at( i ) ) + { + auto key = sparseMap.key_at( i ); + int ti, tj, tk; + sparseMap.key2ijk( key, ti, tj, tk ); + Kokkos::atomic_increment( + &workload( ti + 1, tj + 1, tk + 1 ) ); + } + } ); + Kokkos::fence(); + // Wait for other ranks' workload to be ready + MPI_Barrier( comm ); + } +}; + +} // end namespace Cajita + +#endif // end CAJITA_PARTICLEDYNAMICPARTITIONER_HPP diff --git a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp new file mode 100644 index 000000000..108a1b1b7 --- /dev/null +++ b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp @@ -0,0 +1,90 @@ +/**************************************************************************** + * Copyright (c) 2018-2022 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +/*! + \file Cajita_SparseMapDynamicPartitioner.hpp + \brief Multi-node sparse map based dynamic grid partitioner +*/ +#ifndef CAJITA_SPARSEMAPDYNAMICPARTITIONER_HPP +#define CAJITA_SPARSEMAPDYNAMICPARTITIONER_HPP + +#include +#include +#include + +#include +#include + +#include + +namespace Cajita +{ +//---------------------------------------------------------------------------// +/*! + Dynamic mesh block partitioner. (Current Version: Support 3D only) Workload + are computed from sparse map occupancy. + + \tparam Device Kokkos device type. + \tparam CellPerTileDim Cells per tile per dimension. + \tparam NumSpaceDim Dimemsion (The current version support 3D only) +*/ +template +class SparseMapDynamicPartitioner + : public DynamicPartitioner +{ + using base = DynamicPartitioner; + using base::base; + + protected: + using base::_workload_per_tile; + + public: + using base::cell_bits_per_tile_dim; + using base::num_space_dim; + using typename base::execution_space; + + /*! + \brief compute the workload in the current MPI rank from sparseMap + (the workload of a tile is 1 if the tile is occupied, 0 otherwise). 
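// A minimal usage sketch (illustration only, not part of the patch): assuming
// a sparse map `sis` built with createSparseMap<ExecutionSpace>( global_mesh,
// pre_alloc_size ) and populated with the locally occupied tiles, the
// sparse-map workload path is driven as
//
//   Cajita::SparseMapDynamicPartitioner<Device> partitioner(
//       comm, max_workload_coeff, workload_num, num_step_rebalance,
//       global_cells_per_dim, max_optimize_iteration );
//   partitioner.setLocalWorkloadBySparseMap( sis, comm );
//   partitioner.optimizePartition( comm );
//   auto partition = partitioner.getCurrentPartition();
//
// `sis` and the constructor arguments mirror the unit tests; the names are
// placeholders.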
This + function must be called before running optimizePartition() \param + sparseMap sparseMap in the current rank \param comm MPI communicator used + for workload reduction + */ + template + void setLocalWorkloadBySparseMap( const SparseMapType& sparseMap, + MPI_Comm comm ) + { + base::resetWorkload(); + // make a local copy + auto workload = _workload_per_tile; + Kokkos::parallel_for( + "compute_local_workload_sparsmap", + Kokkos::RangePolicy( 0, sparseMap.capacity() ), + KOKKOS_LAMBDA( uint32_t i ) { + if ( sparseMap.valid_at( i ) ) + { + auto key = sparseMap.key_at( i ); + int ti, tj, tk; + sparseMap.key2ijk( key, ti, tj, tk ); + Kokkos::atomic_increment( + &workload( ti + 1, tj + 1, tk + 1 ) ); + } + } ); + Kokkos::fence(); + // Wait for other ranks' workload to be ready + MPI_Barrier( comm ); + } +}; + +} // end namespace Cajita + +#endif // end CAJITA_SPARSEMAPDYNAMICPARTITIONER_HPP diff --git a/cajita/unit_test/tstParticleDynamicPartitioner.hpp b/cajita/unit_test/tstParticleDynamicPartitioner.hpp index 0f8e71f41..db0dbe754 100644 --- a/cajita/unit_test/tstParticleDynamicPartitioner.hpp +++ b/cajita/unit_test/tstParticleDynamicPartitioner.hpp @@ -9,7 +9,7 @@ * SPDX-License-Identifier: BSD-3-Clause * ****************************************************************************/ -#include +#include #include #include diff --git a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp index 063ae8e95..d907ba1a4 100644 --- a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp +++ b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp @@ -9,7 +9,7 @@ * SPDX-License-Identifier: BSD-3-Clause * ****************************************************************************/ -#include +#include #include #include From 2120482aed50f78db0d631660b6f27fa1d7da929 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Tue, 28 Jun 2022 10:36:22 -0700 Subject: [PATCH 14/42] Separate performance test files --- benchmark/cajita/CMakeLists.txt | 11 +- ..._ParticleDynamicPartitionerPerformance.cpp | 300 ++++++++++++++++++ ...parseMapDynamicPartitionerPerformance.cpp} | 155 +-------- 3 files changed, 309 insertions(+), 157 deletions(-) create mode 100644 benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp rename benchmark/cajita/{Cajita_DynamicPartitionerPerformance.cpp => Cajita_SparseMapDynamicPartitionerPerformance.cpp} (64%) diff --git a/benchmark/cajita/CMakeLists.txt b/benchmark/cajita/CMakeLists.txt index 608621e29..404333fdd 100644 --- a/benchmark/cajita/CMakeLists.txt +++ b/benchmark/cajita/CMakeLists.txt @@ -12,8 +12,11 @@ add_executable(SparseMapPerformance Cajita_SparseMapPerformance.cpp) target_link_libraries(SparseMapPerformance Cajita) -add_executable(DynamicPartitionerPerformance Cajita_DynamicPartitionerPerformance.cpp) -target_link_libraries(DynamicPartitionerPerformance Cajita) +add_executable(ParticleDynamicPartitionerPerformance Cajita_ParticleDynamicPartitionerPerformance.cpp) +target_link_libraries(ParticleDynamicPartitionerPerformance Cajita) + +add_executable(SparseMapDynamicPartitionerPerformance Cajita_SparseMapDynamicPartitionerPerformance.cpp) +target_link_libraries(SparseMapDynamicPartitionerPerformance Cajita) add_executable(HaloPerformance Cajita_HaloPerformance.cpp) target_link_libraries(HaloPerformance Cajita) @@ -29,7 +32,9 @@ endif() if(Cabana_ENABLE_TESTING) add_test(NAME Cajita_SparseMapPerformance COMMAND ${NONMPI_PRECOMMAND} SparseMapPerformance sparsemap_output.txt) - add_test(NAME 
Cajita_DynamicPartitionerPerformance COMMAND ${NONMPI_PRECOMMAND} DynamicPartitionerPerformance dynamicpartitioner_output.txt) + add_test(NAME Cajita_ParticleDynamicPartitionerPerformance COMMAND ${NONMPI_PRECOMMAND} ParticleDynamicPartitionerPerformance particledynamicpartitioner_output.txt) + + add_test(NAME Cajita_SparseMapDynamicPartitionerPerformance COMMAND ${NONMPI_PRECOMMAND} SparseMapDynamicPartitionerPerformance sparsemapdynamicpartitioner_output.txt) add_test(NAME Cajita_HaloPerformance COMMAND ${NONMPI_PRECOMMAND} HaloPerformance halo_output.txt) diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp new file mode 100644 index 000000000..63494419a --- /dev/null +++ b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp @@ -0,0 +1,300 @@ +/**************************************************************************** + * Copyright (c) 2018-2022 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include "../Cabana_BenchmarkUtils.hpp" +#include "Cabana_ParticleInit.hpp" + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +//---------------------------------------------------------------------------// +// Helper functions. +struct ParticleWorkloadTag +{ +}; + +// generate average partitioner +std::array, 3> computeAveragePartition( + const int tile_per_dim, const std::array& ranks_per_dim ) +{ + std::array, 3> rec_partitions; + for ( int d = 0; d < 3; ++d ) + { + int ele = tile_per_dim / ranks_per_dim[d]; + int part = 0; + for ( int i = 0; i < ranks_per_dim[d]; ++i ) + { + rec_partitions[d].push_back( part ); + part += ele; + } + rec_partitions[d].push_back( tile_per_dim ); + } + return rec_partitions; +} + +//---------------------------------------------------------------------------// +// Performance test. +template +void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, + const std::string& test_prefix, + std::vector problem_sizes, + std::vector num_cells_per_dim ) +{ + using memory_space = typename Device::memory_space; + + // Get comm rank; + int comm_rank; + MPI_Comm_rank( comm, &comm_rank ); + + // Get comm size; + int comm_size; + MPI_Comm_size( comm, &comm_size ); + + // Domain size setup + std::array global_low_corner = { 0.0, 0.0, 0.0 }; + std::array global_high_corner = { 1.0, 1.0, 1.0 }; + constexpr int cell_num_per_tile_dim = 4; + constexpr int cell_bits_per_tile_dim = 2; + + // Declare the total number of particles + int num_problem_size = problem_sizes.size(); + + // Declare the size (cell nums) of the domain + int num_cells_per_dim_size = num_cells_per_dim.size(); + + // Number of runs in the test loops. + int num_run = 10; + + // Basic settings for partitioenr + float max_workload_coeff = 1.5; + int max_optimize_iteration = 10; + int num_step_rebalance = 100; + + // compute the max number of particles handled by the current MPI rank + int max_par_num = problem_sizes.back() / comm_size + + ( problem_sizes.back() % comm_size < comm_rank ? 1 : 0 ); + + // Create random sets of particle positions. 
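    // Note on sizes: with cell_num_per_tile_dim = 4 (cell_bits_per_tile_dim =
    // 2), num_tiles_per_dim = num_cells_per_dim >> 2, so e.g. a 64^3-cell
    // domain maps to a 16^3 tile grid. The positions generated below are
    // uniform between the global low and high corners in every dimension, and
    // each rank later takes a subview covering only its own share of them.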
+ using position_type = Kokkos::View; + std::vector positions( num_problem_size ); + for ( int p = 0; p < num_problem_size; ++p ) + { + positions[p] = position_type( + Kokkos::ViewAllocateWithoutInitializing( "positions" ), + problem_sizes[p] ); + Cabana::createRandomParticles( positions[p], problem_sizes[p], + global_low_corner[0], + global_high_corner[0] ); + } + + for ( int c = 0; c < num_cells_per_dim_size; ++c ) + { + // init the sparse grid domain + std::array global_num_cell = { + num_cells_per_dim[c], num_cells_per_dim[c], num_cells_per_dim[c] }; + int num_tiles_per_dim = num_cells_per_dim[c] >> cell_bits_per_tile_dim; + + // set up partitioner + Cajita::ParticleDynamicPartitioner + partitioner( comm, max_workload_coeff, max_par_num, + num_step_rebalance, global_num_cell, + max_optimize_iteration ); + auto ranks_per_dim = + partitioner.ranksPerDimension( comm, global_num_cell ); + auto ave_partition = + computeAveragePartition( num_tiles_per_dim, ranks_per_dim ); + + // Create insertion timers + std::stringstream local_workload_name; + local_workload_name << test_prefix << "compute_local_workload_" + << "domain_size(cell)_" << num_cells_per_dim[c]; + Cabana::Benchmark::Timer local_workload_timer( + local_workload_name.str(), num_problem_size ); + + std::stringstream prefix_sum_name; + prefix_sum_name << test_prefix << "compute_prefix_sum_" + << "domain_size(cell)_" << num_cells_per_dim[c]; + Cabana::Benchmark::Timer prefix_sum_timer( prefix_sum_name.str(), + num_problem_size ); + + std::stringstream total_optimize_name; + total_optimize_name << test_prefix << "total_optimize_" + << "domain_size(cell)_" << num_cells_per_dim[c]; + Cabana::Benchmark::Timer total_optimize_timer( + total_optimize_name.str(), num_problem_size ); + + // loop over all the particle numbers + for ( int p = 0; p < num_problem_size; ++p ) + { + // compute the number of particles handled by the current MPI rank + int par_num = problem_sizes[p] / comm_size + + ( problem_sizes[p] % comm_size < comm_rank ? 
1 : 0 ); + + auto pos_view = Kokkos::subview( + positions[p], Kokkos::pair( 0, par_num ), + Kokkos::pair( 0, 3 ) ); + + // try for num_run times + for ( int t = 0; t < num_run; ++t ) + { + // ensure every optimization process starts from the same status + partitioner.initializeRecPartition( + ave_partition[0], ave_partition[1], ave_partition[2] ); + + // compute local workload + local_workload_timer.start( p ); + partitioner.setLocalWorkloadByParticles( + pos_view, par_num, global_low_corner, + 1.0f / num_cells_per_dim[c], comm ); + local_workload_timer.stop( p ); + + // compute prefix sum matrix + prefix_sum_timer.start( p ); + partitioner.computeFullPrefixSum( comm ); + prefix_sum_timer.stop( p ); + + // optimization + bool is_changed = false; + // full timer + total_optimize_timer.start( p ); + for ( int i = 0; i < max_optimize_iteration; ++i ) + { + partitioner.optimizePartitionAlongDim( std::rand() % 3, + is_changed ); + if ( !is_changed ) + break; + } + total_optimize_timer.stop( p ); + } + } + // Output results + outputResults( stream, "insert_tile_num", problem_sizes, + local_workload_timer, comm ); + outputResults( stream, "insert_tile_num", problem_sizes, + prefix_sum_timer, comm ); + outputResults( stream, "insert_tile_num", problem_sizes, + total_optimize_timer, comm ); + stream << std::flush; + } +} + +//---------------------------------------------------------------------------// +// main +int main( int argc, char* argv[] ) +{ + // Initialize environment + MPI_Init( &argc, &argv ); + Kokkos::initialize( argc, argv ); + + // Check arguments. + if ( argc < 2 ) + throw std::runtime_error( "Incorrect number of arguments. \n \ + First argument - file name for output \n \ + Optional second argument - run size (small or large) \n \ + \n \ + Example: \n \ + $/: ./SparseMapPerformance test_results.txt\n" ); + + // Define run sizes. + std::string run_type = ""; + if ( argc > 2 ) + run_type = argv[2]; + std::vector problem_sizes = { 1000, 10000 }; + std::vector num_cells_per_dim = { 32, 64 }; + if ( run_type == "large" ) + { + problem_sizes = { 1000, 10000, 100000, 1000000 }; + num_cells_per_dim = { 32, 64, 128, 256 }; + } + std::vector occupy_fraction = { 0.01, 0.1, 0.5, 0.75, 1.0 }; + + // Get the name of the output file. + std::string filename = argv[1]; + + // Barier before continuing. + MPI_Barrier( MPI_COMM_WORLD ); + + // Get comm rank; + int comm_rank; + MPI_Comm_rank( MPI_COMM_WORLD, &comm_rank ); + + // Get comm size; + int comm_size; + MPI_Comm_size( MPI_COMM_WORLD, &comm_size ); + + // Get Cartesian comm + std::array ranks_per_dim; + for ( std::size_t d = 0; d < 3; ++d ) + ranks_per_dim[d] = 0; + MPI_Dims_create( comm_size, 3, ranks_per_dim.data() ); + + // Open the output file on rank 0. + std::fstream file; + if ( 0 == comm_rank ) + file.open( filename, std::fstream::out ); + + // Output problem details. + if ( 0 == comm_rank ) + { + file << "\n"; + file << "Cajita Sparse Partitioner Performance Benchmark" + << "\n"; + file << "----------------------------------------------" + << "\n"; + file << "MPI Ranks: " << comm_size << "\n"; + file << "MPI Cartesian Dim Ranks: (" << ranks_per_dim[0] << ", " + << ranks_per_dim[1] << ", " << ranks_per_dim[2] << ")\n"; + file << "----------------------------------------------" + << "\n"; + file << "\n"; + file << std::flush; + } + + // Do everything on the default CPU. 
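    // The aliases below pick up whatever backends Kokkos was configured with;
    // the std::is_same check further down skips the device pass when the
    // default execution space already is the host, so the same backend is not
    // benchmarked twice.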
+ using host_exec_space = Kokkos::DefaultHostExecutionSpace; + using host_device_type = host_exec_space::device_type; + // Do everything on the default device with default memory. + using exec_space = Kokkos::DefaultExecutionSpace; + using device_type = exec_space::device_type; + + // Don't run twice on the CPU if only host enabled. + // Don't rerun on the CPU if already done or if turned off. + if ( !std::is_same{} ) + { + performanceTest( ParticleWorkloadTag(), file, + MPI_COMM_WORLD, "device_particleWL_", + problem_sizes, num_cells_per_dim ); + } + performanceTest( ParticleWorkloadTag(), file, + MPI_COMM_WORLD, "host_particleWL_", + problem_sizes, num_cells_per_dim ); + + // Close the output file on rank 0. + file.close(); + + // Finalize + Kokkos::finalize(); + MPI_Finalize(); + return 0; +} diff --git a/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp similarity index 64% rename from benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp rename to benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp index 7be6e347c..9c900f6db 100644 --- a/benchmark/cajita/Cajita_DynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp @@ -12,7 +12,7 @@ #include "../Cabana_BenchmarkUtils.hpp" #include "Cabana_ParticleInit.hpp" -#include +#include #include #include @@ -30,9 +30,6 @@ //---------------------------------------------------------------------------// // Helper functions. -struct ParticleWorkloadTag -{ -}; struct SparseMapTag { }; @@ -81,150 +78,6 @@ std::array, 3> computeAveragePartition( //---------------------------------------------------------------------------// // Performance test. -template -void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, - const std::string& test_prefix, - std::vector problem_sizes, - std::vector num_cells_per_dim ) -{ - using memory_space = typename Device::memory_space; - - // Get comm rank; - int comm_rank; - MPI_Comm_rank( comm, &comm_rank ); - - // Get comm size; - int comm_size; - MPI_Comm_size( comm, &comm_size ); - - // Domain size setup - std::array global_low_corner = { 0.0, 0.0, 0.0 }; - std::array global_high_corner = { 1.0, 1.0, 1.0 }; - constexpr int cell_num_per_tile_dim = 4; - constexpr int cell_bits_per_tile_dim = 2; - - // Declare the total number of particles - int num_problem_size = problem_sizes.size(); - - // Declare the size (cell nums) of the domain - int num_cells_per_dim_size = num_cells_per_dim.size(); - - // Number of runs in the test loops. - int num_run = 10; - - // Basic settings for partitioenr - float max_workload_coeff = 1.5; - int max_optimize_iteration = 10; - int num_step_rebalance = 100; - - // compute the max number of particles handled by the current MPI rank - int max_par_num = problem_sizes.back() / comm_size + - ( problem_sizes.back() % comm_size < comm_rank ? 1 : 0 ); - - // Create random sets of particle positions. 
- using position_type = Kokkos::View; - std::vector positions( num_problem_size ); - for ( int p = 0; p < num_problem_size; ++p ) - { - positions[p] = position_type( - Kokkos::ViewAllocateWithoutInitializing( "positions" ), - problem_sizes[p] ); - Cabana::createRandomParticles( positions[p], problem_sizes[p], - global_low_corner[0], - global_high_corner[0] ); - } - - for ( int c = 0; c < num_cells_per_dim_size; ++c ) - { - // init the sparse grid domain - std::array global_num_cell = { - num_cells_per_dim[c], num_cells_per_dim[c], num_cells_per_dim[c] }; - int num_tiles_per_dim = num_cells_per_dim[c] >> cell_bits_per_tile_dim; - - // set up partitioner - Cajita::ParticleDynamicPartitioner - partitioner( comm, max_workload_coeff, max_par_num, - num_step_rebalance, global_num_cell, - max_optimize_iteration ); - auto ranks_per_dim = - partitioner.ranksPerDimension( comm, global_num_cell ); - auto ave_partition = - computeAveragePartition( num_tiles_per_dim, ranks_per_dim ); - - // Create insertion timers - std::stringstream local_workload_name; - local_workload_name << test_prefix << "compute_local_workload_" - << "domain_size(cell)_" << num_cells_per_dim[c]; - Cabana::Benchmark::Timer local_workload_timer( - local_workload_name.str(), num_problem_size ); - - std::stringstream prefix_sum_name; - prefix_sum_name << test_prefix << "compute_prefix_sum_" - << "domain_size(cell)_" << num_cells_per_dim[c]; - Cabana::Benchmark::Timer prefix_sum_timer( prefix_sum_name.str(), - num_problem_size ); - - std::stringstream total_optimize_name; - total_optimize_name << test_prefix << "total_optimize_" - << "domain_size(cell)_" << num_cells_per_dim[c]; - Cabana::Benchmark::Timer total_optimize_timer( - total_optimize_name.str(), num_problem_size ); - - // loop over all the particle numbers - for ( int p = 0; p < num_problem_size; ++p ) - { - // compute the number of particles handled by the current MPI rank - int par_num = problem_sizes[p] / comm_size + - ( problem_sizes[p] % comm_size < comm_rank ? 
1 : 0 ); - - auto pos_view = Kokkos::subview( - positions[p], Kokkos::pair( 0, par_num ), - Kokkos::pair( 0, 3 ) ); - - // try for num_run times - for ( int t = 0; t < num_run; ++t ) - { - // ensure every optimization process starts from the same status - partitioner.initializeRecPartition( - ave_partition[0], ave_partition[1], ave_partition[2] ); - - // compute local workload - local_workload_timer.start( p ); - partitioner.setLocalWorkloadByParticles( - pos_view, par_num, global_low_corner, - 1.0f / num_cells_per_dim[c], comm ); - local_workload_timer.stop( p ); - - // compute prefix sum matrix - prefix_sum_timer.start( p ); - partitioner.computeFullPrefixSum( comm ); - prefix_sum_timer.stop( p ); - - // optimization - bool is_changed = false; - // full timer - total_optimize_timer.start( p ); - for ( int i = 0; i < max_optimize_iteration; ++i ) - { - partitioner.optimizePartitionAlongDim( std::rand() % 3, - is_changed ); - if ( !is_changed ) - break; - } - total_optimize_timer.stop( p ); - } - } - // Output results - outputResults( stream, "insert_tile_num", problem_sizes, - local_workload_timer, comm ); - outputResults( stream, "insert_tile_num", problem_sizes, - prefix_sum_timer, comm ); - outputResults( stream, "insert_tile_num", problem_sizes, - total_optimize_timer, comm ); - stream << std::flush; - } -} - template void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, const std::string& test_prefix, @@ -445,16 +298,10 @@ int main( int argc, char* argv[] ) // Don't rerun on the CPU if already done or if turned off. if ( !std::is_same{} ) { - performanceTest( ParticleWorkloadTag(), file, - MPI_COMM_WORLD, "device_particleWL_", - problem_sizes, num_cells_per_dim ); performanceTest( SparseMapTag(), file, MPI_COMM_WORLD, "device_sparsemapWL_", occupy_fraction, num_cells_per_dim ); } - performanceTest( ParticleWorkloadTag(), file, - MPI_COMM_WORLD, "host_particleWL_", - problem_sizes, num_cells_per_dim ); performanceTest( SparseMapTag(), file, MPI_COMM_WORLD, "host_sparsemapWL_", occupy_fraction, num_cells_per_dim ); From 5ab63988f3529a8e22a2b7efc66861c5454d9032 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Tue, 28 Jun 2022 10:42:29 -0700 Subject: [PATCH 15/42] Clean up --- .../src/Cajita_ParticleDynamicPartitioner.hpp | 57 ------------------- 1 file changed, 57 deletions(-) diff --git a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp index ae40895b6..8756eceed 100644 --- a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -97,63 +97,6 @@ class ParticleDynamicPartitioner } }; -/*! - Dynamic mesh block partitioner. (Current Version: Support 3D only) Workload - are computed from sparse map occupancy. - - \tparam Device Kokkos device type. - \tparam CellPerTileDim Cells per tile per dimension. - \tparam NumSpaceDim Dimemsion (The current version support 3D only) -*/ -template -class SparseMapDynamicPartitioner - : public DynamicPartitioner -{ - using base = DynamicPartitioner; - using base::base; - - protected: - using base::_workload_per_tile; - - public: - using base::cell_bits_per_tile_dim; - using base::num_space_dim; - using typename base::execution_space; - - /*! - \brief compute the workload in the current MPI rank from sparseMap - (the workload of a tile is 1 if the tile is occupied, 0 otherwise). 
This - function must be called before running optimizePartition() \param - sparseMap sparseMap in the current rank \param comm MPI communicator used - for workload reduction - */ - template - void setLocalWorkloadBySparseMap( const SparseMapType& sparseMap, - MPI_Comm comm ) - { - base::resetWorkload(); - // make a local copy - auto workload = _workload_per_tile; - Kokkos::parallel_for( - "compute_local_workload_sparsmap", - Kokkos::RangePolicy( 0, sparseMap.capacity() ), - KOKKOS_LAMBDA( uint32_t i ) { - if ( sparseMap.valid_at( i ) ) - { - auto key = sparseMap.key_at( i ); - int ti, tj, tk; - sparseMap.key2ijk( key, ti, tj, tk ); - Kokkos::atomic_increment( - &workload( ti + 1, tj + 1, tk + 1 ) ); - } - } ); - Kokkos::fence(); - // Wait for other ranks' workload to be ready - MPI_Barrier( comm ); - } -}; - } // end namespace Cajita #endif // end CAJITA_PARTICLEDYNAMICPARTITIONER_HPP From ceecf4ad4769982bb97ef9d24cc6af91e22a2699 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Tue, 28 Jun 2022 10:46:13 -0700 Subject: [PATCH 16/42] Rename setLocalWorkload --- .../cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp | 2 +- .../cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp | 2 +- cajita/src/Cajita_ParticleDynamicPartitioner.hpp | 2 +- cajita/src/Cajita_SparseMapDynamicPartitioner.hpp | 2 +- cajita/unit_test/tstParticleDynamicPartitioner.hpp | 2 +- cajita/unit_test/tstSparseMapDynamicPartitioner.hpp | 4 ++-- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp index 63494419a..1878356d2 100644 --- a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp @@ -164,7 +164,7 @@ void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, // compute local workload local_workload_timer.start( p ); - partitioner.setLocalWorkloadByParticles( + partitioner.setLocalWorkload( pos_view, par_num, global_low_corner, 1.0f / num_cells_per_dim[c], comm ); local_workload_timer.stop( p ); diff --git a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp index 9c900f6db..501620949 100644 --- a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp @@ -181,7 +181,7 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, // compute local workload local_workload_timer.start( frac ); - partitioner.setLocalWorkloadBySparseMap( sis, comm ); + partitioner.setLocalWorkload( sis, comm ); local_workload_timer.stop( frac ); // compute prefix sum matrix diff --git a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp index 8756eceed..24384d5d5 100644 --- a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -62,7 +62,7 @@ class ParticleDynamicPartitioner \param comm MPI communicator used for workload reduction */ template - void setLocalWorkloadByParticles( const ParticlePosViewType& view, + void setLocalWorkload( const ParticlePosViewType& view, int particle_num, const ArrayType& global_lower_corner, const CellUnit dx, MPI_Comm comm ) diff --git a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp index 108a1b1b7..c5ae8e5e8 100644 --- 
a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp +++ b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp @@ -60,7 +60,7 @@ class SparseMapDynamicPartitioner for workload reduction */ template - void setLocalWorkloadBySparseMap( const SparseMapType& sparseMap, + void setLocalWorkload( const SparseMapType& sparseMap, MPI_Comm comm ) { base::resetWorkload(); diff --git a/cajita/unit_test/tstParticleDynamicPartitioner.hpp b/cajita/unit_test/tstParticleDynamicPartitioner.hpp index db0dbe754..7e099b0da 100644 --- a/cajita/unit_test/tstParticleDynamicPartitioner.hpp +++ b/cajita/unit_test/tstParticleDynamicPartitioner.hpp @@ -210,7 +210,7 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) // compute workload from a particle view and do partition optimization dynamic_cast*>( &partitioner ) - ->setLocalWorkloadByParticles( particle_view, occupy_num_per_rank, + ->setLocalWorkload( particle_view, occupy_num_per_rank, global_low_corner, cell_size, MPI_COMM_WORLD ); partitioner.optimizePartition( MPI_COMM_WORLD ); diff --git a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp index d907ba1a4..ae30c5525 100644 --- a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp +++ b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp @@ -147,7 +147,7 @@ void uniform_distribution_automatic_rank() Kokkos::fence(); // compute workload and do partition optimization - partitioner.setLocalWorkloadBySparseMap( sis, MPI_COMM_WORLD ); + partitioner.setLocalWorkload( sis, MPI_COMM_WORLD ); partitioner.optimizePartition( MPI_COMM_WORLD ); // check results (should be the same as the average partition) @@ -364,7 +364,7 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) // compute workload from a sparseMap and do partition optimization dynamic_cast*>( &partitioner ) - ->setLocalWorkloadBySparseMap( sis, MPI_COMM_WORLD ); + ->setLocalWorkload( sis, MPI_COMM_WORLD ); partitioner.optimizePartition( MPI_COMM_WORLD ); // check results (should be the same as the gt_partition) From 45559fb6445062bc74313ad0ddc9ace0e671b7ee Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Tue, 28 Jun 2022 10:47:24 -0700 Subject: [PATCH 17/42] Format --- .../Cajita_ParticleDynamicPartitionerPerformance.cpp | 5 +++-- .../Cajita_SparseMapDynamicPartitionerPerformance.cpp | 2 +- cajita/src/Cajita_ParticleDynamicPartitioner.hpp | 7 +++---- cajita/src/Cajita_SparseMapDynamicPartitioner.hpp | 3 +-- cajita/unit_test/tstParticleDynamicPartitioner.hpp | 3 +-- cajita/unit_test/tstSparseMapDynamicPartitioner.hpp | 2 +- 6 files changed, 10 insertions(+), 12 deletions(-) diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp index 1878356d2..641e81860 100644 --- a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp @@ -35,8 +35,9 @@ struct ParticleWorkloadTag }; // generate average partitioner -std::array, 3> computeAveragePartition( - const int tile_per_dim, const std::array& ranks_per_dim ) +std::array, 3> +computeAveragePartition( const int tile_per_dim, + const std::array& ranks_per_dim ) { std::array, 3> rec_partitions; for ( int d = 0; d < 3; ++d ) diff --git a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp index 501620949..06a8a08e0 100644 --- 
a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp @@ -12,8 +12,8 @@ #include "../Cabana_BenchmarkUtils.hpp" #include "Cabana_ParticleInit.hpp" -#include #include +#include #include diff --git a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp index 24384d5d5..ed6009a0c 100644 --- a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -62,10 +62,9 @@ class ParticleDynamicPartitioner \param comm MPI communicator used for workload reduction */ template - void setLocalWorkload( const ParticlePosViewType& view, - int particle_num, - const ArrayType& global_lower_corner, - const CellUnit dx, MPI_Comm comm ) + void setLocalWorkload( const ParticlePosViewType& view, int particle_num, + const ArrayType& global_lower_corner, + const CellUnit dx, MPI_Comm comm ) { base::resetWorkload(); // make a local copy diff --git a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp index c5ae8e5e8..00498eab4 100644 --- a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp +++ b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp @@ -60,8 +60,7 @@ class SparseMapDynamicPartitioner for workload reduction */ template - void setLocalWorkload( const SparseMapType& sparseMap, - MPI_Comm comm ) + void setLocalWorkload( const SparseMapType& sparseMap, MPI_Comm comm ) { base::resetWorkload(); // make a local copy diff --git a/cajita/unit_test/tstParticleDynamicPartitioner.hpp b/cajita/unit_test/tstParticleDynamicPartitioner.hpp index 7e099b0da..8429b3380 100644 --- a/cajita/unit_test/tstParticleDynamicPartitioner.hpp +++ b/cajita/unit_test/tstParticleDynamicPartitioner.hpp @@ -211,8 +211,7 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) dynamic_cast*>( &partitioner ) ->setLocalWorkload( particle_view, occupy_num_per_rank, - global_low_corner, cell_size, - MPI_COMM_WORLD ); + global_low_corner, cell_size, MPI_COMM_WORLD ); partitioner.optimizePartition( MPI_COMM_WORLD ); // check results (should be the same as the gt_partition) diff --git a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp index ae30c5525..5154c4a4a 100644 --- a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp +++ b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp @@ -9,8 +9,8 @@ * SPDX-License-Identifier: BSD-3-Clause * ****************************************************************************/ -#include #include +#include #include #include From 3b817f0fec9f0e47259fc62e717b24120c2df8f7 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Tue, 28 Jun 2022 10:49:32 -0700 Subject: [PATCH 18/42] Format --- cajita/src/Cajita_DynamicPartitioner.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index 0519ff0ed..23daa62e6 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -31,8 +31,12 @@ namespace Cajita /*! Dynamic mesh block partitioner. (Current Version: Support 3D only) There should be no instantiation for this class without implementing any workload - computation. \tparam Device Kokkos device type. \tparam CellPerTileDim Cells - per tile per dimension. \tparam NumSpaceDim Dimemsion (The current version + computation. + + \tparam Device Kokkos device type. 
+ \tparam CellPerTileDim Cells + per tile per dimension. + \tparam NumSpaceDim Dimemsion (The current version support 3D only) */ template Date: Thu, 7 Jul 2022 04:23:03 -0700 Subject: [PATCH 19/42] Add WorkloadSetter class --- cajita/src/Cajita_DynamicPartitioner.hpp | 15 +++++ .../src/Cajita_ParticleDynamicPartitioner.hpp | 58 +++++++++++++++++++ .../Cajita_SparseMapDynamicPartitioner.hpp | 38 ++++++++++++ 3 files changed, 111 insertions(+) diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index 23daa62e6..c6a439b38 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -27,6 +27,15 @@ namespace Cajita { + +template +class WorkloadSetter +{ + using memory_space = typename Device::memory_space; + public: + virtual void run( Kokkos::View& ) = 0; +}; + //---------------------------------------------------------------------------// /*! Dynamic mesh block partitioner. (Current Version: Support 3D only) There @@ -423,6 +432,12 @@ class DynamicPartitioner : public BlockPartitioner Kokkos::fence(); } + void setLocalWorkload( WorkloadSetter* setter ) + { + resetWorkload(); + setter.run( _workload_per_tile ); + } + /*! \brief iteratively optimize the partition \param comm MPI communicator used for workload reduction diff --git a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp index ed6009a0c..f9151c868 100644 --- a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -27,6 +27,64 @@ namespace Cajita { + +template +class ParticleWorkloadSetter : public WorkloadSetter +{ + using memory_space = typename Device::memory_space; + using execution_space = typename Device::execution_space; + + static constexpr unsigned long long cell_bits_per_tile_dim = + bitCount( CellPerTileDim ); + + const ParticlePosViewType& view; + int particle_num; + const ArrayType& global_lower_corner; + const CellUnit dx; + MPI_Comm comm; + + public: + ParticleWorkloadSetter( const ParticlePosViewType& view, int particle_num, + const ArrayType& global_lower_corner, + const CellUnit dx, MPI_Comm comm ) + : view( view ) + , particle_num( particle_num ) + , global_lower_corner( global_lower_corner ) + , dx( dx ) + , comm( comm ) + { + } + + void run( Kokkos::View& workload ) override + { + Kokkos::Array lower_corner; + for ( std::size_t d = 0; d < num_space_dim; ++d ) + { + lower_corner[d] = global_lower_corner[d]; + } + + Kokkos::parallel_for( + "compute_local_workload_parpos", + Kokkos::RangePolicy( 0, particle_num ), + KOKKOS_LAMBDA( const int i ) { + int ti = static_cast( + ( view( i, 0 ) - lower_corner[0] ) / dx - 0.5 ) >> + cell_bits_per_tile_dim; + int tj = static_cast( + ( view( i, 1 ) - lower_corner[1] ) / dx - 0.5 ) >> + cell_bits_per_tile_dim; + int tz = static_cast( + ( view( i, 2 ) - lower_corner[2] ) / dx - 0.5 ) >> + cell_bits_per_tile_dim; + Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tz + 1 ) ); + } ); + Kokkos::fence(); + // Wait for other ranks' workload to be ready + MPI_Barrier( comm ); + } +}; + //---------------------------------------------------------------------------// /*! Dynamic mesh block partitioner. 
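// A hedged sketch (illustration only): any workload model can be plugged in
// by deriving from WorkloadSetter and overriding run(), which receives the
// per-tile workload view to fill. Assuming the base is parameterized on the
// Kokkos device type, as the setters in this patch are, a hypothetical setter
// that puts one unit of work on every tile of an n^3 tile grid could look
// roughly like
//
//   template <typename Device>
//   class UniformWorkloadSetter : public WorkloadSetter<Device>
//   {
//       using memory_space = typename Device::memory_space;
//       using execution_space = typename Device::execution_space;
//       int _n; // tiles per dimension
//
//     public:
//       UniformWorkloadSetter( int n ) : _n( n ) {}
//
//       void run( Kokkos::View<int***, memory_space>& workload ) override
//       {
//           auto wl = workload; // local copy for device capture
//           int n = _n;
//           Kokkos::parallel_for(
//               "uniform_workload",
//               Kokkos::RangePolicy<execution_space>( 0, n * n * n ),
//               KOKKOS_LAMBDA( int i ) {
//                   // the +1 offset mirrors the setters above, which leave
//                   // the first slice of the workload view untouched
//                   wl( i / ( n * n ) + 1, ( i / n ) % n + 1, i % n + 1 ) = 1;
//               } );
//           Kokkos::fence();
//       }
//   };
//
// and would then be handed to the partitioner as
//
//   UniformWorkloadSetter<Device> setter( num_tiles_per_dim );
//   partitioner.setLocalWorkload( &setter );
//   partitioner.optimizePartition( comm );
//
// All names in this sketch are placeholders, not part of the patch.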
(Current Version: Support 3D only) Workload diff --git a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp index 00498eab4..82248408f 100644 --- a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp +++ b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp @@ -27,6 +27,44 @@ namespace Cajita { + +template +class SparseMapWorkloadSetter : public WorkloadSetter +{ + using memory_space = typename Device::memory_space; + using execution_space = typename Device::execution_space; + + const SparseMapType& sparseMap; + MPI_Comm comm; + + public: + SparseMapWorkloadSetter( const SparseMapType& sparseMap, MPI_Comm comm ) + : sparseMap( sparseMap ) + , comm( comm ) + { + } + + void run( Kokkos::View& workload ) override + { + Kokkos::parallel_for( + "compute_local_workload_sparsmap", + Kokkos::RangePolicy( 0, sparseMap.capacity() ), + KOKKOS_LAMBDA( uint32_t i ) { + if ( sparseMap.valid_at( i ) ) + { + auto key = sparseMap.key_at( i ); + int ti, tj, tk; + sparseMap.key2ijk( key, ti, tj, tk ); + Kokkos::atomic_increment( + &workload( ti + 1, tj + 1, tk + 1 ) ); + } + } ); + Kokkos::fence(); + // Wait for other ranks' workload to be ready + MPI_Barrier( comm ); + } +}; + //---------------------------------------------------------------------------// /*! Dynamic mesh block partitioner. (Current Version: Support 3D only) Workload From 57a04bf4c77986e003b0daf9f2f11b61fcaeb2c6 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Mon, 11 Jul 2022 23:29:57 -0700 Subject: [PATCH 20/42] Use WorkloadSetter --- ..._ParticleDynamicPartitionerPerformance.cpp | 12 +-- ...SparseMapDynamicPartitionerPerformance.cpp | 10 +-- cajita/src/Cajita_DynamicPartitioner.hpp | 3 +- .../src/Cajita_ParticleDynamicPartitioner.hpp | 83 ++++--------------- .../Cajita_SparseMapDynamicPartitioner.hpp | 63 +++----------- .../tstParticleDynamicPartitioner.hpp | 12 +-- .../tstSparseMapDynamicPartitioner.hpp | 14 ++-- 7 files changed, 57 insertions(+), 140 deletions(-) diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp index 641e81860..b008e1804 100644 --- a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp @@ -117,10 +117,9 @@ void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, int num_tiles_per_dim = num_cells_per_dim[c] >> cell_bits_per_tile_dim; // set up partitioner - Cajita::ParticleDynamicPartitioner - partitioner( comm, max_workload_coeff, max_par_num, - num_step_rebalance, global_num_cell, - max_optimize_iteration ); + Cajita::DynamicPartitioner partitioner( + comm, max_workload_coeff, max_par_num, num_step_rebalance, + global_num_cell, max_optimize_iteration ); auto ranks_per_dim = partitioner.ranksPerDimension( comm, global_num_cell ); auto ave_partition = @@ -165,9 +164,12 @@ void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, // compute local workload local_workload_timer.start( p ); - partitioner.setLocalWorkload( + auto pws = createParticleWorkloadSetter< + partitioner.cell_num_per_tile_dim, + partitioner.num_space_dim, TEST_DEVICE>( pos_view, par_num, global_low_corner, 1.0f / num_cells_per_dim[c], comm ); + partitioner.setLocalWorkload( &pws ); local_workload_timer.stop( p ); // compute prefix sum matrix diff --git a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp 
b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp index 06a8a08e0..2e232248c 100644 --- a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp @@ -127,10 +127,9 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, // set up partitioner auto total_num = num_tiles_per_dim * num_tiles_per_dim * num_tiles_per_dim; - Cajita::SparseMapDynamicPartitioner - partitioner( comm, max_workload_coeff, total_num, - num_step_rebalance, global_num_cell, - max_optimize_iteration ); + Cajita::DynamicPartitioner partitioner( + comm, max_workload_coeff, total_num, num_step_rebalance, + global_num_cell, max_optimize_iteration ); auto ranks_per_dim = partitioner.ranksPerDimension( comm, global_num_cell ); auto ave_partition = @@ -181,7 +180,8 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, // compute local workload local_workload_timer.start( frac ); - partitioner.setLocalWorkload( sis, comm ); + auto smws = createSparseMapWorkloadSetter( sis, comm ); + partitioner.setLocalWorkload( &smws ); local_workload_timer.stop( frac ); // compute prefix sum matrix diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index c6a439b38..50d5941a9 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -32,6 +32,7 @@ template class WorkloadSetter { using memory_space = typename Device::memory_space; + public: virtual void run( Kokkos::View& ) = 0; }; @@ -435,7 +436,7 @@ class DynamicPartitioner : public BlockPartitioner void setLocalWorkload( WorkloadSetter* setter ) { resetWorkload(); - setter.run( _workload_per_tile ); + setter->run( _workload_per_tile ); } /*! diff --git a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp index f9151c868..68caf2c99 100644 --- a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -85,74 +85,27 @@ class ParticleWorkloadSetter : public WorkloadSetter } }; -//---------------------------------------------------------------------------// /*! - Dynamic mesh block partitioner. (Current Version: Support 3D only) Workload - are computed from particle distribution. - - \tparam Device Kokkos device type. - \tparam CellPerTileDim Cells per tile per dimension. - \tparam NumSpaceDim Dimemsion (The current version support 3D only) + \brief compute the workload in the current MPI rank from particle + positions (each particle count for 1 workload value). This function must + be called before running optimizePartition() \param view particle + positions view \param particle_num total particle number \param + global_lower_corner the coordinate of the domain global lower corner + \param dx cell dx size + \param comm MPI communicator used for workload reduction */ -template -class ParticleDynamicPartitioner - : public DynamicPartitioner +template +ParticleWorkloadSetter +createParticleWorkloadSetter( const ParticlePosViewType& view, int particle_num, + const ArrayType& global_lower_corner, + const CellUnit dx, MPI_Comm comm ) { - using base = DynamicPartitioner; - using base::base; - - protected: - using base::_workload_per_tile; - - public: - using base::cell_bits_per_tile_dim; - using base::num_space_dim; - using typename base::execution_space; - - /*! 
- \brief compute the workload in the current MPI rank from particle - positions (each particle count for 1 workload value). This function must - be called before running optimizePartition() \param view particle - positions view \param particle_num total particle number \param - global_lower_corner the coordinate of the domain global lower corner - \param dx cell dx size - \param comm MPI communicator used for workload reduction - */ - template - void setLocalWorkload( const ParticlePosViewType& view, int particle_num, - const ArrayType& global_lower_corner, - const CellUnit dx, MPI_Comm comm ) - { - base::resetWorkload(); - // make a local copy - auto workload = _workload_per_tile; - Kokkos::Array lower_corner; - for ( std::size_t d = 0; d < num_space_dim; ++d ) - { - lower_corner[d] = global_lower_corner[d]; - } - - Kokkos::parallel_for( - "compute_local_workload_parpos", - Kokkos::RangePolicy( 0, particle_num ), - KOKKOS_LAMBDA( const int i ) { - int ti = static_cast( - ( view( i, 0 ) - lower_corner[0] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - int tj = static_cast( - ( view( i, 1 ) - lower_corner[1] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - int tz = static_cast( - ( view( i, 2 ) - lower_corner[2] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tz + 1 ) ); - } ); - Kokkos::fence(); - // Wait for other ranks' workload to be ready - MPI_Barrier( comm ); - } -}; + return ParticleWorkloadSetter( + view, particle_num, global_lower_corner, dx, comm ); +} } // end namespace Cajita diff --git a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp index 82248408f..4f3a41541 100644 --- a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp +++ b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp @@ -65,62 +65,19 @@ class SparseMapWorkloadSetter : public WorkloadSetter } }; -//---------------------------------------------------------------------------// /*! - Dynamic mesh block partitioner. (Current Version: Support 3D only) Workload - are computed from sparse map occupancy. - - \tparam Device Kokkos device type. - \tparam CellPerTileDim Cells per tile per dimension. - \tparam NumSpaceDim Dimemsion (The current version support 3D only) + \brief compute the workload in the current MPI rank from sparseMap + (the workload of a tile is 1 if the tile is occupied, 0 otherwise). This + function must be called before running optimizePartition() \param + sparseMap sparseMap in the current rank \param comm MPI communicator used + for workload reduction */ -template -class SparseMapDynamicPartitioner - : public DynamicPartitioner +template +SparseMapWorkloadSetter +createSparseMapWorkloadSetter( const SparseMapType& sparseMap, MPI_Comm comm ) { - using base = DynamicPartitioner; - using base::base; - - protected: - using base::_workload_per_tile; - - public: - using base::cell_bits_per_tile_dim; - using base::num_space_dim; - using typename base::execution_space; - - /*! - \brief compute the workload in the current MPI rank from sparseMap - (the workload of a tile is 1 if the tile is occupied, 0 otherwise). 
This - function must be called before running optimizePartition() \param - sparseMap sparseMap in the current rank \param comm MPI communicator used - for workload reduction - */ - template - void setLocalWorkload( const SparseMapType& sparseMap, MPI_Comm comm ) - { - base::resetWorkload(); - // make a local copy - auto workload = _workload_per_tile; - Kokkos::parallel_for( - "compute_local_workload_sparsmap", - Kokkos::RangePolicy( 0, sparseMap.capacity() ), - KOKKOS_LAMBDA( uint32_t i ) { - if ( sparseMap.valid_at( i ) ) - { - auto key = sparseMap.key_at( i ); - int ti, tj, tk; - sparseMap.key2ijk( key, ti, tj, tk ); - Kokkos::atomic_increment( - &workload( ti + 1, tj + 1, tk + 1 ) ); - } - } ); - Kokkos::fence(); - // Wait for other ranks' workload to be ready - MPI_Barrier( comm ); - } -}; + return SparseMapWorkloadSetter( sparseMap, comm ); +} } // end namespace Cajita diff --git a/cajita/unit_test/tstParticleDynamicPartitioner.hpp b/cajita/unit_test/tstParticleDynamicPartitioner.hpp index 8429b3380..54f143d8c 100644 --- a/cajita/unit_test/tstParticleDynamicPartitioner.hpp +++ b/cajita/unit_test/tstParticleDynamicPartitioner.hpp @@ -111,7 +111,7 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) size_per_dim }; // partitioner - ParticleDynamicPartitioner partitioner( + DynamicPartitioner partitioner( MPI_COMM_WORLD, max_workload_coeff, particle_num, num_step_rebalance, global_cells_per_dim, max_optimize_iteration ); @@ -208,10 +208,12 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) gt_partition, cart_rank, occupy_num_per_rank, global_low_corner, cell_size, cell_per_tile_dim ); // compute workload from a particle view and do partition optimization - dynamic_cast*>( - &partitioner ) - ->setLocalWorkload( particle_view, occupy_num_per_rank, - global_low_corner, cell_size, MPI_COMM_WORLD ); + auto pws = + createParticleWorkloadSetter( + particle_view, occupy_num_per_rank, global_low_corner, cell_size, + MPI_COMM_WORLD ); + partitioner.setLocalWorkload( &pws ); partitioner.optimizePartition( MPI_COMM_WORLD ); // check results (should be the same as the gt_partition) diff --git a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp index 5154c4a4a..c61ff984a 100644 --- a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp +++ b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp @@ -47,7 +47,7 @@ void uniform_distribution_automatic_rank() size_tile_per_dim * cell_per_tile_dim }; // partitioner - SparseMapDynamicPartitioner partitioner( + DynamicPartitioner partitioner( MPI_COMM_WORLD, max_workload_coeff, workload_num, num_step_rebalance, global_cells_per_dim, max_optimize_iteration ); @@ -147,7 +147,9 @@ void uniform_distribution_automatic_rank() Kokkos::fence(); // compute workload and do partition optimization - partitioner.setLocalWorkload( sis, MPI_COMM_WORLD ); + auto smws = + createSparseMapWorkloadSetter( sis, MPI_COMM_WORLD ); + partitioner.setLocalWorkload( &smws ); partitioner.optimizePartition( MPI_COMM_WORLD ); // check results (should be the same as the average partition) @@ -247,7 +249,7 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) size_per_dim }; // partitioner - SparseMapDynamicPartitioner partitioner( + DynamicPartitioner partitioner( MPI_COMM_WORLD, max_workload_coeff, particle_num, num_step_rebalance, global_cells_per_dim, max_optimize_iteration ); @@ -362,9 +364,9 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) Kokkos::fence(); // 
compute workload from a sparseMap and do partition optimization - dynamic_cast*>( - &partitioner ) - ->setLocalWorkload( sis, MPI_COMM_WORLD ); + auto smws = + createSparseMapWorkloadSetter( sis, MPI_COMM_WORLD ); + partitioner.setLocalWorkload( &smws ); partitioner.optimizePartition( MPI_COMM_WORLD ); // check results (should be the same as the gt_partition) From a0d9c557320b39692f6749cd9c7b1cf8cb04f94b Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Tue, 12 Jul 2022 04:06:54 -0700 Subject: [PATCH 21/42] Fix benchmark compile --- .../cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp | 4 ++-- .../cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp index b008e1804..8147fc375 100644 --- a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp @@ -164,9 +164,9 @@ void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, // compute local workload local_workload_timer.start( p ); - auto pws = createParticleWorkloadSetter< + auto pws = Cajita::createParticleWorkloadSetter< partitioner.cell_num_per_tile_dim, - partitioner.num_space_dim, TEST_DEVICE>( + partitioner.num_space_dim, Device>( pos_view, par_num, global_low_corner, 1.0f / num_cells_per_dim[c], comm ); partitioner.setLocalWorkload( &pws ); diff --git a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp index 2e232248c..8174f143f 100644 --- a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp @@ -180,7 +180,7 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, // compute local workload local_workload_timer.start( frac ); - auto smws = createSparseMapWorkloadSetter( sis, comm ); + auto smws = Cajita::createSparseMapWorkloadSetter( sis, comm ); partitioner.setLocalWorkload( &smws ); local_workload_timer.stop( frac ); From b266f218ece1d3a9381578280340f4bdd8d6d268 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Tue, 12 Jul 2022 04:08:05 -0700 Subject: [PATCH 22/42] Format --- .../cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp index 8174f143f..eba703009 100644 --- a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp @@ -180,7 +180,8 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, // compute local workload local_workload_timer.start( frac ); - auto smws = Cajita::createSparseMapWorkloadSetter( sis, comm ); + auto smws = + Cajita::createSparseMapWorkloadSetter( sis, comm ); partitioner.setLocalWorkload( &smws ); local_workload_timer.stop( frac ); From cc3878827c413d630bbbd6f70aa9f9b86c718d2a Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Tue, 12 Jul 2022 04:32:46 -0700 Subject: [PATCH 23/42] Add comments --- .../src/Cajita_ParticleDynamicPartitioner.hpp | 31 +++++++++++++------ .../Cajita_SparseMapDynamicPartitioner.hpp | 21 ++++++++----- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git 
a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp index 68caf2c99..85bda0ab8 100644 --- a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -28,6 +28,16 @@ namespace Cajita { +//---------------------------------------------------------------------------// +/*! + \brief Helper class to set workload for DynamicPartitioner with particles. + \tparam Particles' position view type (Kokkos::View) + \tparam Global grid bottom left corner type + \tparam Global grid unit cell size type + \tparam Partitioner's cell number per tile dim + \tparam Partitioner's space dim number + \tparam Partitioner's device type +*/ template class ParticleWorkloadSetter : public WorkloadSetter @@ -45,6 +55,14 @@ class ParticleWorkloadSetter : public WorkloadSetter MPI_Comm comm; public: + /*! + \brief Constructor. + \param view Position of particles used in workload computation. + \param particle_num The number of particles used in workload computation. + \param global_lower_corner The bottom-left corner of global grid. + \param dx The global grid resolution. + \param comm MPI communicator to use for computing workload. + */ ParticleWorkloadSetter( const ParticlePosViewType& view, int particle_num, const ArrayType& global_lower_corner, const CellUnit dx, MPI_Comm comm ) @@ -56,6 +74,7 @@ class ParticleWorkloadSetter : public WorkloadSetter { } + //! \brief Called by DynamicPartitioner to compute workload void run( Kokkos::View& workload ) override { Kokkos::Array lower_corner; @@ -85,15 +104,9 @@ class ParticleWorkloadSetter : public WorkloadSetter } }; -/*! - \brief compute the workload in the current MPI rank from particle - positions (each particle count for 1 workload value). This function must - be called before running optimizePartition() \param view particle - positions view \param particle_num total particle number \param - global_lower_corner the coordinate of the domain global lower corner - \param dx cell dx size - \param comm MPI communicator used for workload reduction -*/ +//---------------------------------------------------------------------------// +//! Creation function for ParticleWorkloadSetter from Kokkos::View template ParticleWorkloadSetter class SparseMapWorkloadSetter : public WorkloadSetter { @@ -38,12 +44,18 @@ class SparseMapWorkloadSetter : public WorkloadSetter MPI_Comm comm; public: + /*! + \brief Constructor. + \param sparseMap Sparse map used in workload computation. + \param comm MPI communicator to use for computing workload. + */ SparseMapWorkloadSetter( const SparseMapType& sparseMap, MPI_Comm comm ) : sparseMap( sparseMap ) , comm( comm ) { } + //! \brief Called by DynamicPartitioner to compute workload void run( Kokkos::View& workload ) override { Kokkos::parallel_for( @@ -65,13 +77,8 @@ class SparseMapWorkloadSetter : public WorkloadSetter } }; -/*! - \brief compute the workload in the current MPI rank from sparseMap - (the workload of a tile is 1 if the tile is occupied, 0 otherwise). This - function must be called before running optimizePartition() \param - sparseMap sparseMap in the current rank \param comm MPI communicator used - for workload reduction -*/ +//---------------------------------------------------------------------------// +//! 
Creation function for SparseMapWorkloadSetter from SparseMap template SparseMapWorkloadSetter createSparseMapWorkloadSetter( const SparseMapType& sparseMap, MPI_Comm comm ) From e19ad620f4c4dc364d3829245c4be16725dd21a3 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Thu, 21 Jul 2022 00:48:19 -0700 Subject: [PATCH 24/42] Rename optimizePartitionAlongDim into updatePartition --- .../cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp | 2 +- .../cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp | 2 +- cajita/src/Cajita_DynamicPartitioner.hpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp index 8147fc375..bb4657574 100644 --- a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp @@ -183,7 +183,7 @@ void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, total_optimize_timer.start( p ); for ( int i = 0; i < max_optimize_iteration; ++i ) { - partitioner.optimizePartitionAlongDim( std::rand() % 3, + partitioner.updatePartition( std::rand() % 3, is_changed ); if ( !is_changed ) break; diff --git a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp index eba703009..ec7f03743 100644 --- a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp @@ -196,7 +196,7 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, total_optimize_timer.start( frac ); for ( int i = 0; i < max_optimize_iteration; ++i ) { - partitioner.optimizePartitionAlongDim( std::rand() % 3, + partitioner.updatePartition( std::rand() % 3, is_changed ); if ( !is_changed ) break; diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index 50d5941a9..b9a11edd1 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -462,7 +462,7 @@ class DynamicPartitioner : public BlockPartitioner random_dim_id = std::rand() % num_space_dim; bool is_dim_changed = false; // record changes in current dim - optimizePartitionAlongDim( random_dim_id, is_dim_changed ); + updatePartition( random_dim_id, is_dim_changed ); // update control info is_changed = is_changed || is_dim_changed; @@ -481,7 +481,7 @@ class DynamicPartitioner : public BlockPartitioner optimization \param is_changed label if the partition is changed after the optimization */ - void optimizePartitionAlongDim( int iter_seed, bool& is_changed ) + void updatePartition( int iter_seed, bool& is_changed ) { is_changed = false; // loop over three dimensions, optimize the partition in dimension di From 7d1125352692eb4e2157ff64fd9f1d4cc3da903c Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Thu, 21 Jul 2022 00:51:09 -0700 Subject: [PATCH 25/42] Fix LAMBDA --- ..._ParticleDynamicPartitionerPerformance.cpp | 3 +-- ...SparseMapDynamicPartitionerPerformance.cpp | 3 +-- .../src/Cajita_ParticleDynamicPartitioner.hpp | 24 ++++++++++++------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp index bb4657574..7a13669ce 100644 --- a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp +++ 
b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp @@ -183,8 +183,7 @@ void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, total_optimize_timer.start( p ); for ( int i = 0; i < max_optimize_iteration; ++i ) { - partitioner.updatePartition( std::rand() % 3, - is_changed ); + partitioner.updatePartition( std::rand() % 3, is_changed ); if ( !is_changed ) break; } diff --git a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp index ec7f03743..51d04f8ce 100644 --- a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp @@ -196,8 +196,7 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, total_optimize_timer.start( frac ); for ( int i = 0; i < max_optimize_iteration; ++i ) { - partitioner.updatePartition( std::rand() % 3, - is_changed ); + partitioner.updatePartition( std::rand() % 3, is_changed ); if ( !is_changed ) break; } diff --git a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp index 85bda0ab8..a8ff2a70e 100644 --- a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -83,19 +83,25 @@ class ParticleWorkloadSetter : public WorkloadSetter lower_corner[d] = global_lower_corner[d]; } + CellUnit dx_proxy = dx; + unsigned long long cell_bits_per_tile_dim_proxy = + cell_bits_per_tile_dim; Kokkos::parallel_for( "compute_local_workload_parpos", Kokkos::RangePolicy( 0, particle_num ), KOKKOS_LAMBDA( const int i ) { - int ti = static_cast( - ( view( i, 0 ) - lower_corner[0] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - int tj = static_cast( - ( view( i, 1 ) - lower_corner[1] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - int tz = static_cast( - ( view( i, 2 ) - lower_corner[2] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; + int ti = + static_cast( + ( view( i, 0 ) - lower_corner[0] ) / dx_proxy - 0.5 ) >> + cell_bits_per_tile_dim_proxy; + int tj = + static_cast( + ( view( i, 1 ) - lower_corner[1] ) / dx_proxy - 0.5 ) >> + cell_bits_per_tile_dim_proxy; + int tz = + static_cast( + ( view( i, 2 ) - lower_corner[2] ) / dx_proxy - 0.5 ) >> + cell_bits_per_tile_dim_proxy; Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tz + 1 ) ); } ); Kokkos::fence(); From 6ad716fd937589a4019d2faee1cd5ea9273c89a3 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Thu, 21 Jul 2022 00:52:23 -0700 Subject: [PATCH 26/42] Rename run into compute --- cajita/src/Cajita_DynamicPartitioner.hpp | 4 ++-- cajita/src/Cajita_ParticleDynamicPartitioner.hpp | 2 +- cajita/src/Cajita_SparseMapDynamicPartitioner.hpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index b9a11edd1..9789225cf 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -34,7 +34,7 @@ class WorkloadSetter using memory_space = typename Device::memory_space; public: - virtual void run( Kokkos::View& ) = 0; + virtual void compute( Kokkos::View& ) = 0; }; //---------------------------------------------------------------------------// @@ -436,7 +436,7 @@ class DynamicPartitioner : public BlockPartitioner void setLocalWorkload( WorkloadSetter* setter ) { resetWorkload(); - setter->run( _workload_per_tile ); + setter->compute( _workload_per_tile ); } /*! 
diff --git a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp index a8ff2a70e..be6c60624 100644 --- a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -75,7 +75,7 @@ class ParticleWorkloadSetter : public WorkloadSetter } //! \brief Called by DynamicPartitioner to compute workload - void run( Kokkos::View& workload ) override + void compute( Kokkos::View& workload ) override { Kokkos::Array lower_corner; for ( std::size_t d = 0; d < num_space_dim; ++d ) diff --git a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp index 62e87108f..8ed83ee37 100644 --- a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp +++ b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp @@ -56,7 +56,7 @@ class SparseMapWorkloadSetter : public WorkloadSetter } //! \brief Called by DynamicPartitioner to compute workload - void run( Kokkos::View& workload ) override + void compute( Kokkos::View& workload ) override { Kokkos::parallel_for( "compute_local_workload_sparsmap", From 2109f2f50eb73cbd6ed741dc33e3fed54b63aa8b Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Thu, 21 Jul 2022 00:59:38 -0700 Subject: [PATCH 27/42] Rename setter into measurer --- ..._ParticleDynamicPartitionerPerformance.cpp | 11 +++---- ...SparseMapDynamicPartitionerPerformance.cpp | 3 +- cajita/src/Cajita_DynamicPartitioner.hpp | 13 ++++++-- .../src/Cajita_ParticleDynamicPartitioner.hpp | 30 ++++++++++--------- .../Cajita_SparseMapDynamicPartitioner.hpp | 17 +++++++---- .../tstParticleDynamicPartitioner.hpp | 9 +++--- .../tstSparseMapDynamicPartitioner.hpp | 8 ++--- 7 files changed, 53 insertions(+), 38 deletions(-) diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp index 7a13669ce..9f23bec20 100644 --- a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp @@ -164,11 +164,12 @@ void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, // compute local workload local_workload_timer.start( p ); - auto pws = Cajita::createParticleWorkloadSetter< - partitioner.cell_num_per_tile_dim, - partitioner.num_space_dim, Device>( - pos_view, par_num, global_low_corner, - 1.0f / num_cells_per_dim[c], comm ); + auto pws = + Cajita::createParticleDynamicPartitionerWorkloadMeasurer< + partitioner.cell_num_per_tile_dim, + partitioner.num_space_dim, Device>( + pos_view, par_num, global_low_corner, + 1.0f / num_cells_per_dim[c], comm ); partitioner.setLocalWorkload( &pws ); local_workload_timer.stop( p ); diff --git a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp index 51d04f8ce..d08390a72 100644 --- a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp @@ -181,7 +181,8 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, // compute local workload local_workload_timer.start( frac ); auto smws = - Cajita::createSparseMapWorkloadSetter( sis, comm ); + Cajita::createSparseMapDynamicPartitionerWorkloadMeasurer< + Device>( sis, comm ); partitioner.setLocalWorkload( &smws ); local_workload_timer.stop( frac ); diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index 
9789225cf..f1368bded 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -29,7 +29,7 @@ namespace Cajita { template -class WorkloadSetter +class DynamicPartitionerWorkloadMeasurer { using memory_space = typename Device::memory_space; @@ -433,10 +433,17 @@ class DynamicPartitioner : public BlockPartitioner Kokkos::fence(); } - void setLocalWorkload( WorkloadSetter* setter ) + /*! + \brief compute workload in each MPI rank + \param measurer measurer defined by user to compute workload. + DynamicPartitionerWorkloadMeasurer is the base class and the user + should define their own measurer with compute() implemented. + */ + void + setLocalWorkload( DynamicPartitionerWorkloadMeasurer* measurer ) { resetWorkload(); - setter->compute( _workload_per_tile ); + measurer->compute( _workload_per_tile ); } /*! diff --git a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp index be6c60624..d645e86e8 100644 --- a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -40,7 +40,8 @@ namespace Cajita */ template -class ParticleWorkloadSetter : public WorkloadSetter +class ParticleDynamicPartitionerWorkloadMeasurer + : public DynamicPartitionerWorkloadMeasurer { using memory_space = typename Device::memory_space; using execution_space = typename Device::execution_space; @@ -63,9 +64,9 @@ class ParticleWorkloadSetter : public WorkloadSetter \param dx The global grid resolution. \param comm MPI communicator to use for computing workload. */ - ParticleWorkloadSetter( const ParticlePosViewType& view, int particle_num, - const ArrayType& global_lower_corner, - const CellUnit dx, MPI_Comm comm ) + ParticleDynamicPartitionerWorkloadMeasurer( + const ParticlePosViewType& view, int particle_num, + const ArrayType& global_lower_corner, const CellUnit dx, MPI_Comm comm ) : view( view ) , particle_num( particle_num ) , global_lower_corner( global_lower_corner ) @@ -111,19 +112,20 @@ class ParticleWorkloadSetter : public WorkloadSetter }; //---------------------------------------------------------------------------// -//! Creation function for ParticleWorkloadSetter from Kokkos::View +//! Creation function for ParticleDynamicPartitionerWorkloadMeasurer from +//! 
Kokkos::View template -ParticleWorkloadSetter -createParticleWorkloadSetter( const ParticlePosViewType& view, int particle_num, - const ArrayType& global_lower_corner, - const CellUnit dx, MPI_Comm comm ) +ParticleDynamicPartitionerWorkloadMeasurer +createParticleDynamicPartitionerWorkloadMeasurer( + const ParticlePosViewType& view, int particle_num, + const ArrayType& global_lower_corner, const CellUnit dx, MPI_Comm comm ) { - return ParticleWorkloadSetter( - view, particle_num, global_lower_corner, dx, comm ); + return ParticleDynamicPartitionerWorkloadMeasurer< + ParticlePosViewType, ArrayType, CellUnit, CellPerTileDim, num_space_dim, + Device>( view, particle_num, global_lower_corner, dx, comm ); } } // end namespace Cajita diff --git a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp index 8ed83ee37..27dbf037a 100644 --- a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp +++ b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp @@ -35,7 +35,8 @@ namespace Cajita \tparam Partitioner's device type */ template -class SparseMapWorkloadSetter : public WorkloadSetter +class SparseMapDynamicPartitionerWorkloadMeasurer + : public DynamicPartitionerWorkloadMeasurer { using memory_space = typename Device::memory_space; using execution_space = typename Device::execution_space; @@ -49,7 +50,8 @@ class SparseMapWorkloadSetter : public WorkloadSetter \param sparseMap Sparse map used in workload computation. \param comm MPI communicator to use for computing workload. */ - SparseMapWorkloadSetter( const SparseMapType& sparseMap, MPI_Comm comm ) + SparseMapDynamicPartitionerWorkloadMeasurer( const SparseMapType& sparseMap, + MPI_Comm comm ) : sparseMap( sparseMap ) , comm( comm ) { @@ -78,12 +80,15 @@ class SparseMapWorkloadSetter : public WorkloadSetter }; //---------------------------------------------------------------------------// -//! Creation function for SparseMapWorkloadSetter from SparseMap +//! Creation function for SparseMapDynamicPartitionerWorkloadMeasurer from +//! 
SparseMap template -SparseMapWorkloadSetter -createSparseMapWorkloadSetter( const SparseMapType& sparseMap, MPI_Comm comm ) +SparseMapDynamicPartitionerWorkloadMeasurer +createSparseMapDynamicPartitionerWorkloadMeasurer( + const SparseMapType& sparseMap, MPI_Comm comm ) { - return SparseMapWorkloadSetter( sparseMap, comm ); + return SparseMapDynamicPartitionerWorkloadMeasurer( + sparseMap, comm ); } } // end namespace Cajita diff --git a/cajita/unit_test/tstParticleDynamicPartitioner.hpp b/cajita/unit_test/tstParticleDynamicPartitioner.hpp index 54f143d8c..098aa2323 100644 --- a/cajita/unit_test/tstParticleDynamicPartitioner.hpp +++ b/cajita/unit_test/tstParticleDynamicPartitioner.hpp @@ -208,11 +208,10 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) gt_partition, cart_rank, occupy_num_per_rank, global_low_corner, cell_size, cell_per_tile_dim ); // compute workload from a particle view and do partition optimization - auto pws = - createParticleWorkloadSetter( - particle_view, occupy_num_per_rank, global_low_corner, cell_size, - MPI_COMM_WORLD ); + auto pws = createParticleDynamicPartitionerWorkloadMeasurer< + partitioner.cell_num_per_tile_dim, partitioner.num_space_dim, + TEST_DEVICE>( particle_view, occupy_num_per_rank, global_low_corner, + cell_size, MPI_COMM_WORLD ); partitioner.setLocalWorkload( &pws ); partitioner.optimizePartition( MPI_COMM_WORLD ); diff --git a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp index c61ff984a..e3ee0ed12 100644 --- a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp +++ b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp @@ -147,8 +147,8 @@ void uniform_distribution_automatic_rank() Kokkos::fence(); // compute workload and do partition optimization - auto smws = - createSparseMapWorkloadSetter( sis, MPI_COMM_WORLD ); + auto smws = createSparseMapDynamicPartitionerWorkloadMeasurer( + sis, MPI_COMM_WORLD ); partitioner.setLocalWorkload( &smws ); partitioner.optimizePartition( MPI_COMM_WORLD ); @@ -364,8 +364,8 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) Kokkos::fence(); // compute workload from a sparseMap and do partition optimization - auto smws = - createSparseMapWorkloadSetter( sis, MPI_COMM_WORLD ); + auto smws = createSparseMapDynamicPartitionerWorkloadMeasurer( + sis, MPI_COMM_WORLD ); partitioner.setLocalWorkload( &smws ); partitioner.optimizePartition( MPI_COMM_WORLD ); From 8576b8dc56ca1852e1d3d13fd9147596c85c618d Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Thu, 21 Jul 2022 01:01:18 -0700 Subject: [PATCH 28/42] Format --- cajita/src/Cajita_DynamicPartitioner.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index f1368bded..931d9b1d5 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -437,7 +437,7 @@ class DynamicPartitioner : public BlockPartitioner \brief compute workload in each MPI rank \param measurer measurer defined by user to compute workload. DynamicPartitionerWorkloadMeasurer is the base class and the user - should define their own measurer with compute() implemented. + should define derived measurer with compute() implemented. 
*/ void setLocalWorkload( DynamicPartitionerWorkloadMeasurer* measurer ) From d39e1fac90c818c757f635391793b3c9317f2ec4 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Thu, 21 Jul 2022 01:02:29 -0700 Subject: [PATCH 29/42] Detailed comment --- cajita/src/Cajita_DynamicPartitioner.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index 931d9b1d5..566506fc6 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -437,7 +437,7 @@ class DynamicPartitioner : public BlockPartitioner \brief compute workload in each MPI rank \param measurer measurer defined by user to compute workload. DynamicPartitionerWorkloadMeasurer is the base class and the user - should define derived measurer with compute() implemented. + should define a derived measurer class with compute() implemented. */ void setLocalWorkload( DynamicPartitionerWorkloadMeasurer* measurer ) From 2d96eaa04145659ef0082ccbf002ebfd300e1224 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Thu, 21 Jul 2022 10:24:35 -0700 Subject: [PATCH 30/42] Clean up LAMBDA --- cajita/src/Cajita_SparseMapDynamicPartitioner.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp index 27dbf037a..ef1dd8642 100644 --- a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp +++ b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp @@ -60,15 +60,16 @@ class SparseMapDynamicPartitionerWorkloadMeasurer //! \brief Called by DynamicPartitioner to compute workload void compute( Kokkos::View& workload ) override { + const SparseMapType& sparseMap_proxy = sparseMap; Kokkos::parallel_for( "compute_local_workload_sparsmap", Kokkos::RangePolicy( 0, sparseMap.capacity() ), KOKKOS_LAMBDA( uint32_t i ) { - if ( sparseMap.valid_at( i ) ) + if ( sparseMap_proxy.valid_at( i ) ) { - auto key = sparseMap.key_at( i ); + auto key = sparseMap_proxy.key_at( i ); int ti, tj, tk; - sparseMap.key2ijk( key, ti, tj, tk ); + sparseMap_proxy.key2ijk( key, ti, tj, tk ); Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tk + 1 ) ); } From 78e3d785f27b043fb67113a317f1c5b14e2cf264 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Thu, 21 Jul 2022 10:33:45 -0700 Subject: [PATCH 31/42] Rebase master and fix compile --- cajita/unit_test/tstSparseLocalGrid.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cajita/unit_test/tstSparseLocalGrid.hpp b/cajita/unit_test/tstSparseLocalGrid.hpp index 97a1c393b..d45b4c5be 100644 --- a/cajita/unit_test/tstSparseLocalGrid.hpp +++ b/cajita/unit_test/tstSparseLocalGrid.hpp @@ -10,7 +10,7 @@ ****************************************************************************/ #include #include -#include +#include #include #include @@ -46,7 +46,7 @@ void sparseLocalGridTest( EntityType t2 ) // Create and initialize sparse partitioner std::array periodic = { false, false, false }; - SparseDimPartitioner partitioner( + DynamicPartitioner partitioner( MPI_COMM_WORLD, 1.5, 16 * 32, 100, global_num_cell, 10 ); auto ranks_per_dim = partitioner.ranksPerDimension( MPI_COMM_WORLD, global_num_cell ); From 17359393f10a8d22049f2f58f44ce479fdc29f63 Mon Sep 17 00:00:00 2001 From: squarefk Date: Mon, 25 Jul 2022 23:18:13 -0700 Subject: [PATCH 32/42] Update cajita/src/Cajita_ParticleDynamicPartitioner.hpp Co-authored-by: Sam Reeve <6740307+streeve@users.noreply.github.com> --- 
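For context on the change below, which continues the pattern established by the earlier "Fix LAMBDA" patch: a KOKKOS_LAMBDA inside a member function must not read class data members directly, because that would capture the host-side `this` pointer onto the device. Copying the members into local variables first lets the lambda capture plain values. The fragment below is only an illustrative sketch of that idiom, not code from any patch; `Scaler`, `_dx`, and `apply` are placeholder names.

    #include <Kokkos_Core.hpp>

    template <typename Device>
    struct Scaler
    {
        double _dx;

        // Scale every entry of v by the member _dx without capturing `this`.
        void apply( Kokkos::View<double*, typename Device::memory_space> v ) const
        {
            auto dx_copy = _dx; // local copy; the lambda captures this value
            Kokkos::parallel_for(
                "scale_by_dx",
                Kokkos::RangePolicy<typename Device::execution_space>(
                    0, v.extent( 0 ) ),
                KOKKOS_LAMBDA( const int i ) { v( i ) *= dx_copy; } );
            Kokkos::fence();
        }
    };
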
cajita/src/Cajita_ParticleDynamicPartitioner.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp index d645e86e8..618af1d5c 100644 --- a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -84,9 +84,8 @@ class ParticleDynamicPartitionerWorkloadMeasurer lower_corner[d] = global_lower_corner[d]; } - CellUnit dx_proxy = dx; - unsigned long long cell_bits_per_tile_dim_proxy = - cell_bits_per_tile_dim; + auto dx_copy = dx; + auto cell_bits_per_tile_dim_copy = cell_bits_per_tile_dim; Kokkos::parallel_for( "compute_local_workload_parpos", Kokkos::RangePolicy( 0, particle_num ), From ade851caca16b34e3092dfee4af9ec9b2096505a Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Mon, 25 Jul 2022 23:21:13 -0700 Subject: [PATCH 33/42] Clean up workload tags --- .../Cajita_ParticleDynamicPartitionerPerformance.cpp | 12 +++--------- ...Cajita_SparseMapDynamicPartitionerPerformance.cpp | 12 +++--------- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp index 9f23bec20..40798630a 100644 --- a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp @@ -28,12 +28,6 @@ #include -//---------------------------------------------------------------------------// -// Helper functions. -struct ParticleWorkloadTag -{ -}; - // generate average partitioner std::array, 3> computeAveragePartition( const int tile_per_dim, @@ -57,7 +51,7 @@ computeAveragePartition( const int tile_per_dim, //---------------------------------------------------------------------------// // Performance test. template -void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, +void performanceTest( std::ostream& stream, MPI_Comm comm, const std::string& test_prefix, std::vector problem_sizes, std::vector num_cells_per_dim ) @@ -285,11 +279,11 @@ int main( int argc, char* argv[] ) // Don't rerun on the CPU if already done or if turned off. if ( !std::is_same{} ) { - performanceTest( ParticleWorkloadTag(), file, + performanceTest( file, MPI_COMM_WORLD, "device_particleWL_", problem_sizes, num_cells_per_dim ); } - performanceTest( ParticleWorkloadTag(), file, + performanceTest( file, MPI_COMM_WORLD, "host_particleWL_", problem_sizes, num_cells_per_dim ); diff --git a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp index d08390a72..7240e6ba4 100644 --- a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp @@ -28,12 +28,6 @@ #include -//---------------------------------------------------------------------------// -// Helper functions. -struct SparseMapTag -{ -}; - // generate a random tile sequence int current = 0; int uniqueNumber() { return current++; } @@ -79,7 +73,7 @@ std::array, 3> computeAveragePartition( //---------------------------------------------------------------------------// // Performance test. 
template -void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, +void performanceTest( std::ostream& stream, MPI_Comm comm, const std::string& test_prefix, std::vector occupy_fraction, std::vector num_cells_per_dim ) @@ -299,11 +293,11 @@ int main( int argc, char* argv[] ) // Don't rerun on the CPU if already done or if turned off. if ( !std::is_same{} ) { - performanceTest( SparseMapTag(), file, MPI_COMM_WORLD, + performanceTest( file, MPI_COMM_WORLD, "device_sparsemapWL_", occupy_fraction, num_cells_per_dim ); } - performanceTest( SparseMapTag(), file, MPI_COMM_WORLD, + performanceTest( file, MPI_COMM_WORLD, "host_sparsemapWL_", occupy_fraction, num_cells_per_dim ); From c6c70badc395256767bee5012a4510244ca56ae5 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Mon, 25 Jul 2022 23:23:14 -0700 Subject: [PATCH 34/42] Rename _proxy into _copy --- cajita/src/Cajita_ParticleDynamicPartitioner.hpp | 12 ++++++------ cajita/src/Cajita_SparseMapDynamicPartitioner.hpp | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp index 618af1d5c..2fea96604 100644 --- a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -92,16 +92,16 @@ class ParticleDynamicPartitionerWorkloadMeasurer KOKKOS_LAMBDA( const int i ) { int ti = static_cast( - ( view( i, 0 ) - lower_corner[0] ) / dx_proxy - 0.5 ) >> - cell_bits_per_tile_dim_proxy; + ( view( i, 0 ) - lower_corner[0] ) / dx_copy - 0.5 ) >> + cell_bits_per_tile_dim_copy; int tj = static_cast( - ( view( i, 1 ) - lower_corner[1] ) / dx_proxy - 0.5 ) >> - cell_bits_per_tile_dim_proxy; + ( view( i, 1 ) - lower_corner[1] ) / dx_copy - 0.5 ) >> + cell_bits_per_tile_dim_copy; int tz = static_cast( - ( view( i, 2 ) - lower_corner[2] ) / dx_proxy - 0.5 ) >> - cell_bits_per_tile_dim_proxy; + ( view( i, 2 ) - lower_corner[2] ) / dx_copy - 0.5 ) >> + cell_bits_per_tile_dim_copy; Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tz + 1 ) ); } ); Kokkos::fence(); diff --git a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp index ef1dd8642..009be1e53 100644 --- a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp +++ b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp @@ -60,16 +60,16 @@ class SparseMapDynamicPartitionerWorkloadMeasurer //! 
\brief Called by DynamicPartitioner to compute workload void compute( Kokkos::View& workload ) override { - const SparseMapType& sparseMap_proxy = sparseMap; + const SparseMapType& sparse_map_copy = sparseMap; Kokkos::parallel_for( "compute_local_workload_sparsmap", Kokkos::RangePolicy( 0, sparseMap.capacity() ), KOKKOS_LAMBDA( uint32_t i ) { - if ( sparseMap_proxy.valid_at( i ) ) + if ( sparse_map_copy.valid_at( i ) ) { - auto key = sparseMap_proxy.key_at( i ); + auto key = sparse_map_copy.key_at( i ); int ti, tj, tk; - sparseMap_proxy.key2ijk( key, ti, tj, tk ); + sparse_map_copy.key2ijk( key, ti, tj, tk ); Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tk + 1 ) ); } From ec7b0b53b469d5c77244355d59a9b3026c40df06 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Mon, 25 Jul 2022 23:44:15 -0700 Subject: [PATCH 35/42] Clean up constructor --- ..._ParticleDynamicPartitionerPerformance.cpp | 9 +----- ...SparseMapDynamicPartitionerPerformance.cpp | 7 +---- cajita/src/Cajita_DynamicPartitioner.hpp | 30 +++---------------- cajita/unit_test/tstGlobalGrid.hpp | 7 +---- .../tstParticleDynamicPartitioner.hpp | 7 +---- cajita/unit_test/tstSparseLocalGrid.hpp | 2 +- .../tstSparseMapDynamicPartitioner.hpp | 14 ++------- 7 files changed, 11 insertions(+), 65 deletions(-) diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp index 40798630a..0ba99611a 100644 --- a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp @@ -82,13 +82,7 @@ void performanceTest( std::ostream& stream, MPI_Comm comm, int num_run = 10; // Basic settings for partitioenr - float max_workload_coeff = 1.5; int max_optimize_iteration = 10; - int num_step_rebalance = 100; - - // compute the max number of particles handled by the current MPI rank - int max_par_num = problem_sizes.back() / comm_size + - ( problem_sizes.back() % comm_size < comm_rank ? 1 : 0 ); // Create random sets of particle positions. 
using position_type = Kokkos::View; @@ -112,8 +106,7 @@ void performanceTest( std::ostream& stream, MPI_Comm comm, // set up partitioner Cajita::DynamicPartitioner partitioner( - comm, max_workload_coeff, max_par_num, num_step_rebalance, - global_num_cell, max_optimize_iteration ); + comm, global_num_cell, max_optimize_iteration ); auto ranks_per_dim = partitioner.ranksPerDimension( comm, global_num_cell ); auto ave_partition = diff --git a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp index 7240e6ba4..671993f6b 100644 --- a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp @@ -95,9 +95,7 @@ void performanceTest( std::ostream& stream, MPI_Comm comm, int num_run = 10; // Basic settings for partitioenr - float max_workload_coeff = 1.5; int max_optimize_iteration = 10; - int num_step_rebalance = 100; for ( int c = 0; c < num_cells_per_dim_size; ++c ) { @@ -119,11 +117,8 @@ void performanceTest( std::ostream& stream, MPI_Comm comm, typename Device::memory_space(), tiles_host ); // set up partitioner - auto total_num = - num_tiles_per_dim * num_tiles_per_dim * num_tiles_per_dim; Cajita::DynamicPartitioner partitioner( - comm, max_workload_coeff, total_num, num_step_rebalance, - global_num_cell, max_optimize_iteration ); + comm, global_num_cell, max_optimize_iteration ); auto ranks_per_dim = partitioner.ranksPerDimension( comm, global_num_cell ); auto ave_partition = diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index 566506fc6..8c2771cd7 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -89,23 +89,14 @@ class DynamicPartitioner : public BlockPartitioner \brief Constructor - automatically compute ranks_per_dim from MPI communicator \param comm MPI communicator to decide the rank nums in each dimension - \param max_workload_coeff threshold factor for re-partition - \param workload_num total workload(particle/tile) number, used to compute - workload_threshold - \param num_step_rebalance the simulation step number after which one - should check if repartition is needed \param global_cells_per_dim 3D array, global cells in each dimension \param max_optimize_iteration max iteration number to run the optimization */ DynamicPartitioner( - MPI_Comm comm, float max_workload_coeff, int workload_num, - int num_step_rebalance, + MPI_Comm comm, const std::array& global_cells_per_dim, int max_optimize_iteration = 10 ) - : _workload_threshold( - static_cast( max_workload_coeff * workload_num ) ) - , _num_step_rebalance( num_step_rebalance ) - , _max_optimize_iteration( max_optimize_iteration ) + : _max_optimize_iteration( max_optimize_iteration ) { // compute the ranks_per_dim from MPI communicator allocate( global_cells_per_dim ); @@ -116,26 +107,17 @@ class DynamicPartitioner : public BlockPartitioner \brief Constructor - user-defined ranks_per_dim communicator \param comm MPI communicator to decide the rank nums in each dimension - \param max_workload_coeff threshold factor for re-partition - \param workload_num total workload(particle/tile) number, used to compute - workload_threshold - \param num_step_rebalance the simulation step number after which one - should check if repartition is needed \param ranks_per_dim 3D array, user-defined MPI rank constrains in per dimension \param global_cells_per_dim 3D array, global cells in 
each dimension \param max_optimize_iteration max iteration number to run the optimization */ DynamicPartitioner( - MPI_Comm comm, float max_workload_coeff, int workload_num, - int num_step_rebalance, + MPI_Comm comm, const std::array& ranks_per_dim, const std::array& global_cells_per_dim, int max_optimize_iteration = 10 ) - : _workload_threshold( - static_cast( max_workload_coeff * workload_num ) ) - , _num_step_rebalance( num_step_rebalance ) - , _max_optimize_iteration( max_optimize_iteration ) + : _max_optimize_iteration( max_optimize_iteration ) { allocate( global_cells_per_dim ); std::copy( ranks_per_dim.begin(), ranks_per_dim.end(), @@ -751,10 +733,6 @@ class DynamicPartitioner : public BlockPartitioner }; private: - // workload_threshold - int _workload_threshold; - // default check point for re-balance - int _num_step_rebalance; // max_optimize iterations int _max_optimize_iteration; diff --git a/cajita/unit_test/tstGlobalGrid.hpp b/cajita/unit_test/tstGlobalGrid.hpp index 03efceeaf..6d40623db 100644 --- a/cajita/unit_test/tstGlobalGrid.hpp +++ b/cajita/unit_test/tstGlobalGrid.hpp @@ -425,15 +425,10 @@ void sparseGridTest3d() global_low_corner, global_high_corner, global_num_cell ); // Sparse paritioner - float max_workload_coeff = 1.5; - int workload_num = - global_num_cell[0] * global_num_cell[1] * global_num_cell[2]; - int num_step_rebalance = 100; int max_optimize_iteration = 10; DynamicPartitioner partitioner( - MPI_COMM_WORLD, max_workload_coeff, workload_num, num_step_rebalance, - global_num_cell, max_optimize_iteration ); + MPI_COMM_WORLD, global_num_cell, max_optimize_iteration ); // test ranks per dim auto ranks_per_dim = diff --git a/cajita/unit_test/tstParticleDynamicPartitioner.hpp b/cajita/unit_test/tstParticleDynamicPartitioner.hpp index 098aa2323..539f358bd 100644 --- a/cajita/unit_test/tstParticleDynamicPartitioner.hpp +++ b/cajita/unit_test/tstParticleDynamicPartitioner.hpp @@ -98,13 +98,9 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) constexpr int size_tile_per_dim = 32; constexpr int cell_per_tile_dim = 4; constexpr int size_per_dim = size_tile_per_dim * cell_per_tile_dim; - constexpr int total_size = size_per_dim * size_per_dim * size_per_dim; srand( time( 0 ) ); // some settings for partitioner - float max_workload_coeff = 1.5; - int particle_num = total_size; - int num_step_rebalance = 100; int max_optimize_iteration = 10; std::array global_cells_per_dim = { size_per_dim, size_per_dim, @@ -112,8 +108,7 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) // partitioner DynamicPartitioner partitioner( - MPI_COMM_WORLD, max_workload_coeff, particle_num, num_step_rebalance, - global_cells_per_dim, max_optimize_iteration ); + MPI_COMM_WORLD, global_cells_per_dim, max_optimize_iteration ); // check the value of some pre-computed constants auto cbptd = partitioner.cell_bits_per_tile_dim; diff --git a/cajita/unit_test/tstSparseLocalGrid.hpp b/cajita/unit_test/tstSparseLocalGrid.hpp index d45b4c5be..6debc1041 100644 --- a/cajita/unit_test/tstSparseLocalGrid.hpp +++ b/cajita/unit_test/tstSparseLocalGrid.hpp @@ -47,7 +47,7 @@ void sparseLocalGridTest( EntityType t2 ) // Create and initialize sparse partitioner std::array periodic = { false, false, false }; DynamicPartitioner partitioner( - MPI_COMM_WORLD, 1.5, 16 * 32, 100, global_num_cell, 10 ); + MPI_COMM_WORLD, global_num_cell, 10 ); auto ranks_per_dim = partitioner.ranksPerDimension( MPI_COMM_WORLD, global_num_cell ); std::array, 3> rec_partitions; diff --git 
a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp index e3ee0ed12..0374ab9e5 100644 --- a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp +++ b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp @@ -34,12 +34,8 @@ void uniform_distribution_automatic_rank() constexpr int size_tile_per_dim = 16; constexpr int cell_per_tile_dim = 4; constexpr int size_per_dim = size_tile_per_dim * cell_per_tile_dim; - constexpr int total_size = size_per_dim * size_per_dim * size_per_dim; // some settings for partitioner - float max_workload_coeff = 1.5; - int workload_num = total_size; - int num_step_rebalance = 100; int max_optimize_iteration = 10; std::array global_cells_per_dim = { size_tile_per_dim * cell_per_tile_dim, @@ -48,8 +44,7 @@ void uniform_distribution_automatic_rank() // partitioner DynamicPartitioner partitioner( - MPI_COMM_WORLD, max_workload_coeff, workload_num, num_step_rebalance, - global_cells_per_dim, max_optimize_iteration ); + MPI_COMM_WORLD, global_cells_per_dim, max_optimize_iteration ); // check the value of some pre-computed constants auto cbptd = partitioner.cell_bits_per_tile_dim; @@ -236,13 +231,9 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) constexpr int size_tile_per_dim = 32; constexpr int cell_per_tile_dim = 4; constexpr int size_per_dim = size_tile_per_dim * cell_per_tile_dim; - constexpr int total_size = size_per_dim * size_per_dim * size_per_dim; srand( time( 0 ) ); // some settings for partitioner - float max_workload_coeff = 1.5; - int particle_num = total_size; - int num_step_rebalance = 100; int max_optimize_iteration = 10; std::array global_cells_per_dim = { size_per_dim, size_per_dim, @@ -250,8 +241,7 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) // partitioner DynamicPartitioner partitioner( - MPI_COMM_WORLD, max_workload_coeff, particle_num, num_step_rebalance, - global_cells_per_dim, max_optimize_iteration ); + MPI_COMM_WORLD, global_cells_per_dim, max_optimize_iteration ); // check the value of some pre-computed constants auto cbptd = partitioner.cell_bits_per_tile_dim; From 16cea3e0c90111a5d598d877537209376bcae74f Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Mon, 25 Jul 2022 23:44:39 -0700 Subject: [PATCH 36/42] Format --- .../Cajita_ParticleDynamicPartitionerPerformance.cpp | 9 ++++----- cajita/src/Cajita_DynamicPartitioner.hpp | 3 +-- cajita/unit_test/tstSparseLocalGrid.hpp | 6 +++--- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp index 0ba99611a..e28ec18cd 100644 --- a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp @@ -272,12 +272,11 @@ int main( int argc, char* argv[] ) // Don't rerun on the CPU if already done or if turned off. if ( !std::is_same{} ) { - performanceTest( file, - MPI_COMM_WORLD, "device_particleWL_", - problem_sizes, num_cells_per_dim ); + performanceTest( file, MPI_COMM_WORLD, + "device_particleWL_", problem_sizes, + num_cells_per_dim ); } - performanceTest( file, - MPI_COMM_WORLD, "host_particleWL_", + performanceTest( file, MPI_COMM_WORLD, "host_particleWL_", problem_sizes, num_cells_per_dim ); // Close the output file on rank 0. 
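For orientation at this point in the series: with the workload measurer helpers and the simplified constructor from the preceding patches, and once a later patch in this series moves the average-partition initialization into the constructor, a typical particle-based partitioning driver reduces to the few calls sketched below. This is illustrative only; `Device`, `positions`, `particle_num`, `global_low_corner`, and `cell_size` are assumed placeholders, and the usual Cajita, Kokkos, and MPI setup is taken as given.

    // Minimal usage sketch (not part of any patch).
    std::array<int, 3> global_num_cell = { 128, 128, 128 };
    int max_optimize_iteration = 10;

    Cajita::DynamicPartitioner<Device, 4> partitioner(
        MPI_COMM_WORLD, global_num_cell, max_optimize_iteration );

    auto measurer = Cajita::createParticleDynamicPartitionerWorkloadMeasurer<
        partitioner.cell_num_per_tile_dim, partitioner.num_space_dim, Device>(
        positions, particle_num, global_low_corner, cell_size,
        MPI_COMM_WORLD );

    partitioner.setLocalWorkload( &measurer );
    partitioner.optimizePartition( MPI_COMM_WORLD );
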
diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index 8c2771cd7..f13534b9a 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -113,8 +113,7 @@ class DynamicPartitioner : public BlockPartitioner \param max_optimize_iteration max iteration number to run the optimization */ DynamicPartitioner( - MPI_Comm comm, - const std::array& ranks_per_dim, + MPI_Comm comm, const std::array& ranks_per_dim, const std::array& global_cells_per_dim, int max_optimize_iteration = 10 ) : _max_optimize_iteration( max_optimize_iteration ) diff --git a/cajita/unit_test/tstSparseLocalGrid.hpp b/cajita/unit_test/tstSparseLocalGrid.hpp index 6debc1041..e6a10f674 100644 --- a/cajita/unit_test/tstSparseLocalGrid.hpp +++ b/cajita/unit_test/tstSparseLocalGrid.hpp @@ -8,9 +8,9 @@ * * * SPDX-License-Identifier: BSD-3-Clause * ****************************************************************************/ +#include #include #include -#include #include #include @@ -46,8 +46,8 @@ void sparseLocalGridTest( EntityType t2 ) // Create and initialize sparse partitioner std::array periodic = { false, false, false }; - DynamicPartitioner partitioner( - MPI_COMM_WORLD, global_num_cell, 10 ); + DynamicPartitioner partitioner( MPI_COMM_WORLD, + global_num_cell, 10 ); auto ranks_per_dim = partitioner.ranksPerDimension( MPI_COMM_WORLD, global_num_cell ); std::array, 3> rec_partitions; From 88ed450832fab05c9ea166c19b0b92739e2192c1 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Tue, 26 Jul 2022 08:51:34 -0700 Subject: [PATCH 37/42] Move initializeRecPartition into constructor --- ..._ParticleDynamicPartitionerPerformance.cpp | 4 +- ...SparseMapDynamicPartitionerPerformance.cpp | 4 +- cajita/src/Cajita_DynamicPartitioner.hpp | 43 ++++++++++++++++--- cajita/unit_test/tstGlobalGrid.hpp | 4 +- .../tstParticleDynamicPartitioner.hpp | 17 -------- cajita/unit_test/tstSparseLocalGrid.hpp | 19 -------- .../tstSparseMapDynamicPartitioner.hpp | 5 --- 7 files changed, 43 insertions(+), 53 deletions(-) diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp index e28ec18cd..5c5aaf9de 100644 --- a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp @@ -146,8 +146,8 @@ void performanceTest( std::ostream& stream, MPI_Comm comm, for ( int t = 0; t < num_run; ++t ) { // ensure every optimization process starts from the same status - partitioner.initializeRecPartition( - ave_partition[0], ave_partition[1], ave_partition[2] ); + partitioner.initializePartitionByAverage( comm, + global_num_cell ); // compute local workload local_workload_timer.start( p ); diff --git a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp index 671993f6b..19b5f0533 100644 --- a/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp @@ -164,8 +164,8 @@ void performanceTest( std::ostream& stream, MPI_Comm comm, for ( int t = 0; t < num_run; ++t ) { // ensure every optimization process starts from the same status - partitioner.initializeRecPartition( - ave_partition[0], ave_partition[1], ave_partition[2] ); + partitioner.initializePartitionByAverage( comm, + global_num_cell ); // compute local workload local_workload_timer.start( frac ); diff --git 
a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index f13534b9a..af21ebde4 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -101,6 +101,7 @@ class DynamicPartitioner : public BlockPartitioner // compute the ranks_per_dim from MPI communicator allocate( global_cells_per_dim ); ranksPerDimension( comm ); + initializePartitionByAverage( comm, global_cells_per_dim ); } /*! @@ -126,6 +127,7 @@ class DynamicPartitioner : public BlockPartitioner int comm_size; MPI_Comm_size( comm, &comm_size ); MPI_Dims_create( comm_size, num_space_dim, _ranks_per_dim.data() ); + initializePartitionByAverage( comm, global_cells_per_dim ); } /*! @@ -267,18 +269,49 @@ class DynamicPartitioner : public BlockPartitioner } /*! - \brief Initialize the tile partition; partition in each dimension + \brief Initialize the tile partition by average size + \param comm The communicator to use for initializing partitioning + \param global_cells_per_dim 3D array, global cells in each dimension + */ + void initializePartitionByAverage( + MPI_Comm comm, + const std::array& global_cells_per_dim ) + { + std::array global_num_tile = { + global_cells_per_dim[0] / (int)cell_num_per_tile_dim, + global_cells_per_dim[1] / (int)cell_num_per_tile_dim, + global_cells_per_dim[2] / (int)cell_num_per_tile_dim }; + + auto ranks_per_dim = ranksPerDimension( comm, global_cells_per_dim ); + std::array, 3> rec_partitions; + for ( int d = 0; d < 3; ++d ) + { + int ele = global_num_tile[d] / ranks_per_dim[d]; + int part = 0; + for ( int i = 0; i < ranks_per_dim[d]; ++i ) + { + rec_partitions[d].push_back( part ); + part += ele; + } + rec_partitions[d].push_back( global_num_tile[d] ); + } + + setRecPartition( rec_partitions[0], rec_partitions[1], + rec_partitions[2] ); + } + + /*! + \brief Set the tile partition; partition in each dimension has the form [0, p_1, ..., p_n, total_tile_num], so the partition would be [0, p_1), [p_1, p_2) ... 
[p_n, total_tile_num] \param rec_partition_i partition array in dimension i \param rec_partition_j partition array in dimension j \param rec_partition_k partition array in dimension k */ - void initializeRecPartition( std::vector& rec_partition_i, - std::vector& rec_partition_j, - std::vector& rec_partition_k ) + void setRecPartition( std::vector& rec_partition_i, + std::vector& rec_partition_j, + std::vector& rec_partition_k ) { - int max_size = 0; for ( std::size_t d = 0; d < num_space_dim; ++d ) max_size = diff --git a/cajita/unit_test/tstGlobalGrid.hpp b/cajita/unit_test/tstGlobalGrid.hpp index 6d40623db..1e0e243ff 100644 --- a/cajita/unit_test/tstGlobalGrid.hpp +++ b/cajita/unit_test/tstGlobalGrid.hpp @@ -446,8 +446,6 @@ void sparseGridTest3d() } rec_partitions[d].push_back( global_num_tile[d] ); } - partitioner.initializeRecPartition( rec_partitions[0], rec_partitions[1], - rec_partitions[2] ); // Create spares global grid auto global_grid = createGlobalGrid( MPI_COMM_WORLD, global_mesh, @@ -562,7 +560,7 @@ void sparseGridTest3d() for ( int id = 1; id < ranks_per_dim[d]; id++ ) part[d][id] += 1; - partitioner.initializeRecPartition( part[0], part[1], part[2] ); + partitioner.setRecPartition( part[0], part[1], part[2] ); std::array new_owned_num_cell; std::array new_global_cell_offset; diff --git a/cajita/unit_test/tstParticleDynamicPartitioner.hpp b/cajita/unit_test/tstParticleDynamicPartitioner.hpp index 539f358bd..a06e96162 100644 --- a/cajita/unit_test/tstParticleDynamicPartitioner.hpp +++ b/cajita/unit_test/tstParticleDynamicPartitioner.hpp @@ -177,23 +177,6 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) MPI_Barrier( MPI_COMM_WORLD ); } - // init partitions (average partition) - std::array, 3> rec_partitions; - for ( int d = 0; d < 3; ++d ) - { - int ele = size_tile_per_dim / ranks_per_dim[d]; - int part = 0; - for ( int i = 0; i < ranks_per_dim[d]; ++i ) - { - rec_partitions[d].push_back( part ); - part += ele; - } - rec_partitions[d].push_back( size_tile_per_dim ); - } - - partitioner.initializeRecPartition( rec_partitions[0], rec_partitions[1], - rec_partitions[2] ); - // basic settings for domain size and position double cell_size = 0.1; std::array global_low_corner = { 1.2, 3.3, -2.8 }; diff --git a/cajita/unit_test/tstSparseLocalGrid.hpp b/cajita/unit_test/tstSparseLocalGrid.hpp index e6a10f674..dfb4b3209 100644 --- a/cajita/unit_test/tstSparseLocalGrid.hpp +++ b/cajita/unit_test/tstSparseLocalGrid.hpp @@ -33,9 +33,6 @@ void sparseLocalGridTest( EntityType t2 ) double cell_size = 0.23; std::array global_num_cell = { 16, 32, 64 }; int cell_num_per_tile_dim = 4; - std::array global_num_tile = { 16 / cell_num_per_tile_dim, - 32 / cell_num_per_tile_dim, - 64 / cell_num_per_tile_dim }; std::array global_low_corner = { 1.2, 3.3, -2.8 }; std::array global_high_corner = { global_low_corner[0] + cell_size * global_num_cell[0], @@ -48,22 +45,6 @@ void sparseLocalGridTest( EntityType t2 ) std::array periodic = { false, false, false }; DynamicPartitioner partitioner( MPI_COMM_WORLD, global_num_cell, 10 ); - auto ranks_per_dim = - partitioner.ranksPerDimension( MPI_COMM_WORLD, global_num_cell ); - std::array, 3> rec_partitions; - for ( int d = 0; d < 3; ++d ) - { - int ele = global_num_tile[d] / ranks_per_dim[d]; - int part = 0; - for ( int i = 0; i < ranks_per_dim[d]; ++i ) - { - rec_partitions[d].push_back( part ); - part += ele; - } - rec_partitions[d].push_back( global_num_tile[d] ); - } - partitioner.initializeRecPartition( rec_partitions[0], 
rec_partitions[1], - rec_partitions[2] ); // Create global grid auto global_grid_ptr = Cajita::createGlobalGrid( diff --git a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp index 0374ab9e5..fb30ddb09 100644 --- a/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp +++ b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp @@ -74,8 +74,6 @@ void uniform_distribution_automatic_rank() } rec_partitions[d].push_back( size_tile_per_dim ); } - partitioner.initializeRecPartition( rec_partitions[0], rec_partitions[1], - rec_partitions[2] ); // test getCurrentPartition function { @@ -324,9 +322,6 @@ void random_distribution_automatic_rank( int occupy_num_per_rank ) rec_partitions[d].push_back( size_tile_per_dim ); } - partitioner.initializeRecPartition( rec_partitions[0], rec_partitions[1], - rec_partitions[2] ); - // basic settings for domain size and position double cell_size = 0.1; int pre_alloc_size = size_per_dim * size_per_dim; From 46ccece154ed484b77200eccaf33801d48d77fed Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Wed, 3 Aug 2022 23:34:45 -0700 Subject: [PATCH 38/42] Fix view --- .../src/Cajita_ParticleDynamicPartitioner.hpp | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp index 2fea96604..2719f428f 100644 --- a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -86,22 +86,23 @@ class ParticleDynamicPartitionerWorkloadMeasurer auto dx_copy = dx; auto cell_bits_per_tile_dim_copy = cell_bits_per_tile_dim; + auto view_copy = view; Kokkos::parallel_for( "compute_local_workload_parpos", Kokkos::RangePolicy( 0, particle_num ), KOKKOS_LAMBDA( const int i ) { - int ti = - static_cast( - ( view( i, 0 ) - lower_corner[0] ) / dx_copy - 0.5 ) >> - cell_bits_per_tile_dim_copy; - int tj = - static_cast( - ( view( i, 1 ) - lower_corner[1] ) / dx_copy - 0.5 ) >> - cell_bits_per_tile_dim_copy; - int tz = - static_cast( - ( view( i, 2 ) - lower_corner[2] ) / dx_copy - 0.5 ) >> - cell_bits_per_tile_dim_copy; + int ti = static_cast( + ( view_copy( i, 0 ) - lower_corner[0] ) / dx_copy - + 0.5 ) >> + cell_bits_per_tile_dim_copy; + int tj = static_cast( + ( view_copy( i, 1 ) - lower_corner[1] ) / dx_copy - + 0.5 ) >> + cell_bits_per_tile_dim_copy; + int tz = static_cast( + ( view_copy( i, 2 ) - lower_corner[2] ) / dx_copy - + 0.5 ) >> + cell_bits_per_tile_dim_copy; Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tz + 1 ) ); } ); Kokkos::fence(); From 9b3aeb1d1b5f33627eff6f929f11d7a9998cf380 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Thu, 4 Aug 2022 10:51:27 -0700 Subject: [PATCH 39/42] Fix compile --- cajita/unit_test/tstSparseArray.hpp | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/cajita/unit_test/tstSparseArray.hpp b/cajita/unit_test/tstSparseArray.hpp index 6234bcd76..8ffa28737 100644 --- a/cajita/unit_test/tstSparseArray.hpp +++ b/cajita/unit_test/tstSparseArray.hpp @@ -10,7 +10,7 @@ ****************************************************************************/ #include -#include +#include #include #include @@ -181,13 +181,9 @@ void sparse_array_test( int par_num, EntityType entity ) std::array is_dim_periodic = { false, false, false }; // sparse partitioner - T max_workload_coeff = 1.5; - int workload_num = size_per_dim * size_per_dim * size_per_dim; - int num_step_rebalance = 200; int 
max_optimize_iteration = 10; - SparseDimPartitioner partitioner( - MPI_COMM_WORLD, max_workload_coeff, workload_num, num_step_rebalance, - global_num_cell, max_optimize_iteration ); + DynamicPartitioner partitioner( + MPI_COMM_WORLD, global_num_cell, max_optimize_iteration ); // rank-related information Kokkos::Array cart_rank; @@ -207,8 +203,8 @@ void sparse_array_test( int par_num, EntityType entity ) // scene initialization auto gt_partitions = generate_random_partition( ranks_per_dim, size_tile_per_dim ); - partitioner.initializeRecPartition( gt_partitions[0], gt_partitions[1], - gt_partitions[2] ); + partitioner.setRecPartition( gt_partitions[0], gt_partitions[1], + gt_partitions[2] ); std::set> tile_set; std::set> par_pos_set; @@ -427,13 +423,9 @@ void full_occupy_test( EntityType entity ) std::array is_dim_periodic = { false, false, false }; // sparse partitioner - T max_workload_coeff = 1.5; - int workload_num = size_per_dim * size_per_dim * size_per_dim; - int num_step_rebalance = 200; int max_optimize_iteration = 10; - SparseDimPartitioner partitioner( - MPI_COMM_WORLD, max_workload_coeff, workload_num, num_step_rebalance, - global_num_cell, max_optimize_iteration ); + DynamicPartitioner partitioner( + MPI_COMM_WORLD, global_num_cell, max_optimize_iteration ); // rank-related information Kokkos::Array cart_rank; @@ -453,8 +445,8 @@ void full_occupy_test( EntityType entity ) // scene initialization auto gt_partitions = generate_random_partition( ranks_per_dim, size_tile_per_dim ); - partitioner.initializeRecPartition( gt_partitions[0], gt_partitions[1], - gt_partitions[2] ); + partitioner.setRecPartition( gt_partitions[0], gt_partitions[1], + gt_partitions[2] ); // mesh/grid related initialization auto global_mesh = createSparseGlobalMesh( From d989a815fee8beaf23cc2916b5c39dc84c06adb1 Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Thu, 4 Aug 2022 12:06:28 -0700 Subject: [PATCH 40/42] Format --- cajita/unit_test/tstSparseArray.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cajita/unit_test/tstSparseArray.hpp b/cajita/unit_test/tstSparseArray.hpp index 8ffa28737..a84c20414 100644 --- a/cajita/unit_test/tstSparseArray.hpp +++ b/cajita/unit_test/tstSparseArray.hpp @@ -9,8 +9,8 @@ * SPDX-License-Identifier: BSD-3-Clause * ****************************************************************************/ -#include #include +#include #include #include From 4698e0c6dee98461fba502e5c65f02d29c16036b Mon Sep 17 00:00:00 2001 From: Yu Fang Date: Mon, 8 Aug 2022 10:17:20 -0700 Subject: [PATCH 41/42] Add comments --- cajita/src/Cajita_DynamicPartitioner.hpp | 31 ++++++++++++++++++------ 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/cajita/src/Cajita_DynamicPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp index af21ebde4..3ee3ae2d6 100644 --- a/cajita/src/Cajita_DynamicPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -28,13 +28,24 @@ namespace Cajita { +//---------------------------------------------------------------------------// +/*! + Workload measurer for DynamicPartitioner. It can be customized by the user + to compute workload on each rank. + + \tparam Device Kokkos device type. +*/ template class DynamicPartitionerWorkloadMeasurer { using memory_space = typename Device::memory_space; public: - virtual void compute( Kokkos::View& ) = 0; + /*! 
+      \brief This function must be overridden to compute the workload.
+      \param workload view to store the workload computed on the local rank
+    */
+    virtual void compute( Kokkos::View& workload ) = 0;
 };
 
 //---------------------------------------------------------------------------//
@@ -769,18 +780,22 @@ class DynamicPartitioner : public BlockPartitioner
     int _max_optimize_iteration;
 
   protected:
-    // represent the rectangle partition in each dimension
-    // with form [0, p_1, ..., p_n, cell_num], n = rank num in current
-    // dimension, partition in this dimension would be [0, p_1), [p_1, p_2) ...
-    // [p_n, total_tile_num] (unit: tile)
+    //! represent the rectangle partition in each dimension
+    //! with form [0, p_1, ..., p_n, cell_num], n = rank num in current
+    //! dimension, partition in this dimension would be [0, p_1), [p_1, p_2) ...
+    //! [p_n, total_tile_num] (unit: tile)
     partition_view _rectangle_partition_dev;
-    // the workload of each tile on current
+    //! the workload of each tile on the current rank
    workload_view _workload_per_tile;
-    // 3d prefix sum of the workload of each tile on current
+    //! 3D prefix sum of the workload of each tile on the current rank
     workload_view _workload_prefix_sum;
-    // ranks per dimension
+    //! ranks per dimension
     Kokkos::Array _ranks_per_dim;
 
+    /*!
+      \brief Allocate internal data structures for the partition algorithm
+      \param global_cells_per_dim grid size along each dimension
+    */
     void allocate( const std::array& global_cells_per_dim )
     {
 
From 76578b747d2e9893a3a4c1df914759d51874a85a Mon Sep 17 00:00:00 2001
From: Yu Fang
Date: Tue, 6 Sep 2022 16:52:07 -0700
Subject: [PATCH 42/42] Fix compiling NVCC 11.4

---
 .../Cajita_ParticleDynamicPartitionerPerformance.cpp | 5 +++--
 cajita/unit_test/tstParticleDynamicPartitioner.hpp   | 8 +++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp
index 5c5aaf9de..59b3e6115 100644
--- a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp
+++ b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp
@@ -151,10 +151,11 @@ void performanceTest( std::ostream& stream, MPI_Comm comm,
 
         // compute local workload
         local_workload_timer.start( p );
+        constexpr int cell_num_per_tile_dim = 4;
+        constexpr int num_space_dim = 3;
         auto pws =
             Cajita::createParticleDynamicPartitionerWorkloadMeasurer<
-                partitioner.cell_num_per_tile_dim,
-                partitioner.num_space_dim, Device>(
+                cell_num_per_tile_dim, num_space_dim, Device>(
                 pos_view, par_num, global_low_corner,
                 1.0f / num_cells_per_dim[c], comm );
         partitioner.setLocalWorkload( &pws );
 
diff --git a/cajita/unit_test/tstParticleDynamicPartitioner.hpp b/cajita/unit_test/tstParticleDynamicPartitioner.hpp
index a06e96162..e2322b553 100644
--- a/cajita/unit_test/tstParticleDynamicPartitioner.hpp
+++ b/cajita/unit_test/tstParticleDynamicPartitioner.hpp
@@ -186,10 +186,12 @@ void random_distribution_automatic_rank( int occupy_num_per_rank )
         gt_partition, cart_rank, occupy_num_per_rank, global_low_corner,
         cell_size, cell_per_tile_dim );
     // compute workload from a particle view and do partition optimization
+    constexpr int cell_num_per_tile_dim = 4;
+    constexpr int num_space_dim = 3;
     auto pws = createParticleDynamicPartitionerWorkloadMeasurer<
-        partitioner.cell_num_per_tile_dim, partitioner.num_space_dim,
-        TEST_DEVICE>( particle_view, occupy_num_per_rank, global_low_corner,
-        cell_size, MPI_COMM_WORLD );
+        cell_num_per_tile_dim, num_space_dim, TEST_DEVICE>(
+        particle_view, 
occupy_num_per_rank, global_low_corner, cell_size, + MPI_COMM_WORLD ); partitioner.setLocalWorkload( &pws ); partitioner.optimizePartition( MPI_COMM_WORLD );
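
Taken together, patches 37-42 change the calling sequence for the dynamic partitioner: the constructor now seeds an average tile partition, initializePartitionByAverage re-seeds it on demand, setRecPartition imposes an explicit partition, and per-rank workloads are supplied through a workload measurer before optimizePartition is called. The sketch below illustrates the resulting usage pattern; it is not code from the repository, and the template argument lists, device alias, grid sizes, and position-view shape are assumptions, since several angle-bracketed parameter lists are not visible in the diffs above.

// Sketch only: template arguments and view types are assumed, the calls
// themselves mirror the diffs in patches 37-42.
#include <Cajita.hpp>
#include <Kokkos_Core.hpp>
#include <mpi.h>

#include <array>

void partition_example( MPI_Comm comm )
{
    using device_type = Kokkos::DefaultExecutionSpace::device_type;

    // Illustrative sizes: 64^3 cells with 4 cells per tile per dimension.
    std::array<int, 3> global_num_cell = { 64, 64, 64 };
    int max_optimize_iteration = 10;

    // The constructor computes the ranks per dimension and now also
    // initializes an average (uniform) tile partition, so no explicit
    // initializeRecPartition call is needed before use.
    Cajita::DynamicPartitioner<device_type, 4> partitioner(
        comm, global_num_cell, max_optimize_iteration );

    // Particle positions (assumed N x 3 view), filled elsewhere.
    int num_particles = 1000;
    Kokkos::View<double* [3], device_type> positions( "positions",
                                                      num_particles );
    std::array<double, 3> global_low_corner = { 0.0, 0.0, 0.0 };
    double cell_size = 1.0 / global_num_cell[0];

    // After the NVCC 11.4 fix, the tile and dimension sizes are passed as
    // constexpr ints rather than as members of the partitioner object.
    constexpr int cell_num_per_tile_dim = 4;
    constexpr int num_space_dim = 3;
    auto measurer = Cajita::createParticleDynamicPartitionerWorkloadMeasurer<
        cell_num_per_tile_dim, num_space_dim, device_type>(
        positions, num_particles, global_low_corner, cell_size, comm );

    // Supply the measured per-tile workload and rebalance the partition.
    partitioner.setLocalWorkload( &measurer );
    partitioner.optimizePartition( comm );

    // Re-seed with an average partition (as the benchmarks do before each
    // run), or impose an explicit one with setRecPartition.
    partitioner.initializePartitionByAverage( comm, global_num_cell );
}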