From d3fa3115cf4450d8c8e3b92ad9ea40b50633ae9c Mon Sep 17 00:00:00 2001
From: Adrian-Diaz
Date: Tue, 1 Oct 2024 12:44:16 -0600
Subject: [PATCH] ENH: nested parallel reductions

---
 examples/main_kokkos.cpp   | 49 ++++++++++++++++++++++++++++++++++----
 src/include/kokkos_types.h | 26 ++++++++++++--------
 src/include/macros.h       | 20 +++++++---------
 3 files changed, 69 insertions(+), 26 deletions(-)

diff --git a/examples/main_kokkos.cpp b/examples/main_kokkos.cpp
index 5049119d..5bdc2740 100644
--- a/examples/main_kokkos.cpp
+++ b/examples/main_kokkos.cpp
@@ -875,11 +875,11 @@ int main(int argc, char* argv[])
         FOR_ALL(i_i, 0, hiersize, j_j, 0, hiersize, k_k, 0, hiersize, {
             hierTest3D(i_i, j_j, k_k) = 0.0;
         });
-        FOR_FIRST(hiersize, {
+        FOR_FIRST(i_i,hiersize, {
             // Kokkos::parallel_for( \
             //Kokkos::TeamPolicy<>( 32, Kokkos::AUTO, 32 ), \
             //KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type &teamMember ) {
-            const int i_i = TEAM_ID;
+            //const int i_i = TEAM_ID;
             FOR_SECOND(j_j, i_i, hiersize, {
                 // Kokkos::parallel_for( \
                 //Kokkos::TeamThreadRange( teamMember, istart, iend ), [&] ( const int (j_j) ) {
@@ -905,12 +905,16 @@ int main(int argc, char* argv[])
         FOR_ALL(i_i, 0, hiersize, j_j, 0, hiersize, k_k, 0, hiersize, {
             hierTest3D(i_i, j_j, k_k) = i_i*hiersize*hiersize+j_j*hiersize+k_k;
         });
-        FOR_FIRST_EASY(i_i,hiersize, {
+
+        printf("\n\n\nHierarchical Reduce\n");
+        //2D nesting
+        FOR_FIRST(i_i,hiersize, {
             // Kokkos::parallel_for( \
             //Kokkos::TeamPolicy<>( 32, Kokkos::AUTO, 32 ), \
             //KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type &teamMember ) {
             //const int i_i = TEAM_ID;
             double result = 0;
+            double lsum;
             FOR_REDUCE_SUM_SECOND(j_j, i_i, hiersize, lsum, {
                 lsum += hierTest3D(i_i,j_j,0);
                 // Kokkos::parallel_for( \
@@ -920,11 +924,46 @@ int main(int argc, char* argv[])
                 // int jend = (j_j+1)*32;
             }, result);
             hierTest1D(i_i)= result;
+            //printf("value at %d is %f\n", i_i, hierTest1D(i_i));
+        });
+        Kokkos::fence();
+        for (int ppp = 0; ppp < hiersize; ppp++) {
+            //printf("%f\n", hierTest1D(ppp));
+            // printf("%f\n", hierTest2D(3,ppp));
+            // printf("%f\n", hierTest3D(3,3,ppp));
+        }
+        printf("\n\n");
+
+        printf("\n\n\nHierarchical Vectorized Reduce\n");
+        //3D vector nesting
+        FOR_FIRST(i_i,hiersize, {
+            // Kokkos::parallel_for( \
+            //Kokkos::TeamPolicy<>( 32, Kokkos::AUTO, 32 ), \
+            //KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type &teamMember ) {
+            //const int i_i = TEAM_ID;
+            double result = 0;
+            double lsum;
+            FOR_SECOND(j_j, i_i, hiersize, {
+                // Kokkos::parallel_for( \
+                //Kokkos::TeamThreadRange( teamMember, istart, iend ), [&] ( const int (j_j) ) {
+                // hierTest2D(i_i,j_j) = i_i * (j_j+1);
+                // int jstart = j_j*32;
+                // int jend = (j_j+1)*32;
+                FOR_REDUCE_SUM_THIRD(k_k, i_i, j_j, lsum, {
+                    lsum += hierTest3D(i_i,j_j,k_k);
+                    // Kokkos::parallel_for( \
+                    //Kokkos::TeamThreadRange( teamMember, istart, iend ), [&] ( const int (j_j) ) {
+                    // hierTest2D(i_i,j_j) = i_i * (j_j+1);
+                    // int jstart = j_j*32;
+                    // int jend = (j_j+1)*32;
+                }, result);
+                hierTest2D(i_i,j_j)= result;
+                //printf("value at %d , %d is %f\n", i_i, j_j, hierTest2D(i_i,j_j));
+            });
         });
         Kokkos::fence();
-        printf("\n\n\nHierarchical Reduce\n");
         for (int ppp = 0; ppp < hiersize; ppp++) {
-            printf("%f\n", hierTest1D(ppp));
+            //printf("%f\n", hierTest1D(ppp));
             // printf("%f\n", hierTest2D(3,ppp));
             // printf("%f\n", hierTest3D(3,3,ppp));
         }
diff --git a/src/include/kokkos_types.h b/src/include/kokkos_types.h
index 82127ce1..514da94f 100644
--- a/src/include/kokkos_types.h
+++ b/src/include/kokkos_types.h
@@ -8040,11 +8040,14 @@ void DynamicRaggedRightArrayKokkos::set_values(
 template
 KOKKOS_INLINE_FUNCTION
 void DynamicRaggedRightArrayKokkos::set_values_sparse(T val) {
-    Kokkos::parallel_for( Kokkos::TeamPolicy<>( dim1_, Kokkos::AUTO, 32 ), KOKKOS_CLASS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type &teamMember ) {
-        const int i_i = teamMember.league_rank();
-        Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember, 0, stride_(i_i) ), [&] ( const int (j_j) ) {
-            array_(dim2_*i_i+j_j) = val;
-        });
+    // Kokkos::parallel_for( Kokkos::TeamPolicy<>( dim1_, Kokkos::AUTO, 32 ), KOKKOS_CLASS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type &teamMember ) {
+    // const int i_i = teamMember.league_rank();
+    // Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember, 0, stride_(i_i) ), [&] ( const int (j_j) ) {
+    // array_(dim2_*i_i+j_j) = val;
+    // });
+    // });
+    Kokkos::parallel_for("SetValues_DynamicRaggedRightArrayKokkos", length_, KOKKOS_CLASS_LAMBDA(const int i) {
+        array_(i) = val;
     });
 }
 // Get the name of the view
@@ -8256,11 +8259,14 @@ void DynamicRaggedDownArrayKokkos::set_values(T
 template
 KOKKOS_INLINE_FUNCTION
 void DynamicRaggedDownArrayKokkos::set_values_sparse(T val) {
-    Kokkos::parallel_for( Kokkos::TeamPolicy<>( dim2_, Kokkos::AUTO, 32 ), KOKKOS_CLASS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type &teamMember ) {
-        const int j_j = teamMember.league_rank();
-        Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember, 0, stride_(j_j) ), [&] ( const int (i_i) ) {
-            array_(dim1_*j_j+i_i) = val;
-        });
+    // Kokkos::parallel_for( Kokkos::TeamPolicy<>( dim2_, Kokkos::AUTO, 32 ), KOKKOS_CLASS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type &teamMember ) {
+    // const int j_j = teamMember.league_rank();
+    // Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember, 0, stride_(j_j) ), [&] ( const int (i_i) ) {
+    // array_(dim1_*j_j+i_i) = val;
+    // });
+    // });
+    Kokkos::parallel_for("SetValues_DynamicRaggedDownArrayKokkos", length_, KOKKOS_CLASS_LAMBDA(const int i) {
+        array_(i) = val;
     });
 }
 // Get the name of the view
diff --git a/src/include/macros.h b/src/include/macros.h
index 68fbebf1..13c3f455 100644
--- a/src/include/macros.h
+++ b/src/include/macros.h
@@ -445,19 +445,11 @@ THREAD_ID \
 teamMember.team_rank()
 
 #define \
-FOR_FIRST(x1, fcn) \
+FOR_FIRST(i, x1, fcn) \
 Kokkos::parallel_for( \
     Kokkos::TeamPolicy<>( x1, Kokkos::AUTO, 32 ), \
     KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type &teamMember ) \
-    {fcn} )
-
-#define \
-FOR_FIRST_EASY(i, x1, fcn) \
-Kokkos::parallel_for( \
-    Kokkos::TeamPolicy<>( x1, Kokkos::AUTO, 32 ), \
-    KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type &teamMember ) \
-    { const int i = TEAM_ID; \
-    fcn} )
+    { const int i = TEAM_ID; fcn} )
 
 #define \
 FOR_SECOND(j, y0, y1, fcn) \
@@ -468,7 +460,7 @@ Kokkos::parallel_for( \
 #define \
 FOR_REDUCE_SUM_SECOND(j, y0, y1, lsum, fcn, result) \
 Kokkos::parallel_reduce( \
-    Kokkos::TeamThreadRange( teamMember, y0, y1 ), [&] ( const int (j), decltype(result) &(lsum) ) \
+    Kokkos::TeamThreadRange( teamMember, y0, y1 ), [&] ( const int (j), decltype(lsum) &(lsum) ) \
     {fcn}, result )
 
 #define \
@@ -477,6 +469,12 @@ Kokkos::parallel_for( \
     Kokkos::ThreadVectorRange( teamMember, z0, z1 ), [&] ( const int (k) ) \
     {fcn} )
 
+#define \
+FOR_REDUCE_SUM_THIRD(k, z0, z1, lsum, fcn, result) \
+Kokkos::parallel_reduce( \
+    Kokkos::ThreadVectorRange( teamMember, z0, z1 ), [&] ( const int (k), decltype(lsum) &(lsum) ) \
+    {fcn}, result )
+
 //Kokkos Initialize
 #define \
 MATAR_KOKKOS_INIT \