From 5e84cd5071ab7c4ad98dac4a0ed7b8e937aa2c81 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Wed, 12 Jul 2023 13:32:32 -0400 Subject: [PATCH 1/2] Loop: Support multiple iterations and varying numbers of threads in loops --- Loop/src/loop.hxx | 51 ++++++++++-------- Loop/src/loop_device.hxx | 113 ++++++++++++++++++++++++--------------- 2 files changed, 99 insertions(+), 65 deletions(-) diff --git a/Loop/src/loop.hxx b/Loop/src/loop.hxx index 9efda23d3..e0705c278 100644 --- a/Loop/src/loop.hxx +++ b/Loop/src/loop.hxx @@ -64,6 +64,7 @@ struct PointDesc { units_t DI; // direction unit vectors vect I; // grid point + int iter; // iteration // outward boundary normal (if in outer boundary), else zero vect NI; vect I0; // nearest interior point @@ -91,15 +92,15 @@ struct PointDesc { PointDesc &operator=(PointDesc &&) = default; constexpr CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_DEVICE CCTK_HOST - PointDesc(const vect &I, const vect &NI, + PointDesc(const vect &I, const int iter, const vect &NI, const vect &I0, const vect &BI, const vect &bnd_min, const vect &bnd_max, const vect &loop_min, const vect &loop_max, const vect &X, const vect &DX) - : I(I), NI(NI), I0(I0), BI(BI), bnd_min(bnd_min), bnd_max(bnd_max), - loop_min(loop_min), loop_max(loop_max), X(X), DX(DX), imin(loop_min[0]), - imax(loop_max[0]), i(I[0]), j(I[1]), k(I[2]), x(X[0]), y(X[1]), z(X[2]), - dx(DX[0]), dy(DX[1]), dz(DX[2]) {} + : I(I), iter(iter), NI(NI), I0(I0), BI(BI), bnd_min(bnd_min), + bnd_max(bnd_max), loop_min(loop_min), loop_max(loop_max), X(X), DX(DX), + imin(loop_min[0]), imax(loop_max[0]), i(I[0]), j(I[1]), k(I[2]), + x(X[0]), y(X[1]), z(X[2]), dx(DX[0]), dy(DX[1]), dz(DX[2]) {} friend std::ostream &operator<<(std::ostream &os, const PointDesc &p); }; @@ -131,7 +132,7 @@ public: GridDescBase(const cGH *cctkGH); constexpr CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_DEVICE CCTK_HOST PointDesc - point_desc(const vect &CI, const vect &I, + point_desc(const vect &CI, const vect &I, const int iter, const vect &NI, const vect &I0, const vect &BI, const vect &bnd_min, const vect &bnd_max, const vect &loop_min, @@ -139,8 +140,8 @@ public: const vect X = x0 + (lbnd + I - vect(!CI) / 2) * dx; const vect DX = dx; - return PointDesc(I, NI, I0, BI, bnd_min, bnd_max, loop_min, loop_max, X, - DX); + return PointDesc(I, iter, NI, I0, BI, bnd_min, bnd_max, loop_min, loop_max, + X, DX); } // Loop over a given box @@ -148,29 +149,33 @@ public: void loop_box(const F &f, const vect &restrict bnd_min, const vect &restrict bnd_max, const vect &restrict loop_min, - const vect &restrict loop_max) const { + const vect &restrict loop_max, + const int niters = 1) const { static_assert(CI == 0 || CI == 1); static_assert(CJ == 0 || CJ == 1); static_assert(CK == 0 || CK == 1); static_assert(VS > 0); - if (any(loop_min >= loop_max)) + if (niters <= 0 || any(loop_max <= loop_min)) return; - for (int k = loop_min[2]; k < loop_max[2]; ++k) { - for (int j = loop_min[1]; j < loop_max[1]; ++j) { + for (int iter = 0; iter < niters; ++iter) { + for (int k = loop_min[2]; k < loop_max[2]; ++k) { + for (int j = loop_min[1]; j < loop_max[1]; ++j) { #pragma omp simd - for (int i = loop_min[0]; i < loop_max[0]; i += VS) { - const vect I = {i, j, k}; - const vect NI = - vect(I > bnd_max - 1) - vect(I < bnd_min); - const vect I0 = - if_else(NI == 0, 0, if_else(NI < 0, bnd_min, bnd_max - 1)); - const vect BI = - vect(I == bnd_max - 1) - vect(I == bnd_min); - const PointDesc p = point_desc({CI, CJ, CK}, I, NI, I0, BI, bnd_min, - bnd_max, loop_min, loop_max); - f(p); + for (int i = loop_min[0]; i < loop_max[0]; i += VS) { + const vect I = {i, j, k}; + const vect NI = + vect(I > bnd_max - 1) - vect(I < bnd_min); + const vect I0 = + if_else(NI == 0, 0, if_else(NI < 0, bnd_min, bnd_max - 1)); + const vect BI = + vect(I == bnd_max - 1) - vect(I == bnd_min); + const PointDesc p = + point_desc({CI, CJ, CK}, I, iter, NI, I0, BI, bnd_min, bnd_max, + loop_min, loop_max); + f(p); + } } } } diff --git a/Loop/src/loop_device.hxx b/Loop/src/loop_device.hxx index aa27ec145..e6002aebe 100644 --- a/Loop/src/loop_device.hxx +++ b/Loop/src/loop_device.hxx @@ -39,15 +39,17 @@ public: GridDescBaseDevice(const cGH *cctkGH) : GridDescBase(cctkGH) {} // Loop over a given box - template + template void loop_box_device(const F &f, const vect &restrict bnd_min, const vect &restrict bnd_max, const vect &restrict loop_min, - const vect &restrict loop_max) const { + const vect &restrict loop_max, + const int niters = 1) const { #ifndef AMREX_USE_GPU return this->template loop_box(f, bnd_min, bnd_max, - loop_min, loop_max); + loop_min, loop_max, niters); #else @@ -55,8 +57,9 @@ public: static_assert(CJ == 0 || CJ == 1); static_assert(CK == 0 || CK == 1); static_assert(VS > 0); + static_assert(NT > 0); - if (any(loop_min >= loop_max)) + if (niters <= 0 || any(loop_max <= loop_min)) return; // Run on GPU @@ -77,35 +80,55 @@ public: CJ ? amrex::IndexType::CELL : amrex::IndexType::NODE, CK ? amrex::IndexType::CELL : amrex::IndexType::NODE)); - amrex::ParallelFor( - box, [=, *this] CCTK_DEVICE(const int i, const int j, - const int k) CCTK_ATTRIBUTE_ALWAYS_INLINE { - const vect I = {i, j, k}; - const vect NI = - vect(I > bnd_max1 - 1) - vect(I < bnd_min1); - const vect I0 = - if_else(NI == 0, 0, if_else(NI < 0, bnd_min1, bnd_max1 - 1)); - const vect BI = - vect(I == bnd_max1 - 1) - vect(I == bnd_min1); - const PointDesc p = point_desc({CI, CJ, CK}, I, NI, I0, BI, bnd_min1, - bnd_max1, loop_min1, loop_max1); - f(p); - }); + if (niters == 1) + amrex::ParallelFor( + box, + [=, *this] CCTK_DEVICE(const int i, const int j, + const int k) CCTK_ATTRIBUTE_ALWAYS_INLINE { + const vect I = {i, j, k}; + const vect NI = + vect(I > bnd_max1 - 1) - vect(I < bnd_min1); + const vect I0 = + if_else(NI == 0, 0, if_else(NI < 0, bnd_min1, bnd_max1 - 1)); + const vect BI = vect(I == bnd_max1 - 1) - + vect(I == bnd_min1); + constexpr int iter = 0; + const PointDesc p = + point_desc({CI, CJ, CK}, I, iter, NI, I0, BI, bnd_min1, + bnd_max1, loop_min1, loop_max1); + f(p); + }); + else + amrex::ParallelFor( + box, + [=, *this] CCTK_DEVICE(const int i, const int j, + const int k) CCTK_ATTRIBUTE_ALWAYS_INLINE { + const vect I = {i, j, k}; + const vect NI = + vect(I > bnd_max1 - 1) - vect(I < bnd_min1); + const vect I0 = + if_else(NI == 0, 0, if_else(NI < 0, bnd_min1, bnd_max1 - 1)); + const vect BI = vect(I == bnd_max1 - 1) - + vect(I == bnd_min1); + for (int iter = 0; iter < niters; ++iter) { + const PointDesc p = + point_desc({CI, CJ, CK}, I, iter, NI, I0, BI, bnd_min1, + bnd_max1, loop_min1, loop_max1); + f(p); + } + }); #endif #ifdef AMREX_USE_GPU - static bool have_gpu_sync_after_every_kernel = false; - static bool gpu_sync_after_every_kernel; - if (!have_gpu_sync_after_every_kernel) { + static const bool gpu_sync_after_every_kernel = []() { int type; - const void *const gpu_sync_after_every_kernel_ptr = + const void *const ptr = CCTK_ParameterGet("gpu_sync_after_every_kernel", "CarpetX", &type); - assert(gpu_sync_after_every_kernel_ptr); + assert(ptr); assert(type == PARAMETER_BOOLEAN); - gpu_sync_after_every_kernel = - *static_cast(gpu_sync_after_every_kernel_ptr); - } + return *static_cast(ptr); + }(); if (gpu_sync_after_every_kernel) { amrex::Gpu::synchronize(); AMREX_GPU_ERROR_CHECK(); @@ -114,30 +137,33 @@ public: } // Loop over all points - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_all_device(const vect &group_nghostzones, const F &f) const { vect bnd_min, bnd_max; boundary_box(group_nghostzones, bnd_min, bnd_max); vect imin, imax; box_all(group_nghostzones, imin, imax); - loop_box_device(f, bnd_min, bnd_max, imin, imax); + loop_box_device(f, bnd_min, bnd_max, imin, imax); } // Loop over all interior points - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_int_device(const vect &group_nghostzones, const F &f) const { vect bnd_min, bnd_max; boundary_box(group_nghostzones, bnd_min, bnd_max); vect imin, imax; box_int(group_nghostzones, imin, imax); - loop_box_device(f, bnd_min, bnd_max, imin, imax); + loop_box_device(f, bnd_min, bnd_max, imin, imax); } // Loop over a part of the domain. Loop over the interior first, // then faces, then edges, then corners. - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_there_device(const vect &group_nghostzones, const vect, dim>, dim> &there, @@ -185,8 +211,8 @@ public: imax[d] = min(tmax[d], imax[d]); } - loop_box_device(f, bnd_min, bnd_max, imin, - imax); + loop_box_device(f, bnd_min, bnd_max, imin, + imax); } } // if rank } @@ -199,7 +225,8 @@ public: // Loop over all outer boundary points. This excludes ghost faces, but // includes ghost edges/corners on non-ghost faces. Loop over faces first, // then edges, then corners. - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_bnd_device(const vect &group_nghostzones, const F &f) const { vect bnd_min, bnd_max; @@ -245,8 +272,8 @@ public: imax[d] = min(tmax[d], imax[d]); } - loop_box_device(f, bnd_min, bnd_max, imin, - imax); + loop_box_device(f, bnd_min, bnd_max, imin, + imax); } } // if rank } @@ -259,7 +286,8 @@ public: #if 0 // Loop over all outer ghost points. This includes ghost edges/corners on // non-ghost faces. Loop over faces first, then edges, then corners. - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_ghosts_inclusive_device(const vect &group_nghostzones, const F &f) const { @@ -307,8 +335,8 @@ public: imax[d] = std::min(tmax[d], imax[d]); } - loop_box_boundary_device(f, imin, imax, - inormal); + loop_box_boundary_device(f, imin, imax, + inormal); } } // if rank } @@ -320,7 +348,8 @@ public: // Loop over all outer ghost points. This excludes ghost edges/corners on // non-ghost faces. Loop over faces first, then edges, then corners. - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_ghosts_device(const vect &group_nghostzones, const F &f) const { @@ -367,8 +396,8 @@ public: imax[d] = min(tmax[d], imax[d]); } - loop_box_device(f, bnd_min, bnd_max, imin, - imax); + loop_box_device(f, bnd_min, bnd_max, imin, + imax); } } // if rank } From b5bf9fff02397c67103c40b93db5b20951224f8f Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Thu, 13 Jul 2023 13:05:33 -0400 Subject: [PATCH 2/2] Loop: Pass number of iterations as template argument --- Loop/src/loop.hxx | 32 ++++++------ Loop/src/loop_device.hxx | 110 ++++++++++++++++----------------------- 2 files changed, 60 insertions(+), 82 deletions(-) diff --git a/Loop/src/loop.hxx b/Loop/src/loop.hxx index e0705c278..50543865a 100644 --- a/Loop/src/loop.hxx +++ b/Loop/src/loop.hxx @@ -145,21 +145,21 @@ public: } // Loop over a given box - template - void loop_box(const F &f, const vect &restrict bnd_min, + template + void loop_box(const vect &restrict bnd_min, const vect &restrict bnd_max, const vect &restrict loop_min, - const vect &restrict loop_max, - const int niters = 1) const { + const vect &restrict loop_max, const F &f) const { static_assert(CI == 0 || CI == 1); static_assert(CJ == 0 || CJ == 1); static_assert(CK == 0 || CK == 1); + static_assert(N >= 0); static_assert(VS > 0); - if (niters <= 0 || any(loop_max <= loop_min)) + if (N == 0 || any(loop_max <= loop_min)) return; - for (int iter = 0; iter < niters; ++iter) { + for (int iter = 0; iter < N; ++iter) { for (int k = loop_min[2]; k < loop_max[2]; ++k) { for (int j = loop_min[1]; j < loop_max[1]; ++j) { #pragma omp simd @@ -238,30 +238,30 @@ public: } // Loop over all points - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_all(const vect &group_nghostzones, const F &f) const { vect bnd_min, bnd_max; boundary_box(group_nghostzones, bnd_min, bnd_max); vect imin, imax; box_all(group_nghostzones, imin, imax); - loop_box(f, bnd_min, bnd_max, imin, imax); + loop_box(bnd_min, bnd_max, imin, imax, f); } // Loop over all interior points - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_int(const vect &group_nghostzones, const F &f) const { vect bnd_min, bnd_max; boundary_box(group_nghostzones, bnd_min, bnd_max); vect imin, imax; box_int(group_nghostzones, imin, imax); - loop_box(f, bnd_min, bnd_max, imin, imax); + loop_box(bnd_min, bnd_max, imin, imax, f); } // Loop over a part of the domain. Loop over the interior first, // then faces, then edges, then corners. - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_there(const vect &group_nghostzones, const vect, dim>, dim> &there, @@ -309,7 +309,7 @@ public: imax[d] = min(tmax[d], imax[d]); } - loop_box(f, bnd_min, bnd_max, imin, imax); + loop_box(bnd_min, bnd_max, imin, imax, f); } } // if rank } @@ -322,7 +322,7 @@ public: // Loop over all outer boundary points. This excludes ghost faces, but // includes ghost edges/corners on non-ghost faces. Loop over faces first, // then edges, then corners. - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_bnd(const vect &group_nghostzones, const F &f) const { vect bnd_min, bnd_max; @@ -368,7 +368,7 @@ public: imax[d] = min(tmax[d], imax[d]); } - loop_box(f, bnd_min, bnd_max, imin, imax); + loop_box(bnd_min, bnd_max, imin, imax, f); } } // if rank } @@ -462,7 +462,7 @@ public: // Loop over all outer ghost points. This excludes ghost edges/corners on // non-ghost faces. Loop over faces first, then edges, then corners. - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_ghosts(const vect &group_nghostzones, const F &f) const { vect bnd_min, bnd_max; @@ -508,7 +508,7 @@ public: imax[d] = min(tmax[d], imax[d]); } - loop_box(f, bnd_min, bnd_max, imin, imax); + loop_box(bnd_min, bnd_max, imin, imax, f); } } // if rank } diff --git a/Loop/src/loop_device.hxx b/Loop/src/loop_device.hxx index e6002aebe..4c256a2da 100644 --- a/Loop/src/loop_device.hxx +++ b/Loop/src/loop_device.hxx @@ -39,32 +39,33 @@ public: GridDescBaseDevice(const cGH *cctkGH) : GridDescBase(cctkGH) {} // Loop over a given box - template - void loop_box_device(const F &f, const vect &restrict bnd_min, + template + void loop_box_device(const vect &restrict bnd_min, const vect &restrict bnd_max, const vect &restrict loop_min, const vect &restrict loop_max, - const int niters = 1) const { + const F &f) const { #ifndef AMREX_USE_GPU - return this->template loop_box(f, bnd_min, bnd_max, - loop_min, loop_max, niters); + return this->template loop_box(bnd_min, bnd_max, + loop_min, loop_max, f); #else + // Run on GPU static_assert(CI == 0 || CI == 1); static_assert(CJ == 0 || CJ == 1); static_assert(CK == 0 || CK == 1); + static_assert(N >= 0); static_assert(VS > 0); static_assert(NT > 0); - if (niters <= 0 || any(loop_max <= loop_min)) - return; - - // Run on GPU static_assert(VS == 1, "Only vector size 1 is supported on GPUs"); + if (N == 0 || any(loop_max <= loop_min)) + return; + // For some reason, the arguments loop_min and loop_max cannot be captured // correctly in CUDA, but copies of them can const auto bnd_min1 = bnd_min; @@ -80,47 +81,24 @@ public: CJ ? amrex::IndexType::CELL : amrex::IndexType::NODE, CK ? amrex::IndexType::CELL : amrex::IndexType::NODE)); - if (niters == 1) - amrex::ParallelFor( - box, - [=, *this] CCTK_DEVICE(const int i, const int j, - const int k) CCTK_ATTRIBUTE_ALWAYS_INLINE { - const vect I = {i, j, k}; - const vect NI = - vect(I > bnd_max1 - 1) - vect(I < bnd_min1); - const vect I0 = - if_else(NI == 0, 0, if_else(NI < 0, bnd_min1, bnd_max1 - 1)); - const vect BI = vect(I == bnd_max1 - 1) - - vect(I == bnd_min1); - constexpr int iter = 0; + amrex::ParallelFor( + box, [=, *this] CCTK_DEVICE(const int i, const int j, + const int k) CCTK_ATTRIBUTE_ALWAYS_INLINE { + const vect I = {i, j, k}; + const vect NI = + vect(I > bnd_max1 - 1) - vect(I < bnd_min1); + const vect I0 = + if_else(NI == 0, 0, if_else(NI < 0, bnd_min1, bnd_max1 - 1)); + const vect BI = + vect(I == bnd_max1 - 1) - vect(I == bnd_min1); + for (int iter = 0; iter < N; ++iter) { const PointDesc p = point_desc({CI, CJ, CK}, I, iter, NI, I0, BI, bnd_min1, bnd_max1, loop_min1, loop_max1); f(p); - }); - else - amrex::ParallelFor( - box, - [=, *this] CCTK_DEVICE(const int i, const int j, - const int k) CCTK_ATTRIBUTE_ALWAYS_INLINE { - const vect I = {i, j, k}; - const vect NI = - vect(I > bnd_max1 - 1) - vect(I < bnd_min1); - const vect I0 = - if_else(NI == 0, 0, if_else(NI < 0, bnd_min1, bnd_max1 - 1)); - const vect BI = vect(I == bnd_max1 - 1) - - vect(I == bnd_min1); - for (int iter = 0; iter < niters; ++iter) { - const PointDesc p = - point_desc({CI, CJ, CK}, I, iter, NI, I0, BI, bnd_min1, - bnd_max1, loop_min1, loop_max1); - f(p); - } - }); - -#endif + } + }); -#ifdef AMREX_USE_GPU static const bool gpu_sync_after_every_kernel = []() { int type; const void *const ptr = @@ -137,33 +115,33 @@ public: } // Loop over all points - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_all_device(const vect &group_nghostzones, const F &f) const { vect bnd_min, bnd_max; boundary_box(group_nghostzones, bnd_min, bnd_max); vect imin, imax; box_all(group_nghostzones, imin, imax); - loop_box_device(f, bnd_min, bnd_max, imin, imax); + loop_box_device(bnd_min, bnd_max, imin, imax, f); } // Loop over all interior points - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_int_device(const vect &group_nghostzones, const F &f) const { vect bnd_min, bnd_max; boundary_box(group_nghostzones, bnd_min, bnd_max); vect imin, imax; box_int(group_nghostzones, imin, imax); - loop_box_device(f, bnd_min, bnd_max, imin, imax); + loop_box_device(bnd_min, bnd_max, imin, imax, f); } // Loop over a part of the domain. Loop over the interior first, // then faces, then edges, then corners. - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_there_device(const vect &group_nghostzones, const vect, dim>, dim> &there, @@ -211,8 +189,8 @@ public: imax[d] = min(tmax[d], imax[d]); } - loop_box_device(f, bnd_min, bnd_max, imin, - imax); + loop_box_device(bnd_min, bnd_max, imin, + imax, f); } } // if rank } @@ -225,8 +203,8 @@ public: // Loop over all outer boundary points. This excludes ghost faces, but // includes ghost edges/corners on non-ghost faces. Loop over faces first, // then edges, then corners. - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_bnd_device(const vect &group_nghostzones, const F &f) const { vect bnd_min, bnd_max; @@ -272,8 +250,8 @@ public: imax[d] = min(tmax[d], imax[d]); } - loop_box_device(f, bnd_min, bnd_max, imin, - imax); + loop_box_device(bnd_min, bnd_max, imin, + imax, f); } } // if rank } @@ -286,7 +264,7 @@ public: #if 0 // Loop over all outer ghost points. This includes ghost edges/corners on // non-ghost faces. Loop over faces first, then edges, then corners. - template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_ghosts_inclusive_device(const vect &group_nghostzones, @@ -335,7 +313,7 @@ public: imax[d] = std::min(tmax[d], imax[d]); } - loop_box_boundary_device(f, imin, imax, + loop_box_boundary_device( imin, imax, inormal); } } // if rank @@ -348,8 +326,8 @@ public: // Loop over all outer ghost points. This excludes ghost edges/corners on // non-ghost faces. Loop over faces first, then edges, then corners. - template + template inline CCTK_ATTRIBUTE_ALWAYS_INLINE void loop_ghosts_device(const vect &group_nghostzones, const F &f) const { @@ -396,8 +374,8 @@ public: imax[d] = min(tmax[d], imax[d]); } - loop_box_device(f, bnd_min, bnd_max, imin, - imax); + loop_box_device(bnd_min, bnd_max, imin, + imax, f); } } // if rank }