Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Loop: Support multiple iterations and varying numbers of threads in l… #174

Merged
merged 2 commits into from
Jul 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 40 additions & 35 deletions Loop/src/loop.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ struct PointDesc {
units_t<int, dim> DI; // direction unit vectors

vect<int, dim> I; // grid point
int iter; // iteration
// outward boundary normal (if in outer boundary), else zero
vect<int, dim> NI;
vect<int, dim> I0; // nearest interior point
Expand Down Expand Up @@ -91,15 +92,15 @@ struct PointDesc {
PointDesc &operator=(PointDesc &&) = default;

constexpr CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_DEVICE CCTK_HOST
PointDesc(const vect<int, dim> &I, const vect<int, dim> &NI,
PointDesc(const vect<int, dim> &I, const int iter, const vect<int, dim> &NI,
const vect<int, dim> &I0, const vect<int, dim> &BI,
const vect<int, dim> &bnd_min, const vect<int, dim> &bnd_max,
const vect<int, dim> &loop_min, const vect<int, dim> &loop_max,
const vect<CCTK_REAL, dim> &X, const vect<CCTK_REAL, dim> &DX)
: I(I), NI(NI), I0(I0), BI(BI), bnd_min(bnd_min), bnd_max(bnd_max),
loop_min(loop_min), loop_max(loop_max), X(X), DX(DX), imin(loop_min[0]),
imax(loop_max[0]), i(I[0]), j(I[1]), k(I[2]), x(X[0]), y(X[1]), z(X[2]),
dx(DX[0]), dy(DX[1]), dz(DX[2]) {}
: I(I), iter(iter), NI(NI), I0(I0), BI(BI), bnd_min(bnd_min),
bnd_max(bnd_max), loop_min(loop_min), loop_max(loop_max), X(X), DX(DX),
imin(loop_min[0]), imax(loop_max[0]), i(I[0]), j(I[1]), k(I[2]),
x(X[0]), y(X[1]), z(X[2]), dx(DX[0]), dy(DX[1]), dz(DX[2]) {}

friend std::ostream &operator<<(std::ostream &os, const PointDesc &p);
};
Expand Down Expand Up @@ -131,46 +132,50 @@ public:
GridDescBase(const cGH *cctkGH);

constexpr CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_DEVICE CCTK_HOST PointDesc
point_desc(const vect<bool, dim> &CI, const vect<int, dim> &I,
point_desc(const vect<bool, dim> &CI, const vect<int, dim> &I, const int iter,
const vect<int, dim> &NI, const vect<int, dim> &I0,
const vect<int, dim> &BI, const vect<int, dim> &bnd_min,
const vect<int, dim> &bnd_max, const vect<int, dim> &loop_min,
const vect<int, dim> &loop_max) const {
const vect<CCTK_REAL, dim> X =
x0 + (lbnd + I - vect<CCTK_REAL, dim>(!CI) / 2) * dx;
const vect<CCTK_REAL, dim> DX = dx;
return PointDesc(I, NI, I0, BI, bnd_min, bnd_max, loop_min, loop_max, X,
DX);
return PointDesc(I, iter, NI, I0, BI, bnd_min, bnd_max, loop_min, loop_max,
X, DX);
}

// Loop over a given box
template <int CI, int CJ, int CK, int VS = 1, typename F>
void loop_box(const F &f, const vect<int, dim> &restrict bnd_min,
template <int CI, int CJ, int CK, int VS = 1, int N = 1, typename F>
void loop_box(const vect<int, dim> &restrict bnd_min,
const vect<int, dim> &restrict bnd_max,
const vect<int, dim> &restrict loop_min,
const vect<int, dim> &restrict loop_max) const {
const vect<int, dim> &restrict loop_max, const F &f) const {
static_assert(CI == 0 || CI == 1);
static_assert(CJ == 0 || CJ == 1);
static_assert(CK == 0 || CK == 1);
static_assert(N >= 0);
static_assert(VS > 0);

if (any(loop_min >= loop_max))
if (N == 0 || any(loop_max <= loop_min))
return;

for (int k = loop_min[2]; k < loop_max[2]; ++k) {
for (int j = loop_min[1]; j < loop_max[1]; ++j) {
for (int iter = 0; iter < N; ++iter) {
for (int k = loop_min[2]; k < loop_max[2]; ++k) {
for (int j = loop_min[1]; j < loop_max[1]; ++j) {
#pragma omp simd
for (int i = loop_min[0]; i < loop_max[0]; i += VS) {
const vect<int, dim> I = {i, j, k};
const vect<int, dim> NI =
vect<int, dim>(I > bnd_max - 1) - vect<int, dim>(I < bnd_min);
const vect<int, dim> I0 =
if_else(NI == 0, 0, if_else(NI < 0, bnd_min, bnd_max - 1));
const vect<int, dim> BI =
vect<int, dim>(I == bnd_max - 1) - vect<int, dim>(I == bnd_min);
const PointDesc p = point_desc({CI, CJ, CK}, I, NI, I0, BI, bnd_min,
bnd_max, loop_min, loop_max);
f(p);
for (int i = loop_min[0]; i < loop_max[0]; i += VS) {
const vect<int, dim> I = {i, j, k};
const vect<int, dim> NI =
vect<int, dim>(I > bnd_max - 1) - vect<int, dim>(I < bnd_min);
const vect<int, dim> I0 =
if_else(NI == 0, 0, if_else(NI < 0, bnd_min, bnd_max - 1));
const vect<int, dim> BI =
vect<int, dim>(I == bnd_max - 1) - vect<int, dim>(I == bnd_min);
const PointDesc p =
point_desc({CI, CJ, CK}, I, iter, NI, I0, BI, bnd_min, bnd_max,
loop_min, loop_max);
f(p);
}
}
}
}
Expand Down Expand Up @@ -233,30 +238,30 @@ public:
}

// Loop over all points
template <int CI, int CJ, int CK, int VS = 1, typename F>
template <int CI, int CJ, int CK, int VS = 1, int N = 1, typename F>
inline CCTK_ATTRIBUTE_ALWAYS_INLINE void
loop_all(const vect<int, dim> &group_nghostzones, const F &f) const {
vect<int, dim> bnd_min, bnd_max;
boundary_box<CI, CJ, CK>(group_nghostzones, bnd_min, bnd_max);
vect<int, dim> imin, imax;
box_all<CI, CJ, CK>(group_nghostzones, imin, imax);
loop_box<CI, CJ, CK, VS>(f, bnd_min, bnd_max, imin, imax);
loop_box<CI, CJ, CK, VS, N>(bnd_min, bnd_max, imin, imax, f);
}

// Loop over all interior points
template <int CI, int CJ, int CK, int VS = 1, typename F>
template <int CI, int CJ, int CK, int VS = 1, int N = 1, typename F>
inline CCTK_ATTRIBUTE_ALWAYS_INLINE void
loop_int(const vect<int, dim> &group_nghostzones, const F &f) const {
vect<int, dim> bnd_min, bnd_max;
boundary_box<CI, CJ, CK>(group_nghostzones, bnd_min, bnd_max);
vect<int, dim> imin, imax;
box_int<CI, CJ, CK>(group_nghostzones, imin, imax);
loop_box<CI, CJ, CK, VS>(f, bnd_min, bnd_max, imin, imax);
loop_box<CI, CJ, CK, VS, N>(bnd_min, bnd_max, imin, imax, f);
}

// Loop over a part of the domain. Loop over the interior first,
// then faces, then edges, then corners.
template <int CI, int CJ, int CK, int VS = 1, typename F>
template <int CI, int CJ, int CK, int VS = 1, int N = 1, typename F>
inline CCTK_ATTRIBUTE_ALWAYS_INLINE void
loop_there(const vect<int, dim> &group_nghostzones,
const vect<vect<vect<bool, dim>, dim>, dim> &there,
Expand Down Expand Up @@ -304,7 +309,7 @@ public:
imax[d] = min(tmax[d], imax[d]);
}

loop_box<CI, CJ, CK, VS>(f, bnd_min, bnd_max, imin, imax);
loop_box<CI, CJ, CK, VS, N>(bnd_min, bnd_max, imin, imax, f);
}
} // if rank
}
Expand All @@ -317,7 +322,7 @@ public:
// Loop over all outer boundary points. This excludes ghost faces, but
// includes ghost edges/corners on non-ghost faces. Loop over faces first,
// then edges, then corners.
template <int CI, int CJ, int CK, int VS = 1, typename F>
template <int CI, int CJ, int CK, int VS = 1, int N = 1, typename F>
inline CCTK_ATTRIBUTE_ALWAYS_INLINE void
loop_bnd(const vect<int, dim> &group_nghostzones, const F &f) const {
vect<int, dim> bnd_min, bnd_max;
Expand Down Expand Up @@ -363,7 +368,7 @@ public:
imax[d] = min(tmax[d], imax[d]);
}

loop_box<CI, CJ, CK, VS>(f, bnd_min, bnd_max, imin, imax);
loop_box<CI, CJ, CK, VS, N>(bnd_min, bnd_max, imin, imax, f);
}
} // if rank
}
Expand Down Expand Up @@ -457,7 +462,7 @@ public:

// Loop over all outer ghost points. This excludes ghost edges/corners on
// non-ghost faces. Loop over faces first, then edges, then corners.
template <int CI, int CJ, int CK, int VS = 1, typename F>
template <int CI, int CJ, int CK, int VS = 1, int N = 1, typename F>
inline CCTK_ATTRIBUTE_ALWAYS_INLINE void
loop_ghosts(const vect<int, dim> &group_nghostzones, const F &f) const {
vect<int, dim> bnd_min, bnd_max;
Expand Down Expand Up @@ -503,7 +508,7 @@ public:
imax[d] = min(tmax[d], imax[d]);
}

loop_box<CI, CJ, CK, VS>(f, bnd_min, bnd_max, imin, imax);
loop_box<CI, CJ, CK, VS, N>(bnd_min, bnd_max, imin, imax, f);
}
} // if rank
}
Expand Down
87 changes: 47 additions & 40 deletions Loop/src/loop_device.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -39,29 +39,33 @@ public:
GridDescBaseDevice(const cGH *cctkGH) : GridDescBase(cctkGH) {}

// Loop over a given box
template <int CI, int CJ, int CK, int VS = 1, typename F>
void loop_box_device(const F &f, const vect<int, dim> &restrict bnd_min,
template <int CI, int CJ, int CK, int VS = 1, int N = 1,
int NT = AMREX_GPU_MAX_THREADS, typename F>
void loop_box_device(const vect<int, dim> &restrict bnd_min,
const vect<int, dim> &restrict bnd_max,
const vect<int, dim> &restrict loop_min,
const vect<int, dim> &restrict loop_max) const {
const vect<int, dim> &restrict loop_max,
const F &f) const {
#ifndef AMREX_USE_GPU

return this->template loop_box<CI, CJ, CK, VS>(f, bnd_min, bnd_max,
loop_min, loop_max);
return this->template loop_box<CI, CJ, CK, VS, N>(bnd_min, bnd_max,
loop_min, loop_max, f);

#else
// Run on GPU

static_assert(CI == 0 || CI == 1);
static_assert(CJ == 0 || CJ == 1);
static_assert(CK == 0 || CK == 1);
static_assert(N >= 0);
static_assert(VS > 0);
static_assert(NT > 0);

if (any(loop_min >= loop_max))
return;

// Run on GPU
static_assert(VS == 1, "Only vector size 1 is supported on GPUs");

if (N == 0 || any(loop_max <= loop_min))
return;

// For some reason, the arguments loop_min and loop_max cannot be captured
// correctly in CUDA, but copies of them can
const auto bnd_min1 = bnd_min;
Expand All @@ -77,7 +81,7 @@ public:
CJ ? amrex::IndexType::CELL : amrex::IndexType::NODE,
CK ? amrex::IndexType::CELL : amrex::IndexType::NODE));

amrex::ParallelFor(
amrex::ParallelFor<NT>(
box, [=, *this] CCTK_DEVICE(const int i, const int j,
const int k) CCTK_ATTRIBUTE_ALWAYS_INLINE {
const vect<int, dim> I = {i, j, k};
Expand All @@ -87,25 +91,22 @@ public:
if_else(NI == 0, 0, if_else(NI < 0, bnd_min1, bnd_max1 - 1));
const vect<int, dim> BI =
vect<int, dim>(I == bnd_max1 - 1) - vect<int, dim>(I == bnd_min1);
const PointDesc p = point_desc({CI, CJ, CK}, I, NI, I0, BI, bnd_min1,
bnd_max1, loop_min1, loop_max1);
f(p);
for (int iter = 0; iter < N; ++iter) {
const PointDesc p =
point_desc({CI, CJ, CK}, I, iter, NI, I0, BI, bnd_min1,
bnd_max1, loop_min1, loop_max1);
f(p);
}
});

#endif

#ifdef AMREX_USE_GPU
static bool have_gpu_sync_after_every_kernel = false;
static bool gpu_sync_after_every_kernel;
if (!have_gpu_sync_after_every_kernel) {
static const bool gpu_sync_after_every_kernel = []() {
int type;
const void *const gpu_sync_after_every_kernel_ptr =
const void *const ptr =
CCTK_ParameterGet("gpu_sync_after_every_kernel", "CarpetX", &type);
assert(gpu_sync_after_every_kernel_ptr);
assert(ptr);
assert(type == PARAMETER_BOOLEAN);
gpu_sync_after_every_kernel =
*static_cast<const CCTK_INT *>(gpu_sync_after_every_kernel_ptr);
}
return *static_cast<const CCTK_INT *>(ptr);
}();
if (gpu_sync_after_every_kernel) {
amrex::Gpu::synchronize();
AMREX_GPU_ERROR_CHECK();
Expand All @@ -114,30 +115,33 @@ public:
}

// Loop over all points
template <int CI, int CJ, int CK, int VS = 1, typename F>
template <int CI, int CJ, int CK, int VS = 1, int N = 1,
int NT = AMREX_GPU_MAX_THREADS, typename F>
inline CCTK_ATTRIBUTE_ALWAYS_INLINE void
loop_all_device(const vect<int, dim> &group_nghostzones, const F &f) const {
vect<int, dim> bnd_min, bnd_max;
boundary_box<CI, CJ, CK>(group_nghostzones, bnd_min, bnd_max);
vect<int, dim> imin, imax;
box_all<CI, CJ, CK>(group_nghostzones, imin, imax);
loop_box_device<CI, CJ, CK, VS>(f, bnd_min, bnd_max, imin, imax);
loop_box_device<CI, CJ, CK, VS, N, NT>(bnd_min, bnd_max, imin, imax, f);
}

// Loop over all interior points
template <int CI, int CJ, int CK, int VS = 1, typename F>
template <int CI, int CJ, int CK, int VS = 1, int N = 1,
int NT = AMREX_GPU_MAX_THREADS, typename F>
inline CCTK_ATTRIBUTE_ALWAYS_INLINE void
loop_int_device(const vect<int, dim> &group_nghostzones, const F &f) const {
vect<int, dim> bnd_min, bnd_max;
boundary_box<CI, CJ, CK>(group_nghostzones, bnd_min, bnd_max);
vect<int, dim> imin, imax;
box_int<CI, CJ, CK>(group_nghostzones, imin, imax);
loop_box_device<CI, CJ, CK, VS>(f, bnd_min, bnd_max, imin, imax);
loop_box_device<CI, CJ, CK, VS, N, NT>(bnd_min, bnd_max, imin, imax, f);
}

// Loop over a part of the domain. Loop over the interior first,
// then faces, then edges, then corners.
template <int CI, int CJ, int CK, int VS = 1, typename F>
template <int CI, int CJ, int CK, int VS = 1, int N = 1,
int NT = AMREX_GPU_MAX_THREADS, typename F>
inline CCTK_ATTRIBUTE_ALWAYS_INLINE void
loop_there_device(const vect<int, dim> &group_nghostzones,
const vect<vect<vect<bool, dim>, dim>, dim> &there,
Expand Down Expand Up @@ -185,8 +189,8 @@ public:
imax[d] = min(tmax[d], imax[d]);
}

loop_box_device<CI, CJ, CK, VS>(f, bnd_min, bnd_max, imin,
imax);
loop_box_device<CI, CJ, CK, VS, N, NT>(bnd_min, bnd_max, imin,
imax, f);
}
} // if rank
}
Expand All @@ -199,7 +203,8 @@ public:
// Loop over all outer boundary points. This excludes ghost faces, but
// includes ghost edges/corners on non-ghost faces. Loop over faces first,
// then edges, then corners.
template <int CI, int CJ, int CK, int VS = 1, typename F>
template <int CI, int CJ, int CK, int VS = 1, int N = 1,
int NT = AMREX_GPU_MAX_THREADS, typename F>
inline CCTK_ATTRIBUTE_ALWAYS_INLINE void
loop_bnd_device(const vect<int, dim> &group_nghostzones, const F &f) const {
vect<int, dim> bnd_min, bnd_max;
Expand Down Expand Up @@ -245,8 +250,8 @@ public:
imax[d] = min(tmax[d], imax[d]);
}

loop_box_device<CI, CJ, CK, VS>(f, bnd_min, bnd_max, imin,
imax);
loop_box_device<CI, CJ, CK, VS, N, NT>(bnd_min, bnd_max, imin,
imax, f);
}
} // if rank
}
Expand All @@ -259,7 +264,8 @@ public:
#if 0
// Loop over all outer ghost points. This includes ghost edges/corners on
// non-ghost faces. Loop over faces first, then edges, then corners.
template <int CI, int CJ, int CK, int VS = 1, typename F>
template <int CI, int CJ, int CK, int N=1,int VS = 1, int NT = AMREX_GPU_MAX_THREADS,
typename F>
inline CCTK_ATTRIBUTE_ALWAYS_INLINE void
loop_ghosts_inclusive_device(const vect<int, dim> &group_nghostzones,
const F &f) const {
Expand Down Expand Up @@ -307,8 +313,8 @@ public:
imax[d] = std::min(tmax[d], imax[d]);
}

loop_box_boundary_device<CI, CJ, CK, VS>(f, imin, imax,
inormal);
loop_box_boundary_device<CI, CJ, CK,VS, N, NT>( imin, imax,
inormal);
}
} // if rank
}
Expand All @@ -320,7 +326,8 @@ public:

// Loop over all outer ghost points. This excludes ghost edges/corners on
// non-ghost faces. Loop over faces first, then edges, then corners.
template <int CI, int CJ, int CK, int VS = 1, typename F>
template <int CI, int CJ, int CK, int VS = 1, int N = 1,
int NT = AMREX_GPU_MAX_THREADS, typename F>
inline CCTK_ATTRIBUTE_ALWAYS_INLINE void
loop_ghosts_device(const vect<int, dim> &group_nghostzones,
const F &f) const {
Expand Down Expand Up @@ -367,8 +374,8 @@ public:
imax[d] = min(tmax[d], imax[d]);
}

loop_box_device<CI, CJ, CK, VS>(f, bnd_min, bnd_max, imin,
imax);
loop_box_device<CI, CJ, CK, VS, N, NT>(bnd_min, bnd_max, imin,
imax, f);
}
} // if rank
}
Expand Down