From b12790ebe15c37113110b2ac5fe2beeeb1bb8af7 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Wed, 5 Jul 2023 14:23:30 -0400 Subject: [PATCH 1/3] Arith: Mark more functions as constexpr CCTK_HOST CCTK_DEVICE --- Arith/src/defs.hxx | 50 +++++++++++++++++++++++++++------------------- Arith/src/sum.hxx | 19 +++++++++--------- 2 files changed, 39 insertions(+), 30 deletions(-) diff --git a/Arith/src/defs.hxx b/Arith/src/defs.hxx index c18d58475..cdecb4aaa 100644 --- a/Arith/src/defs.hxx +++ b/Arith/src/defs.hxx @@ -191,71 +191,74 @@ constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST auto if_else(bool c, const T &x, //////////////////////////////////////////////////////////////////////////////// // bitsign(i) = (-1)^i, the converse to signbit -constexpr ARITH_DEVICE ARITH_HOST int bitsign(bool c) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST int bitsign(bool c) { return if_else(c, -1, 1); } -constexpr ARITH_DEVICE ARITH_HOST int bitsign(int i) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST int bitsign(int i) { return bitsign(i % 2 != 0); } // Return x if y>0, -x if y<0 template -inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T flipsign(const T &x, const T &y) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T flipsign(const T &x, + const T &y) { using std::copysign; return copysign(T(1), y) * x; } // A max function that returns nan when any argument is nan template -inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T max1(const T &x, const T &y) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T max1(const T &x, const T &y) { using std::max; return if_else(x != x, x, if_else(y != y, y, max(x, y))); } // The maximum of the absolute values. This is reduces over containers. template -inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T maxabs(const T &x) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T maxabs(const T &x) { using std::abs; return abs(x); } template -inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T sumabs(const T &x) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T sumabs(const T &x) { using std::abs; return abs(x); } // A min function that returns nan when any argument is nan template -inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T min1(const T &x, const T &y) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T min1(const T &x, const T &y) { using std::min; return if_else(x != x, x, if_else(y != y, y, min(x, y))); } // Multiply-add template -inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T muladd(const T &x, const T &y, - const T &z) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T muladd(const T &x, const T &y, + const T &z) { return x * y + z; } template -inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T mulsub(const T &x, const T &y, - const T &z) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T mulsub(const T &x, const T &y, + const T &z) { return x * y - z; } template -inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T negmuladd(const T &x, const T &y, - const T &z) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T negmuladd(const T &x, + const T &y, + const T &z) { return -(x * y) + z; } template -inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T negmulsub(const T &x, const T &y, - const T &z) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T negmulsub(const T &x, + const T &y, + const T &z) { return -(x * y) - z; } // Factorial -inline ARITH_INLINE ARITH_DEVICE ARITH_HOST int factorial(int n) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST int factorial(int n) { int r = 1; while (n > 0) r *= n--; @@ -263,7 +266,8 @@ inline ARITH_INLINE ARITH_DEVICE ARITH_HOST int factorial(int n) { } namespace detail { -template constexpr T pown(const T &x, int n) { +template +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T pown(const T &x, int n) { T r{1}; T y{x}; // invariant: initial(x^n) == r * y^n @@ -278,18 +282,22 @@ template constexpr T pown(const T &x, int n) { } // namespace detail // Raise a value to an integer power, calculated efficiently -template constexpr T pown(const T &x, const int n) { +template +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T pown(const T &x, const int n) { return n >= 0 ? detail::pown(x, n) : 1 / detail::pown(x, -n); } -template constexpr T pow2(const T &x) { return x * x; } +template +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T pow2(const T &x) { + return x * x; +} //////////////////////////////////////////////////////////////////////////////// // Linear combination template -constexpr T lincom(const I &i0, const T &x0, const I &i1, const T &x1, - const I &i) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T +lincom(const I &i0, const T &x0, const I &i1, const T &x1, const I &i) { return T(i - i1) / T(i0 - i1) * x0 + T(i - i0) / T(i1 - i0) * x1; } diff --git a/Arith/src/sum.hxx b/Arith/src/sum.hxx index 3f918f4fb..d7c96adb6 100644 --- a/Arith/src/sum.hxx +++ b/Arith/src/sum.hxx @@ -18,8 +18,9 @@ using namespace std; // summand. template -constexpr ARITH_INLINE remove_cv_t > > -sum(F f) { +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST + remove_cv_t > > + sum(F f) { typedef remove_cv_t > > R; R s = zero(); for (int x = 0; x < D; ++x) @@ -28,8 +29,8 @@ sum(F f) { } template -constexpr - ARITH_INLINE remove_cv_t > > +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST + remove_cv_t > > sum(F f) { typedef remove_cv_t > > R; R s = zero(); @@ -40,7 +41,7 @@ constexpr } template -constexpr ARITH_INLINE +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST remove_cv_t > > sum(F f) { typedef remove_cv_t > > R; @@ -53,7 +54,7 @@ constexpr ARITH_INLINE } template -constexpr ARITH_INLINE +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST remove_cv_t > > sum(F f) { typedef remove_cv_t > > @@ -68,8 +69,8 @@ constexpr ARITH_INLINE } template -constexpr - ARITH_INLINE remove_cv_t > > +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST + remove_cv_t > > sum_symm(F f) { typedef remove_cv_t > > R; R s = zero(); @@ -80,7 +81,7 @@ constexpr } template -constexpr ARITH_INLINE +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST remove_cv_t > > sum_symm(F f) { typedef remove_cv_t > > R; From 06b1dbda99ad7af2fe71a3c98303d34ed81b52be Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Wed, 5 Jul 2023 14:24:23 -0400 Subject: [PATCH 2/3] CarpetX: Avoid run-time errors on Summit with CUDA 11.5.2 --- CarpetX/src/boundaries.hxx | 12 ++++++------ CarpetX/src/boundaries_impl.hxx | 14 ++++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/CarpetX/src/boundaries.hxx b/CarpetX/src/boundaries.hxx index 1a0e52fcc..4e3011189 100644 --- a/CarpetX/src/boundaries.hxx +++ b/CarpetX/src/boundaries.hxx @@ -108,17 +108,17 @@ struct BoundaryKernel { static constexpr int maxncomps = 10; - std::array dirichlet_values; + Arith::vect dirichlet_values; Arith::vect neumann_source; Arith::vect linear_extrapolation_source; Arith::vect robin_source; - std::array robin_values; + Arith::vect robin_values; Arith::vect reflection_offset; - std::array reflection_parities; + Arith::vect reflection_parities; inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_DEVICE void operator()(const Arith::vect &dst, int cmin, int cmax) const; @@ -158,13 +158,13 @@ struct BoundaryCondition { const Arith::vect symmetries, const Arith::vect boundaries, // - const std::array &dirichlet_values, + const Arith::vect &dirichlet_values, const Arith::vect &neumann_source, const Arith::vect &linear_extrapolation_source, const Arith::vect &robin_source, - const std::array &robin_values, + const Arith::vect &robin_values, const Arith::vect &reflection_offset, - const std::array + const Arith::vect &reflection_parities) const { return BoundaryKernel{ #ifdef CCTK_DEBUG diff --git a/CarpetX/src/boundaries_impl.hxx b/CarpetX/src/boundaries_impl.hxx index df967c1e2..dbceb3943 100644 --- a/CarpetX/src/boundaries_impl.hxx +++ b/CarpetX/src/boundaries_impl.hxx @@ -219,7 +219,7 @@ void BoundaryCondition::apply_on_face_symbcxyz( } else { // This is the generic case for applying boundary conditions. - std::array dirichlet_values; + Arith::vect dirichlet_values; for (int comp = 0; comp < ncomps; ++comp) dirichlet_values[comp] = groupdata.dirichlet_values.at(comp); @@ -269,7 +269,7 @@ void BoundaryCondition::apply_on_face_symbcxyz( } } - std::array robin_values; + Arith::vect robin_values; for (int comp = 0; comp < ncomps; ++comp) robin_values[comp] = groupdata.robin_values.at(comp); @@ -289,7 +289,7 @@ void BoundaryCondition::apply_on_face_symbcxyz( } } - std::array reflection_parities; + Arith::vect reflection_parities; for (int comp = 0; comp < ncomps; ++comp) { CCTK_REAL reflection_parity = +1; for (int d = 0; d < dim; ++d) @@ -300,12 +300,15 @@ void BoundaryCondition::apply_on_face_symbcxyz( reflection_parities[comp] = reflection_parity; } + // We cannot capture `destptr` directly (on Summit, with CUDA 11.5.2) + CCTK_REAL *restrict const destptr1 = destptr; + const auto kernel = [ #ifdef CCTK_DEBUG amin = amin, amax = amax, dmin = dmin, dmax = dmax, #endif - xmin = xmin, dx = dx, layout = layout, destptr = destptr, + xmin = xmin, dx = dx, layout = layout, destptr = destptr1, // cmin, cmax, dirichlet_values, neumann_source, linear_extrapolation_source, robin_source, robin_values, @@ -367,8 +370,7 @@ void BoundaryCondition::apply_on_face_symbcxyz( } } #ifdef CCTK_DEBUG - for (int d = 0; d < dim; ++d) - assert(src[d] >= dmin[d] && src[d] < dmax[d]); + assert(all(src >= dmin && src < dmax)); assert(!all(src == dst)); #endif From 535c9efb7b9e4b987fb8eb5edd2478b96bfd3ca1 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Wed, 5 Jul 2023 18:06:39 -0400 Subject: [PATCH 3/3] CarpetX: Don't use `restrict` in boundary kernels --- CarpetX/src/boundaries_impl.hxx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CarpetX/src/boundaries_impl.hxx b/CarpetX/src/boundaries_impl.hxx index dbceb3943..7d0d1fb12 100644 --- a/CarpetX/src/boundaries_impl.hxx +++ b/CarpetX/src/boundaries_impl.hxx @@ -301,7 +301,8 @@ void BoundaryCondition::apply_on_face_symbcxyz( } // We cannot capture `destptr` directly (on Summit, with CUDA 11.5.2) - CCTK_REAL *restrict const destptr1 = destptr; + // We cannot use a `restrict` declaration either. + CCTK_REAL *const destptr1 = destptr; const auto kernel = [