From b12790ebe15c37113110b2ac5fe2beeeb1bb8af7 Mon Sep 17 00:00:00 2001
From: Erik Schnetter <schnetter@gmail.com>
Date: Wed, 5 Jul 2023 14:23:30 -0400
Subject: [PATCH 1/3] Arith: Mark more functions as constexpr CCTK_HOST
 CCTK_DEVICE

---
 Arith/src/defs.hxx | 50 +++++++++++++++++++++++++++-------------------
 Arith/src/sum.hxx  | 19 +++++++++---------
 2 files changed, 39 insertions(+), 30 deletions(-)
diff --git a/Arith/src/defs.hxx b/Arith/src/defs.hxx
index c18d58475..cdecb4aaa 100644
--- a/Arith/src/defs.hxx
+++ b/Arith/src/defs.hxx
@@ -191,71 +191,74 @@ constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST auto if_else(bool c, const T &x,
 ////////////////////////////////////////////////////////////////////////////////
 
 // bitsign(i) = (-1)^i, the converse to signbit
-constexpr ARITH_DEVICE ARITH_HOST int bitsign(bool c) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST int bitsign(bool c) {
   return if_else(c, -1, 1);
 }
-constexpr ARITH_DEVICE ARITH_HOST int bitsign(int i) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST int bitsign(int i) {
   return bitsign(i % 2 != 0);
 }
 
 // Return x if y>0, -x if y<0
 template <typename T>
-inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T flipsign(const T &x, const T &y) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T flipsign(const T &x,
+                                                          const T &y) {
   using std::copysign;
   return copysign(T(1), y) * x;
 }
 
 // A max function that returns nan when any argument is nan
 template <typename T>
-inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T max1(const T &x, const T &y) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T max1(const T &x, const T &y) {
   using std::max;
   return if_else(x != x, x, if_else(y != y, y, max(x, y)));
 }
 
 // The maximum of the absolute values. This is reduces over containers.
 template <typename T>
-inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T maxabs(const T &x) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T maxabs(const T &x) {
   using std::abs;
   return abs(x);
 }
 
 template <typename T>
-inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T sumabs(const T &x) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T sumabs(const T &x) {
   using std::abs;
   return abs(x);
 }
 
 // A min function that returns nan when any argument is nan
 template <typename T>
-inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T min1(const T &x, const T &y) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T min1(const T &x, const T &y) {
   using std::min;
   return if_else(x != x, x, if_else(y != y, y, min(x, y)));
 }
 
 // Multiply-add
 template <typename T>
-inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T muladd(const T &x, const T &y,
-                                                     const T &z) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T muladd(const T &x, const T &y,
+                                                        const T &z) {
   return x * y + z;
 }
 template <typename T>
-inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T mulsub(const T &x, const T &y,
-                                                     const T &z) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T mulsub(const T &x, const T &y,
+                                                        const T &z) {
   return x * y - z;
 }
 template <typename T>
-inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T negmuladd(const T &x, const T &y,
-                                                        const T &z) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T negmuladd(const T &x,
+                                                           const T &y,
+                                                           const T &z) {
   return -(x * y) + z;
 }
 template <typename T>
-inline ARITH_INLINE ARITH_DEVICE ARITH_HOST T negmulsub(const T &x, const T &y,
-                                                        const T &z) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T negmulsub(const T &x,
+                                                           const T &y,
+                                                           const T &z) {
   return -(x * y) - z;
 }
 
 // Factorial
-inline ARITH_INLINE ARITH_DEVICE ARITH_HOST int factorial(int n) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST int factorial(int n) {
   int r = 1;
   while (n > 0)
     r *= n--;
@@ -263,7 +266,8 @@ inline ARITH_INLINE ARITH_DEVICE ARITH_HOST int factorial(int n) {
 }
 
 namespace detail {
-template <typename T> constexpr T pown(const T &x, int n) {
+template <typename T>
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T pown(const T &x, int n) {
   T r{1};
   T y{x};
   // invariant: initial(x^n) == r * y^n
@@ -278,18 +282,22 @@ template <typename T> constexpr T pown(const T &x, int n) {
 } // namespace detail
 
 // Raise a value to an integer power, calculated efficiently
-template <typename T> constexpr T pown(const T &x, const int n) {
+template <typename T>
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T pown(const T &x, const int n) {
   return n >= 0 ? detail::pown(x, n) : 1 / detail::pown(x, -n);
 }
 
-template <typename T> constexpr T pow2(const T &x) { return x * x; }
+template <typename T>
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T pow2(const T &x) {
+  return x * x;
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 
 // Linear combination
 template <typename T, typename I>
-constexpr T lincom(const I &i0, const T &x0, const I &i1, const T &x1,
-                   const I &i) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T
+lincom(const I &i0, const T &x0, const I &i1, const T &x1, const I &i) {
   return T(i - i1) / T(i0 - i1) * x0 + T(i - i0) / T(i1 - i0) * x1;
 }
 
diff --git a/Arith/src/sum.hxx b/Arith/src/sum.hxx
index 3f918f4fb..d7c96adb6 100644
--- a/Arith/src/sum.hxx
+++ b/Arith/src/sum.hxx
@@ -18,8 +18,9 @@ using namespace std;
 // summand.
 
 template <int D, typename F>
-constexpr ARITH_INLINE remove_cv_t<remove_reference_t<result_of_t<F(int)> > >
-sum(F f) {
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST
+    remove_cv_t<remove_reference_t<result_of_t<F(int)> > >
+    sum(F f) {
   typedef remove_cv_t<remove_reference_t<result_of_t<F(int)> > > R;
   R s = zero<R>();
   for (int x = 0; x < D; ++x)
@@ -28,8 +29,8 @@ sum(F f) {
 }
 
 template <int D, typename F>
-constexpr
-    ARITH_INLINE remove_cv_t<remove_reference_t<result_of_t<F(int, int)> > >
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST
+    remove_cv_t<remove_reference_t<result_of_t<F(int, int)> > >
     sum(F f) {
   typedef remove_cv_t<remove_reference_t<result_of_t<F(int, int)> > > R;
   R s = zero<R>();
@@ -40,7 +41,7 @@ constexpr
 }
 
 template <int D, typename F>
-constexpr ARITH_INLINE
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST
     remove_cv_t<remove_reference_t<result_of_t<F(int, int, int)> > >
     sum(F f) {
   typedef remove_cv_t<remove_reference_t<result_of_t<F(int, int, int)> > > R;
@@ -53,7 +54,7 @@ constexpr ARITH_INLINE
 }
 
 template <int D, typename F>
-constexpr ARITH_INLINE
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST
     remove_cv_t<remove_reference_t<result_of_t<F(int, int, int, int)> > >
     sum(F f) {
   typedef remove_cv_t<remove_reference_t<result_of_t<F(int, int, int, int)> > >
@@ -68,8 +69,8 @@ constexpr ARITH_INLINE
 }
 
 template <int D, typename F>
-constexpr
-    ARITH_INLINE remove_cv_t<remove_reference_t<result_of_t<F(int, int)> > >
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST
+    remove_cv_t<remove_reference_t<result_of_t<F(int, int)> > >
     sum_symm(F f) {
   typedef remove_cv_t<remove_reference_t<result_of_t<F(int, int)> > > R;
   R s = zero<R>();
@@ -80,7 +81,7 @@ constexpr
 }
 
 template <int D, typename F>
-constexpr ARITH_INLINE
+constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST
     remove_cv_t<remove_reference_t<result_of_t<F(int, int, int)> > >
     sum_symm(F f) {
   typedef remove_cv_t<remove_reference_t<result_of_t<F(int, int, int)> > > R;

From 06b1dbda99ad7af2fe71a3c98303d34ed81b52be Mon Sep 17 00:00:00 2001
From: Erik Schnetter <schnetter@gmail.com>
Date: Wed, 5 Jul 2023 14:24:23 -0400
Subject: [PATCH 2/3] CarpetX: Avoid run-time errors on Summit with CUDA 11.5.2

---
 CarpetX/src/boundaries.hxx      | 12 ++++++------
 CarpetX/src/boundaries_impl.hxx | 14 ++++++++------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/CarpetX/src/boundaries.hxx b/CarpetX/src/boundaries.hxx
index 1a0e52fcc..4e3011189 100644
--- a/CarpetX/src/boundaries.hxx
+++ b/CarpetX/src/boundaries.hxx
@@ -108,17 +108,17 @@ struct BoundaryKernel {
 
   static constexpr int maxncomps = 10;
 
-  std::array<CCTK_REAL, maxncomps> dirichlet_values;
+  Arith::vect<CCTK_REAL, maxncomps> dirichlet_values;
 
   Arith::vect<int, dim> neumann_source;
 
   Arith::vect<int, dim> linear_extrapolation_source;
 
   Arith::vect<int, dim> robin_source;
-  std::array<CCTK_REAL, maxncomps> robin_values;
+  Arith::vect<CCTK_REAL, maxncomps> robin_values;
 
   Arith::vect<int, dim> reflection_offset;
-  std::array<CCTK_REAL, maxncomps> reflection_parities;
+  Arith::vect<CCTK_REAL, maxncomps> reflection_parities;
 
   inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_DEVICE void
   operator()(const Arith::vect<int, dim> &dst, int cmin, int cmax) const;
@@ -158,13 +158,13 @@ struct BoundaryCondition {
       const Arith::vect<symmetry_t, dim> symmetries,
       const Arith::vect<boundary_t, dim> boundaries,
       //
-      const std::array<CCTK_REAL, BoundaryKernel::maxncomps> &dirichlet_values,
+      const Arith::vect<CCTK_REAL, BoundaryKernel::maxncomps> &dirichlet_values,
       const Arith::vect<int, dim> &neumann_source,
       const Arith::vect<int, dim> &linear_extrapolation_source,
       const Arith::vect<int, dim> &robin_source,
-      const std::array<CCTK_REAL, BoundaryKernel::maxncomps> &robin_values,
+      const Arith::vect<CCTK_REAL, BoundaryKernel::maxncomps> &robin_values,
       const Arith::vect<int, dim> &reflection_offset,
-      const std::array<CCTK_REAL, BoundaryKernel::maxncomps>
+      const Arith::vect<CCTK_REAL, BoundaryKernel::maxncomps>
           &reflection_parities) const {
     return BoundaryKernel{
 #ifdef CCTK_DEBUG
diff --git a/CarpetX/src/boundaries_impl.hxx b/CarpetX/src/boundaries_impl.hxx
index df967c1e2..dbceb3943 100644
--- a/CarpetX/src/boundaries_impl.hxx
+++ b/CarpetX/src/boundaries_impl.hxx
@@ -219,7 +219,7 @@ void BoundaryCondition::apply_on_face_symbcxyz(
   } else {
     // This is the generic case for applying boundary conditions.
 
-    std::array<CCTK_REAL, BoundaryKernel::maxncomps> dirichlet_values;
+    Arith::vect<CCTK_REAL, BoundaryKernel::maxncomps> dirichlet_values;
     for (int comp = 0; comp < ncomps; ++comp)
       dirichlet_values[comp] = groupdata.dirichlet_values.at(comp);
 
@@ -269,7 +269,7 @@ void BoundaryCondition::apply_on_face_symbcxyz(
       }
     }
 
-    std::array<CCTK_REAL, BoundaryKernel::maxncomps> robin_values;
+    Arith::vect<CCTK_REAL, BoundaryKernel::maxncomps> robin_values;
     for (int comp = 0; comp < ncomps; ++comp)
       robin_values[comp] = groupdata.robin_values.at(comp);
 
@@ -289,7 +289,7 @@ void BoundaryCondition::apply_on_face_symbcxyz(
       }
     }
 
-    std::array<CCTK_REAL, BoundaryKernel::maxncomps> reflection_parities;
+    Arith::vect<CCTK_REAL, BoundaryKernel::maxncomps> reflection_parities;
     for (int comp = 0; comp < ncomps; ++comp) {
       CCTK_REAL reflection_parity = +1;
       for (int d = 0; d < dim; ++d)
@@ -300,12 +300,15 @@ void BoundaryCondition::apply_on_face_symbcxyz(
       reflection_parities[comp] = reflection_parity;
     }
 
+    // We cannot capture `destptr` directly (on Summit, with CUDA 11.5.2)
+    CCTK_REAL *restrict const destptr1 = destptr;
+
     const auto kernel =
         [
 #ifdef CCTK_DEBUG
             amin = amin, amax = amax, dmin = dmin, dmax = dmax,
 #endif
-            xmin = xmin, dx = dx, layout = layout, destptr = destptr,
+            xmin = xmin, dx = dx, layout = layout, destptr = destptr1,
             //
             cmin, cmax, dirichlet_values, neumann_source,
             linear_extrapolation_source, robin_source, robin_values,
@@ -367,8 +370,7 @@ void BoundaryCondition::apply_on_face_symbcxyz(
                 }
               }
 #ifdef CCTK_DEBUG
-              for (int d = 0; d < dim; ++d)
-                assert(src[d] >= dmin[d] && src[d] < dmax[d]);
+              assert(all(src >= dmin && src < dmax));
               assert(!all(src == dst));
 #endif
 

From 535c9efb7b9e4b987fb8eb5edd2478b96bfd3ca1 Mon Sep 17 00:00:00 2001
From: Erik Schnetter <schnetter@gmail.com>
Date: Wed, 5 Jul 2023 18:06:39 -0400
Subject: [PATCH 3/3] CarpetX: Don't use `restrict` in boundary kernels

---
 CarpetX/src/boundaries_impl.hxx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CarpetX/src/boundaries_impl.hxx b/CarpetX/src/boundaries_impl.hxx
index dbceb3943..7d0d1fb12 100644
--- a/CarpetX/src/boundaries_impl.hxx
+++ b/CarpetX/src/boundaries_impl.hxx
@@ -301,7 +301,8 @@ void BoundaryCondition::apply_on_face_symbcxyz(
     }
 
     // We cannot capture `destptr` directly (on Summit, with CUDA 11.5.2)
-    CCTK_REAL *restrict const destptr1 = destptr;
+    // We cannot use a `restrict` declaration either.
+    CCTK_REAL *const destptr1 = destptr;
 
     const auto kernel =
         [