From b225a2cefd44b919058ac0fde12393fb4d2bc07e Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Thu, 18 Nov 2021 19:23:49 -0800 Subject: [PATCH 01/21] CUDA and HIP forall, with tests. --- include/RAJA/policy/cuda/forall.hpp | 14 +++--- include/RAJA/policy/cuda/policy.hpp | 16 +++++-- include/RAJA/policy/cuda/scan.hpp | 13 +++-- include/RAJA/policy/cuda/sort.hpp | 48 +++++++++---------- include/RAJA/policy/hip/forall.hpp | 14 +++--- include/RAJA/policy/hip/policy.hpp | 16 +++++-- include/RAJA/policy/hip/scan.hpp | 13 +++-- include/RAJA/policy/hip/sort.hpp | 48 +++++++++---------- include/RAJA/util/resource.hpp | 20 ++++++++ test/include/RAJA_test-forall-execpol.hpp | 6 ++- .../algorithm/tests/test-algorithm-sort.hpp | 6 ++- .../tests/test-algorithm-stable-sort.hpp | 6 ++- 12 files changed, 134 insertions(+), 86 deletions(-) diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index 626b506796..aedfdf2e27 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -130,10 +130,11 @@ __device__ __forceinline__ unsigned int getGlobalNumThreads_3D_3D() ****************************************************************************** */ template -__launch_bounds__(BlockSize, 1) __global__ +__launch_bounds__(BlockSize, BlocksPerSM) __global__ void forall_cuda_kernel(LOOP_BODY loop_body, const Iterator idx, IndexType length) @@ -157,9 +158,9 @@ __launch_bounds__(BlockSize, 1) __global__ //////////////////////////////////////////////////////////////////////// // -template +template RAJA_INLINE resources::EventProxy forall_impl(resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, Iterable&& iter, LoopBody&& loop_body) { @@ -167,7 +168,7 @@ RAJA_INLINE resources::EventProxy forall_impl(resources::Cuda c using LOOP_BODY = camp::decay; using IndexType = camp::decay; - auto func = impl::forall_cuda_kernel; + auto func = impl::forall_cuda_kernel; // // Compute the requested iteration space size @@ -238,11 +239,12 @@ RAJA_INLINE resources::EventProxy forall_impl(resources::Cuda c */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Cuda r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -251,7 +253,7 @@ forall_impl(resources::Cuda r, iset.segmentCall(r, isi, detail::CallForall(), - cuda_exec(), + cuda_exec_explicit(), loop_body); } // iterate over segments of index set diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 855d59deb5..9dedf5cc89 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -74,8 +74,8 @@ namespace policy namespace cuda { -template -struct cuda_exec : public RAJA::make_policy_pattern_launch_platform_t< +template +struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::forall, detail::get_launch::value, @@ -220,10 +220,16 @@ struct cuda_synchronize : make_policy_pattern_launch_t -using cuda_exec_async = policy::cuda::cuda_exec; +template +using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit; + +template +using cuda_exec = policy::cuda::cuda_exec_explicit; + +template +using cuda_exec_async = policy::cuda::cuda_exec_explicit; using policy::cuda::cuda_work; diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp index 978a1cb0a1..d7ed28c39c 100644 --- a/include/RAJA/policy/cuda/scan.hpp +++ b/include/RAJA/policy/cuda/scan.hpp @@ -42,12 +42,12 @@ namespace scan \brief explicit inclusive 
inplace scan given range, function, and initial value */ -template +template RAJA_INLINE resources::EventProxy inclusive_inplace( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, InputIter begin, InputIter end, Function binary_op) @@ -90,6 +90,7 @@ inclusive_inplace( initial value */ template exclusive_inplace( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, InputIter begin, InputIter end, Function binary_op, @@ -144,6 +145,7 @@ exclusive_inplace( initial value */ template inclusive( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, InputIter begin, InputIter end, OutputIter out, @@ -196,6 +198,7 @@ inclusive( initial value */ template exclusive( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/cuda/sort.hpp b/include/RAJA/policy/cuda/sort.hpp index e91831bbc8..53c8661ec1 100644 --- a/include/RAJA/policy/cuda/sort.hpp +++ b/include/RAJA/policy/cuda/sort.hpp @@ -44,7 +44,7 @@ namespace sort /*! \brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -54,7 +54,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, Iter, Iter, Compare) @@ -75,13 +75,13 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, Iter begin, Iter end, operators::less>) @@ -143,13 +143,13 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, Iter begin, Iter end, operators::greater>) @@ -212,7 +212,7 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -222,7 +222,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, Iter, Iter, Compare) @@ -243,13 +243,13 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Cuda cuda_res, - cuda_exec p, + cuda_exec_explicit p, Iter begin, Iter end, operators::less> comp) @@ -260,13 +260,13 @@ unstable( /*! \brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Cuda cuda_res, - cuda_exec p, + cuda_exec_explicit p, Iter begin, Iter end, operators::greater> comp) @@ -278,7 +278,7 @@ unstable( /*! \brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, KeyIter, KeyIter, ValIter, @@ -314,7 +314,7 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -322,7 +322,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -396,7 +396,7 @@ stable_pairs( /*! 
\brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -404,7 +404,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -479,7 +479,7 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, KeyIter, KeyIter, ValIter, @@ -515,7 +515,7 @@ unstable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -523,7 +523,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Cuda cuda_res, - cuda_exec p, + cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -535,7 +535,7 @@ unstable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -543,7 +543,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Cuda cuda_res, - cuda_exec p, + cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index 99e1b8a63a..6e643459f5 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -128,10 +128,11 @@ __device__ __forceinline__ unsigned int getGlobalNumThreads_3D_3D() ****************************************************************************** */ template -__launch_bounds__(BlockSize, 1) __global__ +__launch_bounds__(BlockSize, BlocksPerSM) __global__ void forall_hip_kernel(LOOP_BODY loop_body, const Iterator idx, IndexType length) @@ -155,9 +156,9 @@ __launch_bounds__(BlockSize, 1) __global__ //////////////////////////////////////////////////////////////////////// // -template +template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip_res, - hip_exec, + hip_exec_explicit, Iterable&& iter, LoopBody&& loop_body) { @@ -165,7 +166,7 @@ RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip using LOOP_BODY = camp::decay; using IndexType = camp::decay; - auto func = impl::forall_hip_kernel; + auto func = impl::forall_hip_kernel; // // Compute the requested iteration space size @@ -235,11 +236,12 @@ RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -248,7 +250,7 @@ forall_impl(resources::Hip r, iset.segmentCall(r, isi, detail::CallForall(), - hip_exec(), + hip_exec_explicit(), loop_body); } // iterate over segments of index set diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index d9b8d62fff..8d0db60ca1 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -70,8 +70,8 @@ namespace policy namespace hip { -template -struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< +template +struct hip_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::forall, detail::get_launch::value, @@ -222,10 +222,16 @@ using hip_atomic = hip_atomic_explicit; } // end namespace hip } // end namespace policy -using policy::hip::hip_exec; +using 
policy::hip::hip_exec_explicit; -template -using hip_exec_async = policy::hip::hip_exec; +template +using hip_exec_explicit_async = policy::hip::hip_exec_explicit; + +template +using hip_exec = policy::hip::hip_exec_explicit; + +template +using hip_exec_async = policy::hip::hip_exec_explicit; using policy::hip::hip_work; diff --git a/include/RAJA/policy/hip/scan.hpp b/include/RAJA/policy/hip/scan.hpp index 85bc494abb..999837586c 100644 --- a/include/RAJA/policy/hip/scan.hpp +++ b/include/RAJA/policy/hip/scan.hpp @@ -47,12 +47,12 @@ namespace scan \brief explicit inclusive inplace scan given range, function, and initial value */ -template +template RAJA_INLINE resources::EventProxy inclusive_inplace( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, InputIter begin, InputIter end, Function binary_op) @@ -116,6 +116,7 @@ inclusive_inplace( initial value */ template exclusive_inplace( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, InputIter begin, InputIter end, Function binary_op, @@ -192,6 +193,7 @@ exclusive_inplace( initial value */ template inclusive( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, InputIter begin, InputIter end, OutputIter out, @@ -264,6 +266,7 @@ inclusive( initial value */ template exclusive( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp index 9090721ff5..05537342cb 100644 --- a/include/RAJA/policy/hip/sort.hpp +++ b/include/RAJA/policy/hip/sort.hpp @@ -73,7 +73,7 @@ namespace detail /*! \brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -83,7 +83,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, Iter, Iter, Compare) @@ -102,13 +102,13 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, Iter begin, Iter end, operators::less>) @@ -190,13 +190,13 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, Iter begin, Iter end, operators::greater>) @@ -279,7 +279,7 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -289,7 +289,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, Iter, Iter, Compare) @@ -308,13 +308,13 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - hip_exec p, + hip_exec_explicit p, Iter begin, Iter end, operators::less> comp) @@ -325,13 +325,13 @@ unstable( /*! \brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - hip_exec p, + hip_exec_explicit p, Iter begin, Iter end, operators::greater> comp) @@ -343,7 +343,7 @@ unstable( /*! 
\brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, KeyIter, KeyIter, ValIter, @@ -379,7 +379,7 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -387,7 +387,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -483,7 +483,7 @@ stable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -491,7 +491,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -588,7 +588,7 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, KeyIter, KeyIter, ValIter, @@ -624,7 +624,7 @@ unstable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -632,7 +632,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - hip_exec p, + hip_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -644,7 +644,7 @@ unstable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -652,7 +652,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - hip_exec p, + hip_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/util/resource.hpp b/include/RAJA/util/resource.hpp index dc50c2ea83..7fca8332bd 100644 --- a/include/RAJA/util/resource.hpp +++ b/include/RAJA/util/resource.hpp @@ -72,6 +72,16 @@ namespace RAJA struct get_resource>>{ using type = camp::resources::Cuda; }; + + template + struct get_resource>{ + using type = camp::resources::Cuda; + }; + + template + struct get_resource>>{ + using type = camp::resources::Cuda; + }; #endif #if defined(RAJA_HIP_ACTIVE) @@ -89,6 +99,16 @@ namespace RAJA struct get_resource>>{ using type = camp::resources::Hip; }; + + template + struct get_resource>{ + using type = camp::resources::Hip; + }; + + template + struct get_resource>>{ + using type = camp::resources::Hip; + }; #endif #if defined(RAJA_ENABLE_SYCL) diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index ce2039e248..27d4624049 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -123,7 +123,8 @@ using OpenMPTargetForallAtomicExecPols = OpenMPTargetForallExecPols; #if defined(RAJA_ENABLE_CUDA) using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>, - RAJA::cuda_exec<256> >; + RAJA::cuda_exec<256>, + RAJA::cuda_exec_explicit<256,2> >; using CudaForallReduceExecPols = CudaForallExecPols; @@ -133,7 +134,8 @@ using CudaForallAtomicExecPols = CudaForallExecPols; #if defined(RAJA_ENABLE_HIP) using HipForallExecPols = camp::list< RAJA::hip_exec<128>, - RAJA::hip_exec<256> >; + RAJA::hip_exec<256>, + RAJA::hip_exec_explicit<256,2> >; using HipForallReduceExecPols = HipForallExecPols; 
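A minimal sketch of how the new forall policy from this patch can be exercised in user code. Illustrative only: N, a, x, and y are hypothetical device-visible data, and 256 threads with 2 blocks per SM simply mirrors the test configuration above; the plain cuda_exec alias now forwards to cuda_exec_explicit with a default blocks-per-SM, so existing code is unchanged.

    // Hypothetical daxpy using the explicit forall policy added in this patch.
    // (Assumes #include "RAJA/RAJA.hpp" and a CUDA build.)
    // The second template argument is forwarded to __launch_bounds__(256, 2)
    // on forall_cuda_kernel; it only adjusts the occupancy hint, not results.
    RAJA::forall<RAJA::cuda_exec_explicit<256, 2>>(
        RAJA::RangeSegment(0, N),
        [=] RAJA_DEVICE (int i) {
          y[i] += a * x[i];
        });

The hip_exec_explicit counterpart is used the same way, and the _async aliases skip the trailing synchronize.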
diff --git a/test/unit/algorithm/tests/test-algorithm-sort.hpp b/test/unit/algorithm/tests/test-algorithm-sort.hpp index 046ed493ef..c0f584b80a 100644 --- a/test/unit/algorithm/tests/test-algorithm-sort.hpp +++ b/test/unit/algorithm/tests/test-algorithm-sort.hpp @@ -116,7 +116,8 @@ using TBBSortSorters = using CudaSortSorters = camp::list< PolicySort>, - PolicySortPairs> + PolicySortPairs>, + PolicySort> >; #endif @@ -126,7 +127,8 @@ using CudaSortSorters = using HipSortSorters = camp::list< PolicySort>, - PolicySortPairs> + PolicySortPairs>, + PolicySort> >; #endif diff --git a/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp b/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp index 00f65fae86..1001d12970 100644 --- a/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp +++ b/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp @@ -116,7 +116,8 @@ using TBBStableSortSorters = using CudaStableSortSorters = camp::list< PolicyStableSort>, - PolicyStableSortPairs> + PolicyStableSortPairs>, + PolicyStableSort> >; #endif @@ -126,7 +127,8 @@ using CudaStableSortSorters = using HipStableSortSorters = camp::list< PolicyStableSort>, - PolicyStableSortPairs> + PolicyStableSortPairs>, + PolicyStableSort> >; #endif From f0722347434550e30b76050abf7e5a75e6f2431f Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Fri, 19 Nov 2021 09:42:09 -0800 Subject: [PATCH 02/21] Missing comma. --- include/RAJA/policy/hip/sort.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp index 05537342cb..13b1151ca7 100644 --- a/include/RAJA/policy/hip/sort.hpp +++ b/include/RAJA/policy/hip/sort.hpp @@ -83,7 +83,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Hip hip_res, - hip_exec_explicit, + hip_exec_explicit, Iter, Iter, Compare) From a8f424e13dc6b0df4b1c9664bf5fbead42ecbad8 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Fri, 19 Nov 2021 16:11:00 -0800 Subject: [PATCH 03/21] Fix HIP, cleanup kernel. --- .../RAJA/policy/cuda/kernel/CudaKernel.hpp | 40 +++++------ include/RAJA/policy/hip/forall.hpp | 2 +- include/RAJA/policy/hip/kernel/HipKernel.hpp | 68 ++++++++++++------- 3 files changed, 67 insertions(+), 43 deletions(-) diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index 276b2e9553..2655ee630b 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -45,28 +45,30 @@ namespace RAJA /*! * CUDA kernel launch policy where the user may specify the number of physical - * thread blocks and threads per block. + * thread blocks, threads per block, and blocks per SM. * If num_blocks is 0 and num_threads is non-zero then num_blocks is chosen at * runtime. * Num_blocks is chosen to maximize the number of blocks running concurrently. - * If num_threads and num_blocks are both 0 then num_threads and num_blocks are - * chosen at runtime. - * Num_threads and num_blocks are determined by the CUDA occupancy calculator. - * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at - * runtime. - * Num_threads is 1024, which may not be appropriate for all kernels. + * Blocks per SM must be chosen by the user. */ template -struct cuda_launch {}; +struct cuda_explicit_launch {}; /*! * CUDA kernel launch policy where the user specifies the number of physical * thread blocks and threads per block. * If num_blocks is 0 then num_blocks is chosen at runtime. 
* Num_blocks is chosen to maximize the number of blocks running concurrently. + * If num_threads and num_blocks are both 0 then num_threads and num_blocks are + * chosen at runtime. + * Num_threads and num_blocks are determined by the CUDA occupancy calculator. + * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at + * runtime. + * Num_threads is 1024, which may not be appropriate for all kernels. + * Blocks per SM defaults to 1. */ -template -using cuda_explicit_launch = cuda_launch; +template +using cuda_launch = cuda_explicit_launch; /*! @@ -75,7 +77,7 @@ using cuda_explicit_launch = cuda_launch -using cuda_occ_calc_launch = cuda_launch; +using cuda_occ_calc_launch = cuda_explicit_launch; namespace statement { @@ -87,7 +89,7 @@ namespace statement */ template struct CudaKernelExt - : public internal::Statement, EnclosedStmts...> { + : public internal::Statement, EnclosedStmts...> { }; @@ -97,9 +99,9 @@ struct CudaKernelExt * calculator determine the unspecified values. * The kernel launch is synchronous. */ -template +template using CudaKernelExp = - CudaKernelExt, EnclosedStmts...>; + CudaKernelExt, EnclosedStmts...>; /*! * A RAJA::kernel statement that launches a CUDA kernel with the flexibility @@ -107,9 +109,9 @@ using CudaKernelExp = * calculator determine the unspecified values. * The kernel launch is asynchronous. */ -template +template using CudaKernelExpAsync = - CudaKernelExt, EnclosedStmts...>; + CudaKernelExt, EnclosedStmts...>; /*! * A RAJA::kernel statement that launches a CUDA kernel using the @@ -136,7 +138,7 @@ using CudaKernelOccAsync = */ template using CudaKernelFixed = - CudaKernelExt::max(), num_threads, 1>, + CudaKernelExt::max(), num_threads>, EnclosedStmts...>; /*! @@ -156,7 +158,7 @@ using CudaKernelFixedSM = */ template using CudaKernelFixedAsync = - CudaKernelExt::max(), num_threads, 1>, + CudaKernelExt::max(), num_threads>, EnclosedStmts...>; /*! @@ -271,7 +273,7 @@ struct CudaLaunchHelper; * determined at runtime using the CUDA occupancy calculator. */ template -struct CudaLaunchHelper,StmtList,Data,Types> +struct CudaLaunchHelper,StmtList,Data,Types> { using Self = CudaLaunchHelper; diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index 6e643459f5..0ec9c5c1e9 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -158,7 +158,7 @@ __launch_bounds__(BlockSize, BlocksPerSM) __global__ template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip_res, - hip_exec_explicit, + hip_exec_explicit, Iterable&& iter, LoopBody&& loop_body) { diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 3a905ca723..70b1afe88f 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -45,28 +45,30 @@ namespace RAJA /*! * HIP kernel launch policy where the user may specify the number of physical - * thread blocks and threads per block. + * thread blocks, threads per block, and blocks per SM. * If num_blocks is 0 and num_threads is non-zero then num_blocks is chosen at * runtime. * Num_blocks is chosen to maximize the number of blocks running concurrently. - * If num_threads and num_blocks are both 0 then num_threads and num_blocks are - * chosen at runtime. - * Num_threads and num_blocks are determined by the HIP occupancy calculator. - * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at - * runtime. 
- * Num_threads is 1024, which may not be appropriate for all kernels. + * Blocks per SM must be chosen by the user. */ -template -struct hip_launch {}; +template +struct hip_explicit_launch {}; /*! * HIP kernel launch policy where the user specifies the number of physical * thread blocks and threads per block. * If num_blocks is 0 then num_blocks is chosen at runtime. * Num_blocks is chosen to maximize the number of blocks running concurrently. + * If num_threads and num_blocks are both 0 then num_threads and num_blocks are + * chosen at runtime. + * Num_threads and num_blocks are determined by the HIP occupancy calculator. + * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at + * runtime. + * Num_threads is 1024, which may not be appropriate for all kernels. + * Blocks per SM defaults to 1. */ template -using hip_explicit_launch = hip_launch; +using hip_launch = hip_explicit_launch; /*! @@ -75,7 +77,7 @@ using hip_explicit_launch = hip_launch; * If num_threads is 0 then num_threads is chosen at runtime. */ template -using hip_occ_calc_launch = hip_launch; +using hip_occ_calc_launch = hip_explicit_launch; namespace statement { @@ -87,7 +89,7 @@ namespace statement */ template struct HipKernelExt - : public internal::Statement, EnclosedStmts...> { + : public internal::Statement, EnclosedStmts...> { }; @@ -136,7 +138,17 @@ using HipKernelOccAsync = */ template using HipKernelFixed = - HipKernelExt, + HipKernelExt, + EnclosedStmts...>; + +/*! + * A RAJA::kernel statement that launches a HIP kernel with a fixed + * number of threads (specified by num_threads) and min blocks per sm. + * The kernel launch is synchronous. + */ +template +using HipKernelFixedSM = + HipKernelExt, EnclosedStmts...>; /*! @@ -146,7 +158,17 @@ using HipKernelFixed = */ template using HipKernelFixedAsync = - HipKernelExt, EnclosedStmts...>; + HipKernelExt, EnclosedStmts...>; + +/*! + * A RAJA::kernel statement that launches a HIP kernel with a fixed + * number of threads (specified by num_threads) and min blocks per sm. + * The kernel launch is asynchronous. + */ +template +using HipKernelFixedSMAsync = + HipKernelExt, + EnclosedStmts...>; /*! * A RAJA::kernel statement that launches a HIP kernel with 1024 threads @@ -189,8 +211,8 @@ __global__ void HipKernelLauncher(Data data) * * This launcher is used by the HipKerelFixed policies. */ -template -__launch_bounds__(BlockSize, 1) __global__ +template +__launch_bounds__(BlockSize, BlocksPerSM) __global__ void HipKernelLauncherFixed(Data data) { @@ -210,13 +232,13 @@ __launch_bounds__(BlockSize, 1) __global__ * The default case handles BlockSize != 0 and gets the fixed max block size * version of the kernel. */ -template +template struct HipKernelLauncherGetter { - using type = camp::decay)>; + using type = camp::decay)>; static constexpr type get() noexcept { - return internal::HipKernelLauncherFixed; + return internal::HipKernelLauncherFixed; } }; @@ -225,7 +247,7 @@ struct HipKernelLauncherGetter * block size version of the kernel. */ template -struct HipKernelLauncherGetter<0, Data, executor_t> +struct HipKernelLauncherGetter<0, 0, Data, executor_t> { using type = camp::decay)>; static constexpr type get() noexcept @@ -249,8 +271,8 @@ struct HipLaunchHelper; * The user may specify the number of threads and blocks or let one or both be * determined at runtime using the HIP occupancy calculator. 
*/ -template -struct HipLaunchHelper,StmtList,Data,Types> +template +struct HipLaunchHelper,StmtList,Data,Types> { using Self = HipLaunchHelper; @@ -258,7 +280,7 @@ struct HipLaunchHelper,StmtList,Data using executor_t = internal::hip_statement_list_executor_t; - using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, Data, executor_t>; + using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, (blocks_per_sm <= 0) ? 0 : blocks_per_sm, Data, executor_t>; inline static void recommended_blocks_threads(int shmem_size, int &recommended_blocks, int &recommended_threads) From 04c18b60b8c5f579316b0fab577fd628173843fa Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Fri, 19 Nov 2021 16:16:28 -0800 Subject: [PATCH 04/21] Update Hip kernel test. --- test/old-tests/unit/test-kernel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/old-tests/unit/test-kernel.cpp b/test/old-tests/unit/test-kernel.cpp index 6b00050aa4..69d3de8171 100644 --- a/test/old-tests/unit/test-kernel.cpp +++ b/test/old-tests/unit/test-kernel.cpp @@ -3181,7 +3181,7 @@ GPU_TEST(Kernel_gpu, HipExec1c) // Loop Fusion using Pol = KernelPolicy< - HipKernelExt, + HipKernelExt, statement::Tile<2, tile_fixed<2>, hip_block_z_loop, For<0, hip_block_x_loop, For<1, hip_block_y_loop, From 5e00d9b1277731595caefce7d8bf9a9c8c6cc9d7 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 22 Nov 2021 16:42:49 -0800 Subject: [PATCH 05/21] Explicit Teams policies. --- include/RAJA/policy/cuda/policy.hpp | 15 +-- include/RAJA/policy/cuda/teams.hpp | 121 ++++++++++++++++++++++- include/RAJA/policy/hip/policy.hpp | 15 +-- include/RAJA/policy/hip/teams.hpp | 118 +++++++++++++++++++++- test/include/RAJA_test-teams-execpol.hpp | 38 ++++++- 5 files changed, 283 insertions(+), 24 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 9dedf5cc89..94817479c9 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -82,12 +82,12 @@ struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Platform::cuda> { }; -template -struct cuda_launch_t : public RAJA::make_policy_pattern_launch_platform_t< - RAJA::Policy::cuda, - RAJA::Pattern::region, - detail::get_launch::value, - RAJA::Platform::cuda> { +template +struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< + RAJA::Policy::cuda, + RAJA::Pattern::region, + detail::get_launch::value, + RAJA::Platform::cuda> { }; @@ -261,7 +261,8 @@ using policy::cuda::cuda_synchronize; namespace expt { - using policy::cuda::cuda_launch_t; + template + using cuda_launch_t = policy::cuda::cuda_launch_explicit_t; } diff --git a/include/RAJA/policy/cuda/teams.hpp b/include/RAJA/policy/cuda/teams.hpp index 164c574c08..c5d5b74ef2 100644 --- a/include/RAJA/policy/cuda/teams.hpp +++ b/include/RAJA/policy/cuda/teams.hpp @@ -155,8 +155,8 @@ struct LaunchExecute> { } }; -template -__launch_bounds__(num_threads, 1) __global__ +template +__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) { using RAJA::internal::thread_privatize; @@ -174,7 +174,122 @@ struct LaunchExecute> { { using BODY = camp::decay; - auto func = launch_global_fcn_fixed; + auto func = launch_global_fcn_fixed; + + resources::Cuda cuda_res = resources::Cuda::get_default(); + + // + // Compute the number of blocks and threads + // + + cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), 
+ static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr cuda_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::cuda::make_launch_body( + gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); + } + + RAJA_FT_END; + } + + } + + template + static resources::EventProxy + exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn; + + /*Get the concrete resource */ + resources::Cuda cuda_res = res.get(); + + // + // Compute the number of blocks and threads + // + + cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr cuda_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::cuda::make_launch_body( + gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + { + RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); + } + } + + RAJA_FT_END; + } + + return resources::EventProxy(res); + } + +}; + + +template +struct LaunchExecute> { + + template + static void exec(LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn_fixed; resources::Cuda cuda_res = resources::Cuda::get_default(); diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 8d0db60ca1..8cb14de4b0 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -78,12 +78,12 @@ struct hip_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Platform::hip> { }; -template -struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t< - RAJA::Policy::hip, - RAJA::Pattern::region, - detail::get_launch::value, - RAJA::Platform::hip> { +template +struct hip_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< + RAJA::Policy::hip, + RAJA::Pattern::region, + detail::get_launch::value, + RAJA::Platform::hip> { }; @@ -265,7 +265,8 @@ using policy::hip::hip_synchronize; namespace expt { - using policy::hip::hip_launch_t; + template + using hip_launch_t = policy::hip::hip_launch_explicit_t; } /*! 
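The explicit launch policy introduced in this patch feeds the same (num_threads, blocks-per-SM) pair into launch_global_fcn_fixed via __launch_bounds__, much as patch 03 did for the kernel policies (CudaKernelFixedSM / HipKernelFixedSM). A rough sketch of a Teams launch using it follows, under stated assumptions: the policy composition mirrors the test lists later in this patch, the grid shape is arbitrary, the template-parameter order (async, num_threads, blocks per SM) is inferred from the LaunchExecute specializations above, and the Grid/Teams/Threads helpers follow the expt Teams interface of this period, whose names shifted between releases.

    // Host fallback plus an explicit CUDA device launch: synchronous,
    // 256 threads per team, 2 blocks per SM.
    using launch_policy = RAJA::expt::LaunchPolicy<
        RAJA::expt::seq_launch_t,
        RAJA::policy::cuda::cuda_launch_explicit_t<false, 256, 2>>;

    RAJA::expt::launch<launch_policy>(
        RAJA::expt::DEVICE,
        RAJA::expt::Grid(RAJA::expt::Teams(64),     // named Resources in some
                         RAJA::expt::Threads(256)), // releases of this era
        [=] RAJA_HOST_DEVICE (RAJA::expt::LaunchContext ctx) {
          // team/thread loops go here via RAJA::expt::loop<...>(ctx, ...)
        });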
diff --git a/include/RAJA/policy/hip/teams.hpp b/include/RAJA/policy/hip/teams.hpp index d881e61a62..75aaf3d62d 100644 --- a/include/RAJA/policy/hip/teams.hpp +++ b/include/RAJA/policy/hip/teams.hpp @@ -151,8 +151,8 @@ struct LaunchExecute> { }; -template -__launch_bounds__(num_threads, 1) __global__ +template +__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) { using RAJA::internal::thread_privatize; @@ -170,7 +170,119 @@ struct LaunchExecute> { { using BODY = camp::decay; - auto func = launch_global_fcn_fixed; + auto func = launch_global_fcn_fixed; + + resources::Hip hip_res = resources::Hip::get_default(); + + // + // Compute the number of blocks and threads + // + + hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr hip_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::hip::make_launch_body( + gridSize, blockSize, shmem, hip_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); + } + + RAJA_FT_END; + } + + } + + template + static resources::EventProxy + exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn; + + resources::Hip hip_res = res.get(); + + // + // Compute the number of blocks and threads + // + + hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr hip_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::hip::make_launch_body( + gridSize, blockSize, shmem, hip_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); + } + + RAJA_FT_END; + } + + return resources::EventProxy(res); + } + +}; + + +template +struct LaunchExecute> { + + template + static void exec(LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn_fixed; resources::Hip hip_res = resources::Hip::get_default(); diff --git a/test/include/RAJA_test-teams-execpol.hpp b/test/include/RAJA_test-teams-execpol.hpp index 60b3e5c76b..c15773005e 100644 --- a/test/include/RAJA_test-teams-execpol.hpp +++ 
b/test/include/RAJA_test-teams-execpol.hpp @@ -22,8 +22,14 @@ using seq_cuda_policies = camp::list< RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy>; +using seq_cuda_explicit_policies = camp::list< + RAJA::expt::LaunchPolicy>, + RAJA::expt::LoopPolicy, + RAJA::expt::LoopPolicy>; + using Sequential_launch_policies = camp::list< - seq_cuda_policies + seq_cuda_policies, + seq_cuda_explicit_policies >; #elif defined(RAJA_ENABLE_HIP) @@ -32,8 +38,14 @@ using seq_hip_policies = camp::list< RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy>; +using seq_hip_explicit_policies = camp::list< + RAJA::expt::LaunchPolicy>, + RAJA::expt::LoopPolicy, + RAJA::expt::LoopPolicy>; + using Sequential_launch_policies = camp::list< - seq_hip_policies + seq_hip_policies, + seq_hip_explicit_policies >; #else using Sequential_launch_policies = camp::list< @@ -54,8 +66,15 @@ using omp_cuda_policies = camp::list< RAJA::expt::LoopPolicy >; +using omp_cuda_explicit_policies = camp::list< + RAJA::expt::LaunchPolicy>, + RAJA::expt::LoopPolicy, + RAJA::expt::LoopPolicy + >; + using OpenMP_launch_policies = camp::list< - omp_cuda_policies + omp_cuda_policies, + omp_cuda_explicit_policies >; #elif defined(RAJA_ENABLE_HIP) @@ -66,8 +85,15 @@ using omp_hip_policies = camp::list< RAJA::expt::LoopPolicy >; +using omp_hip_explicit_policies = camp::list< + RAJA::expt::LaunchPolicy>, + RAJA::expt::LoopPolicy, + RAJA::expt::LoopPolicy + >; + using OpenMP_launch_policies = camp::list< - omp_hip_policies + omp_hip_policies, + omp_hip_explicit_policies >; #else using OpenMP_launch_policies = camp::list< @@ -82,8 +108,10 @@ using OpenMP_launch_policies = camp::list< #if defined(RAJA_ENABLE_CUDA) using Cuda_launch_policies = camp::list< seq_cuda_policies + , seq_cuda_explicit_policies #if defined(RAJA_ENABLE_OPENMP) , omp_cuda_policies + , omp_cuda_explicit_policies #endif >; #endif // RAJA_ENABLE_CUDA @@ -91,8 +119,10 @@ using Cuda_launch_policies = camp::list< #if defined(RAJA_ENABLE_HIP) using Hip_launch_policies = camp::list< seq_hip_policies + , seq_hip_explicit_policies #if defined(RAJA_ENABLE_OPENMP) , omp_hip_policies + , omp_hip_explicit_policies #endif >; #endif // RAJA_ENABLE_HIP From 6817a3f402ab0b9ad2cde2b570b3467a1abe07fd Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 16:36:43 -0800 Subject: [PATCH 06/21] WorkGroup modifications for CUDA and HIP. --- include/RAJA/policy/cuda/WorkGroup/Vtable.hpp | 17 ++ .../RAJA/policy/cuda/WorkGroup/WorkRunner.hpp | 273 +++++++++++++++++- include/RAJA/policy/cuda/policy.hpp | 11 +- include/RAJA/policy/hip/WorkGroup/Vtable.hpp | 17 ++ .../RAJA/policy/hip/WorkGroup/WorkRunner.hpp | 271 ++++++++++++++++- include/RAJA/policy/hip/policy.hpp | 11 +- test/include/RAJA_test-workgroup.hpp | 12 +- 7 files changed, 599 insertions(+), 13 deletions(-) diff --git a/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp b/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp index ba03b10aae..772ca6b9ce 100644 --- a/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp +++ b/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp @@ -110,6 +110,23 @@ inline const Vtable_T* get_Vtable(cuda_work const&) return &vtable; } +/*! +* Explicit BLOCKS_PER_SM version. 
+* Populate and return a Vtable object where the +* call operator is a device function +*/ +template < typename T, typename Vtable_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async > +inline const Vtable_T* get_Vtable(cuda_work_explicit const&) +{ + static Vtable_T vtable{ + &Vtable_T::template move_construct_destroy, + get_cached_Vtable_cuda_device_call(), + &Vtable_T::template destroy, + sizeof(T) + }; + return &vtable; +} + } // namespace detail } // namespace RAJA diff --git a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp index 37cc4c1e1b..67efaf23c6 100644 --- a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp @@ -87,6 +87,62 @@ struct WorkRunner< } }; +/*! + * Explicit BLOCKS_PER_SM version. + * Runs work in a storage container in order + * and returns any per run resources + */ +template +struct WorkRunner< + RAJA::cuda_work_explicit, + RAJA::ordered, + ALLOCATOR_T, + INDEX_T, + Args...> + : WorkRunnerForallOrdered< + RAJA::cuda_exec_explicit_async, + RAJA::cuda_work_explicit, + RAJA::ordered, + ALLOCATOR_T, + INDEX_T, + Args...> +{ + using base = WorkRunnerForallOrdered< + RAJA::cuda_exec_explicit_async, + RAJA::cuda_work_explicit, + RAJA::ordered, + ALLOCATOR_T, + INDEX_T, + Args...>; + using base::base; + using IndexType = INDEX_T; + using per_run_storage = typename base::per_run_storage; + + /// + /// run the loops in the given work container in order using forall + /// run all loops asynchronously and synchronize after is necessary + /// + template < typename WorkContainer > + per_run_storage run(WorkContainer const& storage, + typename base::resource_type r, Args... args) const + { + per_run_storage run_storage = + base::run(storage, r, std::forward(args)...); + + IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); + + // Only synchronize if we had something to iterate over + if (num_loops > 0 && BLOCK_SIZE > 0) { + if (!Async) { RAJA::cuda::synchronize(r); } + } + + return run_storage; + } +}; + /*! * Runs work in a storage container in reverse order * and returns any per run resources @@ -143,6 +199,63 @@ struct WorkRunner< }; +/*! + * Explicit BLOCKS_PER_SM version. + * Runs work in a storage container in reverse order + * and returns any per run resources + */ +template +struct WorkRunner< + RAJA::cuda_work_explicit, + RAJA::reverse_ordered, + ALLOCATOR_T, + INDEX_T, + Args...> + : WorkRunnerForallReverse< + RAJA::cuda_exec_explicit_async, + RAJA::cuda_work_explicit, + RAJA::reverse_ordered, + ALLOCATOR_T, + INDEX_T, + Args...> +{ + using base = WorkRunnerForallReverse< + RAJA::cuda_exec_explicit_async, + RAJA::cuda_work_explicit, + RAJA::reverse_ordered, + ALLOCATOR_T, + INDEX_T, + Args...>; + using base::base; + using IndexType = INDEX_T; + using per_run_storage = typename base::per_run_storage; + + /// + /// run the loops in the given work container in reverse order using forall + /// run all loops asynchronously and synchronize after is necessary + /// + template < typename WorkContainer > + per_run_storage run(WorkContainer const& storage, + typename base::resource_type r, Args... 
args) const + { + per_run_storage run_storage = + base::run(storage, r, std::forward(args)...); + + IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); + + // Only synchronize if we had something to iterate over + if (num_loops > 0 && BLOCK_SIZE > 0) { + if (!Async) { RAJA::cuda::synchronize(r); } + } + + return run_storage; + } +}; + + /*! * A body and segment holder for storing loops that will be executed * on the device @@ -177,11 +290,12 @@ struct HoldCudaDeviceXThreadblockLoop }; template < size_t BLOCK_SIZE, + size_t BLOCKS_PER_SM, typename StorageIter, typename value_type, typename index_type, typename ... Args > -__launch_bounds__(BLOCK_SIZE, 1) __global__ +__launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__ void cuda_unordered_y_block_global(StorageIter iter, Args... args) { const index_type i_loop = blockIdx.y; @@ -290,7 +404,162 @@ struct WorkRunner< per_run_storage run_storage{}; - auto func = cuda_unordered_y_block_global; + auto func = cuda_unordered_y_block_global; + + // + // Compute the requested iteration space size + // + Iterator begin = std::begin(storage); + Iterator end = std::end(storage); + IndexType num_loops = std::distance(begin, end); + + // Only launch kernel if we have something to iterate over + if (num_loops > 0 && BLOCK_SIZE > 0) { + + index_type average_iterations = m_total_iterations / static_cast(num_loops); + + // + // Compute the number of blocks + // + constexpr index_type block_size = static_cast(BLOCK_SIZE); + cuda_dim_t blockSize{static_cast(block_size), 1, 1}; + cuda_dim_t gridSize{static_cast((average_iterations + block_size - 1) / block_size), + static_cast(num_loops), + 1}; + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Launch the kernel + // + void* func_args[] = { (void*)&begin, (void*)&args... }; + RAJA::cuda::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async); + } + + RAJA_FT_END; + } + + return run_storage; + } + + // clear any state so ready to be destroyed or reused + void clear() + { + m_total_iterations = 0; + } + +private: + index_type m_total_iterations = 0; +}; + + +/*! + * Explicit BLOCKS_PER_SM version. 
+ * Runs work in a storage container out of order with loops mapping to + * cuda blocks in the y direction and iterations mapping to threads in + * the x direction, with the number of threads in the x dimension determined + * by the average number of iterates per loop + */ +template +struct WorkRunner< + RAJA::cuda_work_explicit, + RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average, + ALLOCATOR_T, + INDEX_T, + Args...> +{ + using exec_policy = RAJA::cuda_work_explicit; + using order_policy = RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average; + using Allocator = ALLOCATOR_T; + using index_type = INDEX_T; + using resource_type = resources::Cuda; + + using vtable_type = Vtable, Args...>; + + WorkRunner() = default; + + WorkRunner(WorkRunner const&) = delete; + WorkRunner& operator=(WorkRunner const&) = delete; + + WorkRunner(WorkRunner && o) + : m_total_iterations(o.m_total_iterations) + { + o.m_total_iterations = 0; + } + WorkRunner& operator=(WorkRunner && o) + { + m_total_iterations = o.m_total_iterations; + + o.m_total_iterations = 0; + return *this; + } + + // The type that will hold the segment and loop body in work storage + template < typename ITERABLE, typename LOOP_BODY > + using holder_type = HoldCudaDeviceXThreadblockLoop; + + // The policy indicating where the call function is invoked + // in this case the values are called on the device + using vtable_exec_policy = exec_policy; + + // runner interfaces with storage to enqueue so the runner can get + // information from the segment and loop at enqueue time + template < typename WorkContainer, typename Iterable, typename LoopBody > + inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body) + { + using Iterator = camp::decay; + using LOOP_BODY = camp::decay; + using ITERABLE = camp::decay; + using IndexType = camp::decay; + + using holder = holder_type; + + // using true_value_type = typename WorkContainer::template true_value_type; + + Iterator begin = std::begin(iter); + Iterator end = std::end(iter); + IndexType len = std::distance(begin, end); + + // Only launch kernel if we have something to iterate over + if (len > 0 && BLOCK_SIZE > 0) { + + m_total_iterations += len; + + // + // TODO: Privatize the loop_body, using make_launch_body to setup reductions + // + // LOOP_BODY body = RAJA::cuda::make_launch_body( + // gridSize, blockSize, shmem, stream, std::forward(loop_body)); + + storage.template emplace( + get_Vtable(vtable_exec_policy{}), + std::forward(iter), std::forward(loop_body)); + } + } + + // no extra storage required here + using per_run_storage = int; + + template < typename WorkContainer > + per_run_storage run(WorkContainer const& storage, resource_type r, Args... 
args) const + { + using Iterator = camp::decay; + using IndexType = camp::decay; + using value_type = typename WorkContainer::value_type; + + per_run_storage run_storage{}; + + auto func = cuda_unordered_y_block_global; // // Compute the requested iteration space size diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 94817479c9..e28db8109f 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -101,8 +101,8 @@ struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform /// /// WorkGroup execution policies /// -template -struct cuda_work : public RAJA::make_policy_pattern_launch_platform_t< +template +struct cuda_work_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::workgroup_exec, detail::get_launch::value, @@ -231,10 +231,13 @@ using cuda_exec = policy::cuda::cuda_exec_explicit; template using cuda_exec_async = policy::cuda::cuda_exec_explicit; -using policy::cuda::cuda_work; +using policy::cuda::cuda_work_explicit; + +template +using cuda_work = policy::cuda::cuda_work_explicit; template -using cuda_work_async = policy::cuda::cuda_work; +using cuda_work_async = policy::cuda::cuda_work_explicit; using policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average; diff --git a/include/RAJA/policy/hip/WorkGroup/Vtable.hpp b/include/RAJA/policy/hip/WorkGroup/Vtable.hpp index e4ce5212bf..fceac6c102 100644 --- a/include/RAJA/policy/hip/WorkGroup/Vtable.hpp +++ b/include/RAJA/policy/hip/WorkGroup/Vtable.hpp @@ -109,6 +109,23 @@ inline const Vtable_T* get_Vtable(hip_work const&) return &vtable; } +/*! +* Explicit BLOCKS_PER_SM version. +* Populate and return a Vtable object where the +* call operator is a device function +*/ +template < typename T, typename Vtable_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async > +inline const Vtable_T* get_Vtable(hip_work_explicit const&) +{ + static Vtable_T vtable{ + &Vtable_T::template move_construct_destroy, + get_cached_Vtable_hip_device_call(), + &Vtable_T::template destroy, + sizeof(T) + }; + return &vtable; +} + #endif } // namespace detail diff --git a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp index 0b1e060a6d..a42577f643 100644 --- a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp @@ -87,6 +87,62 @@ struct WorkRunner< } }; +/*! + * Explicit BLOCKS_PER_SM version. + * Runs work in a storage container in order + * and returns any per run resources + */ +template +struct WorkRunner< + RAJA::hip_work_explicit, + RAJA::ordered, + ALLOCATOR_T, + INDEX_T, + Args...> + : WorkRunnerForallOrdered< + RAJA::hip_exec_explicit_async, + RAJA::hip_work_explicit, + RAJA::ordered, + ALLOCATOR_T, + INDEX_T, + Args...> +{ + using base = WorkRunnerForallOrdered< + RAJA::hip_exec_explicit_async, + RAJA::hip_work_explicit, + RAJA::ordered, + ALLOCATOR_T, + INDEX_T, + Args...>; + using base::base; + using IndexType = INDEX_T; + using per_run_storage = typename base::per_run_storage; + + /// + /// run the loops in the given work container in order using forall + /// run all loops asynchronously and synchronize after is necessary + /// + template < typename WorkContainer > + per_run_storage run(WorkContainer const& storage, + typename base::resource_type r, Args... 
args) const + { + per_run_storage run_storage = + base::run(storage, r, std::forward(args)...); + + IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); + + // Only synchronize if we had something to iterate over + if (num_loops > 0 && BLOCK_SIZE > 0) { + if (!Async) { RAJA::hip::synchronize(r); } + } + + return run_storage; + } +}; + /*! * Runs work in a storage container in reverse order * and returns any per run resources @@ -143,6 +199,63 @@ struct WorkRunner< }; +/*! + * Explicit BLOCKS_PER_SM version. + * Runs work in a storage container in reverse order + * and returns any per run resources + */ +template +struct WorkRunner< + RAJA::hip_work_explicit, + RAJA::reverse_ordered, + ALLOCATOR_T, + INDEX_T, + Args...> + : WorkRunnerForallReverse< + RAJA::hip_exec_explicit_async, + RAJA::hip_work_explicit, + RAJA::reverse_ordered, + ALLOCATOR_T, + INDEX_T, + Args...> +{ + using base = WorkRunnerForallReverse< + RAJA::hip_exec_explicit_async, + RAJA::hip_work_explicit, + RAJA::reverse_ordered, + ALLOCATOR_T, + INDEX_T, + Args...>; + using base::base; + using IndexType = INDEX_T; + using per_run_storage = typename base::per_run_storage; + + /// + /// run the loops in the given work container in reverse order using forall + /// run all loops asynchronously and synchronize after is necessary + /// + template < typename WorkContainer > + per_run_storage run(WorkContainer const& storage, + typename base::resource_type r, Args... args) const + { + per_run_storage run_storage = + base::run(storage, r, std::forward(args)...); + + IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); + + // Only synchronize if we had something to iterate over + if (num_loops > 0 && BLOCK_SIZE > 0) { + if (!Async) { RAJA::hip::synchronize(r); } + } + + return run_storage; + } +}; + + #if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) /*! @@ -179,11 +292,12 @@ struct HoldHipDeviceXThreadblockLoop }; template < size_t BLOCK_SIZE, + size_t BLOCKS_PER_SM, typename StorageIter, typename value_type, typename index_type, typename ... Args > -__launch_bounds__(BLOCK_SIZE, 1) __global__ +__launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__ void hip_unordered_y_block_global(StorageIter iter, Args... args) { const index_type i_loop = blockIdx.y; @@ -346,6 +460,161 @@ struct WorkRunner< index_type m_total_iterations = 0; }; +/*! + * Explicit BLOCKS_PER_SM version. 
+ * Runs work in a storage container out of order with loops mapping to + * hip blocks in the y direction and iterations mapping to threads in + * the x direction, with the number of threads in the x dimension determined + * by the average number of iterates per loop + */ +template +struct WorkRunner< + RAJA::hip_work_explicit, + RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average, + ALLOCATOR_T, + INDEX_T, + Args...> +{ + using exec_policy = RAJA::hip_work_explicit; + using order_policy = RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average; + using Allocator = ALLOCATOR_T; + using index_type = INDEX_T; + using resource_type = resources::Hip; + + using vtable_type = Vtable, Args...>; + + WorkRunner() = default; + + WorkRunner(WorkRunner const&) = delete; + WorkRunner& operator=(WorkRunner const&) = delete; + + WorkRunner(WorkRunner && o) + : m_total_iterations(o.m_total_iterations) + { + o.m_total_iterations = 0; + } + WorkRunner& operator=(WorkRunner && o) + { + m_total_iterations = o.m_total_iterations; + + o.m_total_iterations = 0; + return *this; + } + + // The type that will hold the segment and loop body in work storage + template < typename ITERABLE, typename LOOP_BODY > + using holder_type = HoldHipDeviceXThreadblockLoop; + + // The policy indicating where the call function is invoked + // in this case the values are called on the device + using vtable_exec_policy = exec_policy; + + // runner interfaces with storage to enqueue so the runner can get + // information from the segment and loop at enqueue time + template < typename WorkContainer, typename Iterable, typename LoopBody > + inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body) + { + using Iterator = camp::decay; + using LOOP_BODY = camp::decay; + using ITERABLE = camp::decay; + using IndexType = camp::decay; + + using holder = holder_type; + + // using true_value_type = typename WorkContainer::template true_value_type; + + Iterator begin = std::begin(iter); + Iterator end = std::end(iter); + IndexType len = std::distance(begin, end); + + // Only launch kernel if we have something to iterate over + if (len > 0 && BLOCK_SIZE > 0) { + + m_total_iterations += len; + + // + // TODO: Privatize the loop_body, using make_launch_body to setup reductions + // + // LOOP_BODY body = RAJA::hip::make_launch_body( + // gridSize, blockSize, shmem, stream, std::forward(loop_body)); + + storage.template emplace( + get_Vtable(vtable_exec_policy{}), + std::forward(iter), std::forward(loop_body)); + } + } + + // no extra storage required here + using per_run_storage = int; + + template < typename WorkContainer > + per_run_storage run(WorkContainer const& storage, resource_type r, Args... 
args) const + { + using Iterator = camp::decay; + using IndexType = camp::decay; + using value_type = typename WorkContainer::value_type; + + per_run_storage run_storage{}; + + auto func = hip_unordered_y_block_global; + + // + // Compute the requested iteration space size + // + Iterator begin = std::begin(storage); + Iterator end = std::end(storage); + IndexType num_loops = std::distance(begin, end); + + // Only launch kernel if we have something to iterate over + if (num_loops > 0 && BLOCK_SIZE > 0) { + + index_type average_iterations = m_total_iterations / static_cast(num_loops); + + // + // Compute the number of blocks + // + constexpr index_type block_size = static_cast(BLOCK_SIZE); + hip_dim_t blockSize{static_cast(block_size), 1, 1}; + hip_dim_t gridSize{static_cast((average_iterations + block_size - 1) / block_size), + static_cast(num_loops), + 1}; + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Launch the kernel + // + void* func_args[] = { (void*)&begin, (void*)&args... }; + RAJA::hip::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async); + } + + RAJA_FT_END; + } + + return run_storage; + } + + // clear any state so ready to be destroyed or reused + void clear() + { + m_total_iterations = 0; + } + +private: + index_type m_total_iterations = 0; +}; + + #endif } // namespace detail diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 8cb14de4b0..34b23bf51e 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -94,8 +94,8 @@ struct hip_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_ /// /// WorkGroup execution policies /// -template -struct hip_work : public RAJA::make_policy_pattern_launch_platform_t< +template +struct hip_work_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::workgroup_exec, detail::get_launch::value, @@ -233,10 +233,13 @@ using hip_exec = policy::hip::hip_exec_explicit; template using hip_exec_async = policy::hip::hip_exec_explicit; -using policy::hip::hip_work; +using policy::hip::hip_work_explicit; + +template +using hip_work = policy::hip::hip_work_explicit; template -using hip_work_async = policy::hip::hip_work; +using hip_work_async = policy::hip::hip_work_explicit; using policy::hip::hip_atomic; using policy::hip::hip_atomic_explicit; diff --git a/test/include/RAJA_test-workgroup.hpp b/test/include/RAJA_test-workgroup.hpp index bc8e59ff25..e2f44039ee 100644 --- a/test/include/RAJA_test-workgroup.hpp +++ b/test/include/RAJA_test-workgroup.hpp @@ -360,8 +360,13 @@ using OpenMPTargetStoragePolicyList = SequentialStoragePolicyList; #if defined(RAJA_ENABLE_CUDA) using CudaExecPolicyList = camp::list< + #if defined(RAJA_TEST_EXHAUSTIVE) + // avoid compilation error: + // tpl/camp/include/camp/camp.hpp(104): error #456: excessive recursion at instantiation of class RAJA::cuda_work<256>, - RAJA::cuda_work<1024> + #endif + RAJA::cuda_work<1024>, + RAJA::cuda_work_explicit<256, 2> >; using CudaOrderedPolicyList = SequentialOrderedPolicyList; using CudaOrderPolicyList = @@ -376,8 +381,11 @@ using CudaStoragePolicyList = SequentialStoragePolicyList; #if defined(RAJA_ENABLE_HIP) using HipExecPolicyList = camp::list< + #if defined(RAJA_TEST_EXHAUSTIVE) RAJA::hip_work<256>, - RAJA::hip_work<1024> + #endif + RAJA::hip_work<1024>, + RAJA::hip_work_explicit<256, 2> >; using HipOrderedPolicyList = SequentialOrderedPolicyList; using HipOrderPolicyList = From 
454edc11569a334404bcb65f792d215685bf1cbf Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 17:58:03 -0800 Subject: [PATCH 07/21] Remove old redundant policies. Hide _launch_explicit_t under expt. --- include/RAJA/policy/cuda/WorkGroup/Vtable.hpp | 17 -- .../RAJA/policy/cuda/WorkGroup/WorkRunner.hpp | 268 ------------------ include/RAJA/policy/cuda/policy.hpp | 8 +- include/RAJA/policy/cuda/teams.hpp | 244 +--------------- include/RAJA/policy/hip/WorkGroup/Vtable.hpp | 17 -- .../RAJA/policy/hip/WorkGroup/WorkRunner.hpp | 267 ----------------- include/RAJA/policy/hip/policy.hpp | 5 +- include/RAJA/policy/hip/teams.hpp | 237 +--------------- test/include/RAJA_test-teams-execpol.hpp | 8 +- 9 files changed, 17 insertions(+), 1054 deletions(-) diff --git a/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp b/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp index 772ca6b9ce..7f76961d3e 100644 --- a/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp +++ b/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp @@ -98,23 +98,6 @@ inline typename Vtable_T::call_sig get_cached_Vtable_cuda_device_call() * Populate and return a Vtable object where the * call operator is a device function */ -template < typename T, typename Vtable_T, size_t BLOCK_SIZE, bool Async > -inline const Vtable_T* get_Vtable(cuda_work const&) -{ - static Vtable_T vtable{ - &Vtable_T::template move_construct_destroy, - get_cached_Vtable_cuda_device_call(), - &Vtable_T::template destroy, - sizeof(T) - }; - return &vtable; -} - -/*! -* Explicit BLOCKS_PER_SM version. -* Populate and return a Vtable object where the -* call operator is a device function -*/ template < typename T, typename Vtable_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async > inline const Vtable_T* get_Vtable(cuda_work_explicit const&) { diff --git a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp index 67efaf23c6..46e109c974 100644 --- a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp @@ -36,62 +36,6 @@ namespace detail * Runs work in a storage container in order * and returns any per run resources */ -template -struct WorkRunner< - RAJA::cuda_work, - RAJA::ordered, - ALLOCATOR_T, - INDEX_T, - Args...> - : WorkRunnerForallOrdered< - RAJA::cuda_exec_async, - RAJA::cuda_work, - RAJA::ordered, - ALLOCATOR_T, - INDEX_T, - Args...> -{ - using base = WorkRunnerForallOrdered< - RAJA::cuda_exec_async, - RAJA::cuda_work, - RAJA::ordered, - ALLOCATOR_T, - INDEX_T, - Args...>; - using base::base; - using IndexType = INDEX_T; - using per_run_storage = typename base::per_run_storage; - - /// - /// run the loops in the given work container in order using forall - /// run all loops asynchronously and synchronize after is necessary - /// - template < typename WorkContainer > - per_run_storage run(WorkContainer const& storage, - typename base::resource_type r, Args... args) const - { - per_run_storage run_storage = - base::run(storage, r, std::forward(args)...); - - IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); - - // Only synchronize if we had something to iterate over - if (num_loops > 0 && BLOCK_SIZE > 0) { - if (!Async) { RAJA::cuda::synchronize(r); } - } - - return run_storage; - } -}; - -/*! - * Explicit BLOCKS_PER_SM version. 
- * Runs work in a storage container in order - * and returns any per run resources - */ template -struct WorkRunner< - RAJA::cuda_work, - RAJA::reverse_ordered, - ALLOCATOR_T, - INDEX_T, - Args...> - : WorkRunnerForallReverse< - RAJA::cuda_exec_async, - RAJA::cuda_work, - RAJA::reverse_ordered, - ALLOCATOR_T, - INDEX_T, - Args...> -{ - using base = WorkRunnerForallReverse< - RAJA::cuda_exec_async, - RAJA::cuda_work, - RAJA::reverse_ordered, - ALLOCATOR_T, - INDEX_T, - Args...>; - using base::base; - using IndexType = INDEX_T; - using per_run_storage = typename base::per_run_storage; - - /// - /// run the loops in the given work container in reverse order using forall - /// run all loops asynchronously and synchronize after is necessary - /// - template < typename WorkContainer > - per_run_storage run(WorkContainer const& storage, - typename base::resource_type r, Args... args) const - { - per_run_storage run_storage = - base::run(storage, r, std::forward(args)...); - - IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); - - // Only synchronize if we had something to iterate over - if (num_loops > 0 && BLOCK_SIZE > 0) { - if (!Async) { RAJA::cuda::synchronize(r); } - } - - return run_storage; - } -}; - - -/*! - * Explicit BLOCKS_PER_SM version. - * Runs work in a storage container in reverse order - * and returns any per run resources - */ template -struct WorkRunner< - RAJA::cuda_work, - RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average, - ALLOCATOR_T, - INDEX_T, - Args...> -{ - using exec_policy = RAJA::cuda_work; - using order_policy = RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average; - using Allocator = ALLOCATOR_T; - using index_type = INDEX_T; - using resource_type = resources::Cuda; - - using vtable_type = Vtable, Args...>; - - WorkRunner() = default; - - WorkRunner(WorkRunner const&) = delete; - WorkRunner& operator=(WorkRunner const&) = delete; - - WorkRunner(WorkRunner && o) - : m_total_iterations(o.m_total_iterations) - { - o.m_total_iterations = 0; - } - WorkRunner& operator=(WorkRunner && o) - { - m_total_iterations = o.m_total_iterations; - - o.m_total_iterations = 0; - return *this; - } - - // The type that will hold the segment and loop body in work storage - template < typename ITERABLE, typename LOOP_BODY > - using holder_type = HoldCudaDeviceXThreadblockLoop; - - // The policy indicating where the call function is invoked - // in this case the values are called on the device - using vtable_exec_policy = exec_policy; - - // runner interfaces with storage to enqueue so the runner can get - // information from the segment and loop at enqueue time - template < typename WorkContainer, typename Iterable, typename LoopBody > - inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body) - { - using Iterator = camp::decay; - using LOOP_BODY = camp::decay; - using ITERABLE = camp::decay; - using IndexType = camp::decay; - - using holder = holder_type; - - // using true_value_type = typename WorkContainer::template true_value_type; - - Iterator begin = std::begin(iter); - Iterator end = std::end(iter); - IndexType len = std::distance(begin, end); - - // Only launch kernel if we have something to iterate over - if (len > 0 && BLOCK_SIZE > 0) { - - m_total_iterations += len; - - // - // TODO: Privatize the loop_body, using make_launch_body to setup reductions - // - // LOOP_BODY body = RAJA::cuda::make_launch_body( - // gridSize, blockSize, shmem, stream, std::forward(loop_body)); - - 
storage.template emplace( - get_Vtable(vtable_exec_policy{}), - std::forward(iter), std::forward(loop_body)); - } - } - - // no extra storage required here - using per_run_storage = int; - - template < typename WorkContainer > - per_run_storage run(WorkContainer const& storage, resource_type r, Args... args) const - { - using Iterator = camp::decay; - using IndexType = camp::decay; - using value_type = typename WorkContainer::value_type; - - per_run_storage run_storage{}; - - auto func = cuda_unordered_y_block_global; - - // - // Compute the requested iteration space size - // - Iterator begin = std::begin(storage); - Iterator end = std::end(storage); - IndexType num_loops = std::distance(begin, end); - - // Only launch kernel if we have something to iterate over - if (num_loops > 0 && BLOCK_SIZE > 0) { - - index_type average_iterations = m_total_iterations / static_cast(num_loops); - - // - // Compute the number of blocks - // - constexpr index_type block_size = static_cast(BLOCK_SIZE); - cuda_dim_t blockSize{static_cast(block_size), 1, 1}; - cuda_dim_t gridSize{static_cast((average_iterations + block_size - 1) / block_size), - static_cast(num_loops), - 1}; - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Launch the kernel - // - void* func_args[] = { (void*)&begin, (void*)&args... }; - RAJA::cuda::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async); - } - - RAJA_FT_END; - } - - return run_storage; - } - - // clear any state so ready to be destroyed or reused - void clear() - { - m_total_iterations = 0; - } - -private: - index_type m_total_iterations = 0; -}; - - -/*! - * Explicit BLOCKS_PER_SM version. - * Runs work in a storage container out of order with loops mapping to - * cuda blocks in the y direction and iterations mapping to threads in - * the x direction, with the number of threads in the x dimension determined - * by the average number of iterates per loop - */ template { }; +namespace expt +{ template struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, @@ -89,7 +91,7 @@ struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform detail::get_launch::value, RAJA::Platform::cuda> { }; - +} @@ -228,7 +230,7 @@ using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit using cuda_exec = policy::cuda::cuda_exec_explicit; -template +template using cuda_exec_async = policy::cuda::cuda_exec_explicit; using policy::cuda::cuda_work_explicit; @@ -265,7 +267,7 @@ using policy::cuda::cuda_synchronize; namespace expt { template - using cuda_launch_t = policy::cuda::cuda_launch_explicit_t; + using cuda_launch_t = policy::cuda::expt::cuda_launch_explicit_t; } diff --git a/include/RAJA/policy/cuda/teams.hpp b/include/RAJA/policy/cuda/teams.hpp index c5d5b74ef2..adc900f93d 100644 --- a/include/RAJA/policy/cuda/teams.hpp +++ b/include/RAJA/policy/cuda/teams.hpp @@ -31,130 +31,6 @@ namespace RAJA namespace expt { -template -__global__ void launch_global_fcn(LaunchContext ctx, BODY body_in) -{ - using RAJA::internal::thread_privatize; - auto privatizer = thread_privatize(body_in); - auto& body = privatizer.get_priv(); - body(ctx); -} - -template -struct LaunchExecute> { - - template - static void exec(LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn; - - resources::Cuda cuda_res = resources::Cuda::get_default(); - - // - // Compute the number of blocks and threads - // - - cuda_dim_t 
gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr cuda_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::cuda::make_launch_body( - gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - { - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); - } - } - - RAJA_FT_END; - } - - } - - template - static resources::EventProxy - exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn; - - /*Get the concrete resource */ - resources::Cuda cuda_res = res.get(); - - // - // Compute the number of blocks and threads - // - - cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr cuda_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::cuda::make_launch_body( - gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - { - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); - } - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } -}; - template __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) @@ -165,124 +41,8 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ body(ctx); } - -template -struct LaunchExecute> { - - template - static void exec(LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn_fixed; - - resources::Cuda cuda_res = resources::Cuda::get_default(); - - // - // Compute the number of blocks and threads - // - - cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr cuda_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, 
using make_launch_body to setup reductions - // - BODY body = RAJA::cuda::make_launch_body( - gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); - } - - RAJA_FT_END; - } - - } - - template - static resources::EventProxy - exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn; - - /*Get the concrete resource */ - resources::Cuda cuda_res = res.get(); - - // - // Compute the number of blocks and threads - // - - cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr cuda_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::cuda::make_launch_body( - gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - { - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); - } - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } - -}; - - template -struct LaunchExecute> { +struct LaunchExecute> { template static void exec(LaunchContext const &ctx, BODY_IN &&body_in) @@ -342,7 +102,7 @@ struct LaunchExecute; - auto func = launch_global_fcn; + auto func = launch_global_fcn_fixed; /*Get the concrete resource */ resources::Cuda cuda_res = res.get(); diff --git a/include/RAJA/policy/hip/WorkGroup/Vtable.hpp b/include/RAJA/policy/hip/WorkGroup/Vtable.hpp index fceac6c102..f2369b23c9 100644 --- a/include/RAJA/policy/hip/WorkGroup/Vtable.hpp +++ b/include/RAJA/policy/hip/WorkGroup/Vtable.hpp @@ -97,23 +97,6 @@ inline typename Vtable_T::call_sig get_cached_Vtable_hip_device_call() * Populate and return a Vtable object where the * call operator is a device function */ -template < typename T, typename Vtable_T, size_t BLOCK_SIZE, bool Async > -inline const Vtable_T* get_Vtable(hip_work const&) -{ - static Vtable_T vtable{ - &Vtable_T::template move_construct_destroy, - get_cached_Vtable_hip_device_call(), - &Vtable_T::template destroy, - sizeof(T) - }; - return &vtable; -} - -/*! -* Explicit BLOCKS_PER_SM version. 
-* Populate and return a Vtable object where the -* call operator is a device function -*/ template < typename T, typename Vtable_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async > inline const Vtable_T* get_Vtable(hip_work_explicit const&) { diff --git a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp index a42577f643..a494ca6299 100644 --- a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp @@ -36,62 +36,6 @@ namespace detail * Runs work in a storage container in order * and returns any per run resources */ -template -struct WorkRunner< - RAJA::hip_work, - RAJA::ordered, - ALLOCATOR_T, - INDEX_T, - Args...> - : WorkRunnerForallOrdered< - RAJA::hip_exec_async, - RAJA::hip_work, - RAJA::ordered, - ALLOCATOR_T, - INDEX_T, - Args...> -{ - using base = WorkRunnerForallOrdered< - RAJA::hip_exec_async, - RAJA::hip_work, - RAJA::ordered, - ALLOCATOR_T, - INDEX_T, - Args...>; - using base::base; - using IndexType = INDEX_T; - using per_run_storage = typename base::per_run_storage; - - /// - /// run the loops in the given work container in order using forall - /// run all loops asynchronously and synchronize after is necessary - /// - template < typename WorkContainer > - per_run_storage run(WorkContainer const& storage, - typename base::resource_type r, Args... args) const - { - per_run_storage run_storage = - base::run(storage, r, std::forward(args)...); - - IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); - - // Only synchronize if we had something to iterate over - if (num_loops > 0 && BLOCK_SIZE > 0) { - if (!Async) { RAJA::hip::synchronize(r); } - } - - return run_storage; - } -}; - -/*! - * Explicit BLOCKS_PER_SM version. - * Runs work in a storage container in order - * and returns any per run resources - */ template -struct WorkRunner< - RAJA::hip_work, - RAJA::reverse_ordered, - ALLOCATOR_T, - INDEX_T, - Args...> - : WorkRunnerForallReverse< - RAJA::hip_exec_async, - RAJA::hip_work, - RAJA::reverse_ordered, - ALLOCATOR_T, - INDEX_T, - Args...> -{ - using base = WorkRunnerForallReverse< - RAJA::hip_exec_async, - RAJA::hip_work, - RAJA::reverse_ordered, - ALLOCATOR_T, - INDEX_T, - Args...>; - using base::base; - using IndexType = INDEX_T; - using per_run_storage = typename base::per_run_storage; - - /// - /// run the loops in the given work container in reverse order using forall - /// run all loops asynchronously and synchronize after is necessary - /// - template < typename WorkContainer > - per_run_storage run(WorkContainer const& storage, - typename base::resource_type r, Args... args) const - { - per_run_storage run_storage = - base::run(storage, r, std::forward(args)...); - - IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); - - // Only synchronize if we had something to iterate over - if (num_loops > 0 && BLOCK_SIZE > 0) { - if (!Async) { RAJA::hip::synchronize(r); } - } - - return run_storage; - } -}; - - -/*! - * Explicit BLOCKS_PER_SM version. 
- * Runs work in a storage container in reverse order - * and returns any per run resources - */ template -struct WorkRunner< - RAJA::hip_work, - RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average, - ALLOCATOR_T, - INDEX_T, - Args...> -{ - using exec_policy = RAJA::hip_work; - using order_policy = RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average; - using Allocator = ALLOCATOR_T; - using index_type = INDEX_T; - using resource_type = resources::Hip; - - using vtable_type = Vtable, Args...>; - - WorkRunner() = default; - - WorkRunner(WorkRunner const&) = delete; - WorkRunner& operator=(WorkRunner const&) = delete; - - WorkRunner(WorkRunner && o) - : m_total_iterations(o.m_total_iterations) - { - o.m_total_iterations = 0; - } - WorkRunner& operator=(WorkRunner && o) - { - m_total_iterations = o.m_total_iterations; - - o.m_total_iterations = 0; - return *this; - } - - // The type that will hold the segment and loop body in work storage - template < typename ITERABLE, typename LOOP_BODY > - using holder_type = HoldHipDeviceXThreadblockLoop; - - // The policy indicating where the call function is invoked - // in this case the values are called on the device - using vtable_exec_policy = exec_policy; - - // runner interfaces with storage to enqueue so the runner can get - // information from the segment and loop at enqueue time - template < typename WorkContainer, typename Iterable, typename LoopBody > - inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body) - { - using Iterator = camp::decay; - using LOOP_BODY = camp::decay; - using ITERABLE = camp::decay; - using IndexType = camp::decay; - - using holder = holder_type; - - // using true_value_type = typename WorkContainer::template true_value_type; - - Iterator begin = std::begin(iter); - Iterator end = std::end(iter); - IndexType len = std::distance(begin, end); - - // Only launch kernel if we have something to iterate over - if (len > 0 && BLOCK_SIZE > 0) { - - m_total_iterations += len; - - // - // TODO: Privatize the loop_body, using make_launch_body to setup reductions - // - // LOOP_BODY body = RAJA::hip::make_launch_body( - // gridSize, blockSize, shmem, stream, std::forward(loop_body)); - - storage.template emplace( - get_Vtable(vtable_exec_policy{}), - std::forward(iter), std::forward(loop_body)); - } - } - - // no extra storage required here - using per_run_storage = int; - - template < typename WorkContainer > - per_run_storage run(WorkContainer const& storage, resource_type r, Args... 
args) const - { - using Iterator = camp::decay; - using IndexType = camp::decay; - using value_type = typename WorkContainer::value_type; - - per_run_storage run_storage{}; - - auto func = hip_unordered_y_block_global; - - // - // Compute the requested iteration space size - // - Iterator begin = std::begin(storage); - Iterator end = std::end(storage); - IndexType num_loops = std::distance(begin, end); - - // Only launch kernel if we have something to iterate over - if (num_loops > 0 && BLOCK_SIZE > 0) { - - index_type average_iterations = m_total_iterations / static_cast(num_loops); - - // - // Compute the number of blocks - // - constexpr index_type block_size = static_cast(BLOCK_SIZE); - hip_dim_t blockSize{static_cast(block_size), 1, 1}; - hip_dim_t gridSize{static_cast((average_iterations + block_size - 1) / block_size), - static_cast(num_loops), - 1}; - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Launch the kernel - // - void* func_args[] = { (void*)&begin, (void*)&args... }; - RAJA::hip::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async); - } - - RAJA_FT_END; - } - - return run_storage; - } - - // clear any state so ready to be destroyed or reused - void clear() - { - m_total_iterations = 0; - } - -private: - index_type m_total_iterations = 0; -}; - -/*! - * Explicit BLOCKS_PER_SM version. - * Runs work in a storage container out of order with loops mapping to - * hip blocks in the y direction and iterations mapping to threads in - * the x direction, with the number of threads in the x dimension determined - * by the average number of iterates per loop - */ template { }; +namespace expt +{ template struct hip_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, @@ -85,6 +87,7 @@ struct hip_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_ detail::get_launch::value, RAJA::Platform::hip> { }; +} // @@ -230,7 +233,7 @@ using hip_exec_explicit_async = policy::hip::hip_exec_explicit using hip_exec = policy::hip::hip_exec_explicit; -template +template using hip_exec_async = policy::hip::hip_exec_explicit; using policy::hip::hip_work_explicit; diff --git a/include/RAJA/policy/hip/teams.hpp b/include/RAJA/policy/hip/teams.hpp index 75aaf3d62d..d07a37e7d7 100644 --- a/include/RAJA/policy/hip/teams.hpp +++ b/include/RAJA/policy/hip/teams.hpp @@ -31,126 +31,6 @@ namespace RAJA namespace expt { -template -__global__ void launch_global_fcn(LaunchContext ctx, BODY body_in) -{ - using RAJA::internal::thread_privatize; - auto privatizer = thread_privatize(body_in); - auto& body = privatizer.get_priv(); - body(ctx); -} - -template -struct LaunchExecute> { - - template - static void exec(LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn; - - resources::Hip hip_res = resources::Hip::get_default(); - - // - // Compute the number of blocks and threads - // - - hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr hip_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers 
- // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::hip::make_launch_body( - gridSize, blockSize, shmem, hip_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); - } - - RAJA_FT_END; - } - - } - - template - static resources::EventProxy - exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn; - - resources::Hip hip_res = res.get(); - - // - // Compute the number of blocks and threads - // - - hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr hip_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::hip::make_launch_body( - gridSize, blockSize, shmem, hip_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } - -}; - template __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) @@ -161,121 +41,8 @@ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) body(ctx); } - -template -struct LaunchExecute> { - - template - static void exec(LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn_fixed; - - resources::Hip hip_res = resources::Hip::get_default(); - - // - // Compute the number of blocks and threads - // - - hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr hip_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::hip::make_launch_body( - gridSize, blockSize, shmem, hip_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); - } - - RAJA_FT_END; - } - - } - - template - static resources::EventProxy - exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn; - - resources::Hip hip_res = 
res.get(); - - // - // Compute the number of blocks and threads - // - - hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr hip_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::hip::make_launch_body( - gridSize, blockSize, shmem, hip_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } - -}; - - template -struct LaunchExecute> { +struct LaunchExecute> { template static void exec(LaunchContext const &ctx, BODY_IN &&body_in) @@ -335,7 +102,7 @@ struct LaunchExecute; - auto func = launch_global_fcn; + auto func = launch_global_fcn_fixed; resources::Hip hip_res = res.get(); diff --git a/test/include/RAJA_test-teams-execpol.hpp b/test/include/RAJA_test-teams-execpol.hpp index c15773005e..318d9d3cad 100644 --- a/test/include/RAJA_test-teams-execpol.hpp +++ b/test/include/RAJA_test-teams-execpol.hpp @@ -23,7 +23,7 @@ using seq_cuda_policies = camp::list< RAJA::expt::LoopPolicy>; using seq_cuda_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, + RAJA::expt::LaunchPolicy>, RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy>; @@ -39,7 +39,7 @@ using seq_hip_policies = camp::list< RAJA::expt::LoopPolicy>; using seq_hip_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, + RAJA::expt::LaunchPolicy>, RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy>; @@ -67,7 +67,7 @@ using omp_cuda_policies = camp::list< >; using omp_cuda_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, + RAJA::expt::LaunchPolicy>, RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy >; @@ -86,7 +86,7 @@ using omp_hip_policies = camp::list< >; using omp_hip_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, + RAJA::expt::LaunchPolicy>, RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy >; From 58f143a2535f445a54071bfb1ca6db813c10bf1e Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 18:20:10 -0800 Subject: [PATCH 08/21] Fix Hip namespace. --- include/RAJA/policy/hip/policy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 7169fabb96..75c5003ee9 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -272,7 +272,7 @@ using policy::hip::hip_synchronize; namespace expt { template - using hip_launch_t = policy::hip::hip_launch_explicit_t; + using hip_launch_t = policy::hip::expt::hip_launch_explicit_t; } /*! From 189de7c2494c1177bd77155ddd90a26aeb70629d Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 19:02:05 -0800 Subject: [PATCH 09/21] Fix HIP min warps per execution unit. 
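
On HIP the second argument to __launch_bounds__ is interpreted as the
minimum number of warps resident per execution unit, not as a minimum
block count per multiprocessor as on CUDA, so the changes below convert
the requested blocks-per-SM into a warp count. A minimal sketch of the
arithmetic, assuming the warp width of 32 implied by the divisor in the
diff (the BlockSize/BlocksPerSM values here are illustrative, not taken
from any particular caller):

    #include <cstddef>

    constexpr std::size_t BlockSize   = 256; // threads per block
    constexpr std::size_t BlocksPerSM = 2;   // requested resident blocks per SM
    // HIP expects MIN_WARPS_PER_EXECUTION_UNIT as the second bound:
    constexpr std::size_t MinWarpsPerEU = (BlockSize * BlocksPerSM) / 32; // = 16
    static_assert(MinWarpsPerEU == 16,
                  "value passed as the second __launch_bounds__ argument");
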
--- include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp | 3 ++- include/RAJA/policy/hip/forall.hpp | 3 ++- include/RAJA/policy/hip/kernel/HipKernel.hpp | 3 ++- include/RAJA/policy/hip/teams.hpp | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp index a494ca6299..2b0ceda8e8 100644 --- a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp @@ -178,13 +178,14 @@ struct HoldHipDeviceXThreadblockLoop LoopBody m_body; }; +// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template < size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename StorageIter, typename value_type, typename index_type, typename ... Args > -__launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__ +__launch_bounds__(BLOCK_SIZE, (BLOCK_SIZE * BLOCKS_PER_SM)/32) __global__ void hip_unordered_y_block_global(StorageIter iter, Args... args) { const index_type i_loop = blockIdx.y; diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index 0ec9c5c1e9..db5226524b 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -127,12 +127,13 @@ __device__ __forceinline__ unsigned int getGlobalNumThreads_3D_3D() * ****************************************************************************** */ +// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template -__launch_bounds__(BlockSize, BlocksPerSM) __global__ +__launch_bounds__(BlockSize, (BlockSize * BlocksPerSM)/32) __global__ void forall_hip_kernel(LOOP_BODY loop_body, const Iterator idx, IndexType length) diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 70b1afe88f..d5896a5948 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -211,8 +211,9 @@ __global__ void HipKernelLauncher(Data data) * * This launcher is used by the HipKerelFixed policies. */ +// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template -__launch_bounds__(BlockSize, BlocksPerSM) __global__ +__launch_bounds__(BlockSize, (BlockSize * BlocksPerSM)/32) __global__ void HipKernelLauncherFixed(Data data) { diff --git a/include/RAJA/policy/hip/teams.hpp b/include/RAJA/policy/hip/teams.hpp index d07a37e7d7..09e25049f8 100644 --- a/include/RAJA/policy/hip/teams.hpp +++ b/include/RAJA/policy/hip/teams.hpp @@ -31,8 +31,9 @@ namespace RAJA namespace expt { +// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template -__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ +__launch_bounds__(num_threads, (num_threads * BLOCKS_PER_SM)/32) __global__ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) { using RAJA::internal::thread_privatize; From b8115fd1def8d91e8004b551ff45d3458904e9b7 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 19:38:16 -0800 Subject: [PATCH 10/21] Non-zero HIP threads and blocks. 
--- test/include/RAJA_test-teams-execpol.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/include/RAJA_test-teams-execpol.hpp b/test/include/RAJA_test-teams-execpol.hpp index 318d9d3cad..0311191790 100644 --- a/test/include/RAJA_test-teams-execpol.hpp +++ b/test/include/RAJA_test-teams-execpol.hpp @@ -39,7 +39,7 @@ using seq_hip_policies = camp::list< RAJA::expt::LoopPolicy>; using seq_hip_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, + RAJA::expt::LaunchPolicy>, RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy>; @@ -86,7 +86,7 @@ using omp_hip_policies = camp::list< >; using omp_hip_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, + RAJA::expt::LaunchPolicy>, RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy >; From eed727d80b2e7170cfe30a62ffdefc89bc9e2406 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 22:27:33 -0800 Subject: [PATCH 11/21] Default HIP blocks per SM to 1. --- include/RAJA/policy/hip/policy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 75c5003ee9..4876dafbc9 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -272,7 +272,7 @@ using policy::hip::hip_synchronize; namespace expt { template - using hip_launch_t = policy::hip::expt::hip_launch_explicit_t; + using hip_launch_t = policy::hip::expt::hip_launch_explicit_t; } /*! From cd4ef2e465e1df82864d545aeda3afc838fa6328 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 22:35:56 -0800 Subject: [PATCH 12/21] HIP math. --- include/RAJA/policy/hip/policy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 4876dafbc9..88583313b9 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -271,7 +271,7 @@ using policy::hip::hip_synchronize; namespace expt { - template + template using hip_launch_t = policy::hip::expt::hip_launch_explicit_t; } From dfc6b9e4988463fd76abe46811a7841cf355b67e Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Tue, 30 Nov 2021 09:54:20 -0800 Subject: [PATCH 13/21] Comment on placeholder cuda_exec. --- include/RAJA/policy/cuda/kernel/CudaKernel.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index 2655ee630b..2d6bcd66d4 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -84,8 +84,8 @@ namespace statement /*! * A RAJA::kernel statement that launches a CUDA kernel. - * - * + * Note - Statement requires a placeholder cuda_exec policy for the sake of + * object oriented inheritance. */ template struct CudaKernelExt From 490a3f6626f0899a827c0237e580f2d89d056e50 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Tue, 30 Nov 2021 19:18:16 -0800 Subject: [PATCH 14/21] CUDA default blocks per SM. --- include/RAJA/policy/cuda/kernel/CudaKernel.hpp | 4 ++-- include/RAJA/policy/cuda/policy.hpp | 15 +++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index 2d6bcd66d4..2335ebad0e 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -68,7 +68,7 @@ struct cuda_explicit_launch {}; * Blocks per SM defaults to 1. 
*/ template -using cuda_launch = cuda_explicit_launch; +using cuda_launch = cuda_explicit_launch; /*! @@ -77,7 +77,7 @@ using cuda_launch = cuda_explicit_launch; * If num_threads is 0 then num_threads is chosen at runtime. */ template -using cuda_occ_calc_launch = cuda_explicit_launch; +using cuda_occ_calc_launch = cuda_explicit_launch; namespace statement { diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index b35b8cb1eb..918b138927 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -74,6 +74,9 @@ namespace policy namespace cuda { +constexpr const size_t MIN_BLOCKS_PER_SM = 1; +constexpr const size_t MAX_BLOCKS_PER_SM = 32; + template struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, @@ -84,7 +87,7 @@ struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< namespace expt { -template +template struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::region, @@ -228,18 +231,18 @@ template using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit; template -using cuda_exec = policy::cuda::cuda_exec_explicit; +using cuda_exec = policy::cuda::cuda_exec_explicit; template -using cuda_exec_async = policy::cuda::cuda_exec_explicit; +using cuda_exec_async = policy::cuda::cuda_exec_explicit; using policy::cuda::cuda_work_explicit; template -using cuda_work = policy::cuda::cuda_work_explicit; +using cuda_work = policy::cuda::cuda_work_explicit; template -using cuda_work_async = policy::cuda::cuda_work_explicit; +using cuda_work_async = policy::cuda::cuda_work_explicit; using policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average; @@ -267,7 +270,7 @@ using policy::cuda::cuda_synchronize; namespace expt { template - using cuda_launch_t = policy::cuda::expt::cuda_launch_explicit_t; + using cuda_launch_t = policy::cuda::expt::cuda_launch_explicit_t; } From 7e48b0c2487f83488f5b8f8467a0649b8a0f138b Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Thu, 2 Dec 2021 14:22:42 -0800 Subject: [PATCH 15/21] Adding Teams default launch execute policy back. 
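
For context on what "default" means here: the re-added specialization
below (LaunchExecute over the default cuda_launch_t) takes its grid and
block dimensions from the LaunchContext at run time and launches
launch_global_fcn, which carries no __launch_bounds__ annotation, while
the explicit path launches launch_global_fcn_fixed with
__launch_bounds__(num_threads, BLOCKS_PER_SM) baked in at compile time.
A sketch of the two policy spellings, assuming the alias signatures used
in this series (the 256/2 values are illustrative, mirroring the
cuda_work_explicit<256, 2> used in the tests):

    // Default: block size chosen at run time; num_threads placeholder unused.
    using dynamic_launch = RAJA::expt::cuda_launch_t<false /*async*/>;

    // Explicit: compile-time bounds fed straight into __launch_bounds__.
    using fixed_launch =
        RAJA::policy::cuda::expt::cuda_launch_explicit_t<false, 256, 2>;
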
--- include/RAJA/policy/cuda/policy.hpp | 3 +- include/RAJA/policy/cuda/teams.hpp | 124 ++++++++++++++++++++++++++++ include/RAJA/policy/hip/policy.hpp | 1 + include/RAJA/policy/hip/teams.hpp | 121 +++++++++++++++++++++++++++ 4 files changed, 248 insertions(+), 1 deletion(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 918b138927..1497141968 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -269,7 +269,8 @@ using policy::cuda::cuda_synchronize; namespace expt { - template + // num_threads defaults to 1, but not expected to be used in kernel launch + template using cuda_launch_t = policy::cuda::expt::cuda_launch_explicit_t; } diff --git a/include/RAJA/policy/cuda/teams.hpp b/include/RAJA/policy/cuda/teams.hpp index adc900f93d..326757e506 100644 --- a/include/RAJA/policy/cuda/teams.hpp +++ b/include/RAJA/policy/cuda/teams.hpp @@ -31,6 +31,130 @@ namespace RAJA namespace expt { +template +__global__ void launch_global_fcn(LaunchContext ctx, BODY body_in) +{ + using RAJA::internal::thread_privatize; + auto privatizer = thread_privatize(body_in); + auto& body = privatizer.get_priv(); + body(ctx); +} + +template +struct LaunchExecute> { +// cuda_launch_t num_threads set to 1, but not used in launch of kernel + + template + static void exec(LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn; + + resources::Cuda cuda_res = resources::Cuda::get_default(); + + // + // Compute the number of blocks and threads + // + + cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr cuda_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::cuda::make_launch_body( + gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); + } + + RAJA_FT_END; + } + + } + + template + static resources::EventProxy + exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn; + + /*Get the concrete resource */ + resources::Cuda cuda_res = res.get(); + + // + // Compute the number of blocks and threads + // + + cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr cuda_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, 
using make_launch_body to setup reductions + // + BODY body = RAJA::cuda::make_launch_body( + gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + { + RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); + } + } + + RAJA_FT_END; + } + + return resources::EventProxy(res); + } + +}; + template __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 88583313b9..f0886b37bd 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -271,6 +271,7 @@ using policy::hip::hip_synchronize; namespace expt { + // num_threads defaults to 1, but not expected to be used in kernel launch template using hip_launch_t = policy::hip::expt::hip_launch_explicit_t; } diff --git a/include/RAJA/policy/hip/teams.hpp b/include/RAJA/policy/hip/teams.hpp index 09e25049f8..d6e8ea28f1 100644 --- a/include/RAJA/policy/hip/teams.hpp +++ b/include/RAJA/policy/hip/teams.hpp @@ -31,6 +31,127 @@ namespace RAJA namespace expt { +template +__global__ static void launch_global_fcn(LaunchContext ctx, BODY body_in) +{ + using RAJA::internal::thread_privatize; + auto privatizer = thread_privatize(body_in); + auto& body = privatizer.get_priv(); + body(ctx); +} + +template +struct LaunchExecute> { +// hip_launch_t num_threads set to 1, but not used in launch of kernel + + template + static void exec(LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn; + + resources::Hip hip_res = resources::Hip::get_default(); + + // + // Compute the number of blocks and threads + // + + hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr hip_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::hip::make_launch_body( + gridSize, blockSize, shmem, hip_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); + } + + RAJA_FT_END; + } + + } + + template + static resources::EventProxy + exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn; + + resources::Hip hip_res = res.get(); + + // + // Compute the number of blocks and threads + // + + hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr hip_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && 
gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::hip::make_launch_body( + gridSize, blockSize, shmem, hip_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); + } + + RAJA_FT_END; + } + + return resources::EventProxy(res); + } + +}; + // HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template __launch_bounds__(num_threads, (num_threads * BLOCKS_PER_SM)/32) __global__ From e517a78dc7b696ce2d063f0c2c83609b5e0dfaff Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Wed, 5 Jan 2022 16:38:59 -0800 Subject: [PATCH 16/21] Undo HIP changes, awaiting further guidance from AMD on launch_bounds. --- include/RAJA/policy/hip/WorkGroup/Vtable.hpp | 4 +- .../RAJA/policy/hip/WorkGroup/WorkRunner.hpp | 38 ++++++------ include/RAJA/policy/hip/forall.hpp | 15 ++--- include/RAJA/policy/hip/kernel/HipKernel.hpp | 59 ++++++++----------- include/RAJA/policy/hip/policy.hpp | 36 ++++------- include/RAJA/policy/hip/scan.hpp | 13 ++-- include/RAJA/policy/hip/sort.hpp | 48 +++++++-------- include/RAJA/policy/hip/teams.hpp | 18 +++--- include/RAJA/util/resource.hpp | 10 ---- test/include/RAJA_test-forall-execpol.hpp | 3 +- test/include/RAJA_test-teams-execpol.hpp | 19 +----- test/include/RAJA_test-workgroup.hpp | 3 +- test/old-tests/unit/test-kernel.cpp | 2 +- .../algorithm/tests/test-algorithm-sort.hpp | 3 +- .../tests/test-algorithm-stable-sort.hpp | 3 +- 15 files changed, 104 insertions(+), 170 deletions(-) diff --git a/include/RAJA/policy/hip/WorkGroup/Vtable.hpp b/include/RAJA/policy/hip/WorkGroup/Vtable.hpp index f2369b23c9..e4ce5212bf 100644 --- a/include/RAJA/policy/hip/WorkGroup/Vtable.hpp +++ b/include/RAJA/policy/hip/WorkGroup/Vtable.hpp @@ -97,8 +97,8 @@ inline typename Vtable_T::call_sig get_cached_Vtable_hip_device_call() * Populate and return a Vtable object where the * call operator is a device function */ -template < typename T, typename Vtable_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async > -inline const Vtable_T* get_Vtable(hip_work_explicit const&) +template < typename T, typename Vtable_T, size_t BLOCK_SIZE, bool Async > +inline const Vtable_T* get_Vtable(hip_work const&) { static Vtable_T vtable{ &Vtable_T::template move_construct_destroy, diff --git a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp index 2b0ceda8e8..29571812c7 100644 --- a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp @@ -36,27 +36,27 @@ namespace detail * Runs work in a storage container in order * and returns any per run resources */ -template struct WorkRunner< - RAJA::hip_work_explicit, + RAJA::hip_work, RAJA::ordered, ALLOCATOR_T, INDEX_T, Args...> : WorkRunnerForallOrdered< - RAJA::hip_exec_explicit_async, - RAJA::hip_work_explicit, + RAJA::hip_exec_async, + RAJA::hip_work, RAJA::ordered, ALLOCATOR_T, INDEX_T, Args...> { using base = WorkRunnerForallOrdered< - RAJA::hip_exec_explicit_async, - RAJA::hip_work_explicit, + RAJA::hip_exec_async, + RAJA::hip_work, RAJA::ordered, ALLOCATOR_T, INDEX_T, @@ -91,27 +91,27 @@ struct WorkRunner< * Runs work in a storage container in reverse order * and 
returns any per run resources */ -template struct WorkRunner< - RAJA::hip_work_explicit, + RAJA::hip_work, RAJA::reverse_ordered, ALLOCATOR_T, INDEX_T, Args...> : WorkRunnerForallReverse< - RAJA::hip_exec_explicit_async, - RAJA::hip_work_explicit, + RAJA::hip_exec_async, + RAJA::hip_work, RAJA::reverse_ordered, ALLOCATOR_T, INDEX_T, Args...> { using base = WorkRunnerForallReverse< - RAJA::hip_exec_explicit_async, - RAJA::hip_work_explicit, + RAJA::hip_exec_async, + RAJA::hip_work, RAJA::reverse_ordered, ALLOCATOR_T, INDEX_T, @@ -178,14 +178,12 @@ struct HoldHipDeviceXThreadblockLoop LoopBody m_body; }; -// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template < size_t BLOCK_SIZE, - size_t BLOCKS_PER_SM, typename StorageIter, typename value_type, typename index_type, typename ... Args > -__launch_bounds__(BLOCK_SIZE, (BLOCK_SIZE * BLOCKS_PER_SM)/32) __global__ +__launch_bounds__(BLOCK_SIZE, 1) __global__ void hip_unordered_y_block_global(StorageIter iter, Args... args) { const index_type i_loop = blockIdx.y; @@ -201,24 +199,24 @@ __launch_bounds__(BLOCK_SIZE, (BLOCK_SIZE * BLOCKS_PER_SM)/32) __global__ * the x direction, with the number of threads in the x dimension determined * by the average number of iterates per loop */ -template struct WorkRunner< - RAJA::hip_work_explicit, + RAJA::hip_work, RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average, ALLOCATOR_T, INDEX_T, Args...> { - using exec_policy = RAJA::hip_work_explicit; + using exec_policy = RAJA::hip_work; using order_policy = RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average; using Allocator = ALLOCATOR_T; using index_type = INDEX_T; using resource_type = resources::Hip; - using vtable_type = Vtable, Args...>; + using vtable_type = Vtable, Args...>; WorkRunner() = default; @@ -294,7 +292,7 @@ struct WorkRunner< per_run_storage run_storage{}; - auto func = hip_unordered_y_block_global; + auto func = hip_unordered_y_block_global; // // Compute the requested iteration space size diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index db5226524b..99e1b8a63a 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -127,13 +127,11 @@ __device__ __forceinline__ unsigned int getGlobalNumThreads_3D_3D() * ****************************************************************************** */ -// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template -__launch_bounds__(BlockSize, (BlockSize * BlocksPerSM)/32) __global__ +__launch_bounds__(BlockSize, 1) __global__ void forall_hip_kernel(LOOP_BODY loop_body, const Iterator idx, IndexType length) @@ -157,9 +155,9 @@ __launch_bounds__(BlockSize, (BlockSize * BlocksPerSM)/32) __global__ //////////////////////////////////////////////////////////////////////// // -template +template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip_res, - hip_exec_explicit, + hip_exec, Iterable&& iter, LoopBody&& loop_body) { @@ -167,7 +165,7 @@ RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip using LOOP_BODY = camp::decay; using IndexType = camp::decay; - auto func = impl::forall_hip_kernel; + auto func = impl::forall_hip_kernel; // // Compute the requested iteration space size @@ -237,12 +235,11 @@ RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -251,7 
+248,7 @@ forall_impl(resources::Hip r, iset.segmentCall(r, isi, detail::CallForall(), - hip_exec_explicit(), + hip_exec(), loop_body); } // iterate over segments of index set diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index d5896a5948..6acef19089 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -45,30 +45,28 @@ namespace RAJA /*! * HIP kernel launch policy where the user may specify the number of physical - * thread blocks, threads per block, and blocks per SM. + * thread blocks and threads per block. * If num_blocks is 0 and num_threads is non-zero then num_blocks is chosen at * runtime. * Num_blocks is chosen to maximize the number of blocks running concurrently. - * Blocks per SM must be chosen by the user. + * If num_threads and num_blocks are both 0 then num_threads and num_blocks are + * chosen at runtime. + * Num_threads and num_blocks are determined by the HIP occupancy calculator. + * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at + * runtime. + * Num_threads is 1024, which may not be appropriate for all kernels. */ -template -struct hip_explicit_launch {}; +template +struct hip_launch {}; /*! * HIP kernel launch policy where the user specifies the number of physical * thread blocks and threads per block. * If num_blocks is 0 then num_blocks is chosen at runtime. * Num_blocks is chosen to maximize the number of blocks running concurrently. - * If num_threads and num_blocks are both 0 then num_threads and num_blocks are - * chosen at runtime. - * Num_threads and num_blocks are determined by the HIP occupancy calculator. - * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at - * runtime. - * Num_threads is 1024, which may not be appropriate for all kernels. - * Blocks per SM defaults to 1. */ template -using hip_launch = hip_explicit_launch; +using hip_explicit_launch = hip_launch; /*! @@ -77,7 +75,7 @@ using hip_launch = hip_explicit_launch; * If num_threads is 0 then num_threads is chosen at runtime. */ template -using hip_occ_calc_launch = hip_explicit_launch; +using hip_occ_calc_launch = hip_launch; namespace statement { @@ -89,7 +87,7 @@ namespace statement */ template struct HipKernelExt - : public internal::Statement, EnclosedStmts...> { + : public internal::Statement, EnclosedStmts...> { }; @@ -138,7 +136,7 @@ using HipKernelOccAsync = */ template using HipKernelFixed = - HipKernelExt, + HipKernelExt, EnclosedStmts...>; /*! @@ -158,17 +156,7 @@ using HipKernelFixedSM = */ template using HipKernelFixedAsync = - HipKernelExt, EnclosedStmts...>; - -/*! - * A RAJA::kernel statement that launches a HIP kernel with a fixed - * number of threads (specified by num_threads) and min blocks per sm. - * The kernel launch is asynchronous. - */ -template -using HipKernelFixedSMAsync = - HipKernelExt, - EnclosedStmts...>; + HipKernelExt, EnclosedStmts...>; /*! * A RAJA::kernel statement that launches a HIP kernel with 1024 threads @@ -211,9 +199,8 @@ __global__ void HipKernelLauncher(Data data) * * This launcher is used by the HipKerelFixed policies. 
*/ -// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT -template -__launch_bounds__(BlockSize, (BlockSize * BlocksPerSM)/32) __global__ +template +__launch_bounds__(BlockSize, 1) __global__ void HipKernelLauncherFixed(Data data) { @@ -233,13 +220,13 @@ __launch_bounds__(BlockSize, (BlockSize * BlocksPerSM)/32) __global__ * The default case handles BlockSize != 0 and gets the fixed max block size * version of the kernel. */ -template +template struct HipKernelLauncherGetter { - using type = camp::decay)>; + using type = camp::decay)>; static constexpr type get() noexcept { - return internal::HipKernelLauncherFixed; + return internal::HipKernelLauncherFixed; } }; @@ -248,7 +235,7 @@ struct HipKernelLauncherGetter * block size version of the kernel. */ template -struct HipKernelLauncherGetter<0, 0, Data, executor_t> +struct HipKernelLauncherGetter<0, Data, executor_t> { using type = camp::decay)>; static constexpr type get() noexcept @@ -272,8 +259,8 @@ struct HipLaunchHelper; * The user may specify the number of threads and blocks or let one or both be * determined at runtime using the HIP occupancy calculator. */ -template -struct HipLaunchHelper,StmtList,Data,Types> +template +struct HipLaunchHelper,StmtList,Data,Types> { using Self = HipLaunchHelper; @@ -281,7 +268,7 @@ struct HipLaunchHelper; - using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, (blocks_per_sm <= 0) ? 0 : blocks_per_sm, Data, executor_t>; + using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, Data, executor_t>; inline static void recommended_blocks_threads(int shmem_size, int &recommended_blocks, int &recommended_threads) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index f0886b37bd..18364c20a2 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -70,24 +70,21 @@ namespace policy namespace hip { -template -struct hip_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< +template +struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::forall, detail::get_launch::value, RAJA::Platform::hip> { }; -namespace expt -{ -template -struct hip_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< +template +struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::region, detail::get_launch::value, RAJA::Platform::hip> { }; -} // @@ -97,8 +94,8 @@ struct hip_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_ /// /// WorkGroup execution policies /// -template -struct hip_work_explicit : public RAJA::make_policy_pattern_launch_platform_t< +template +struct hip_work : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::workgroup_exec, detail::get_launch::value, @@ -225,24 +222,15 @@ using hip_atomic = hip_atomic_explicit; } // end namespace hip } // end namespace policy -using policy::hip::hip_exec_explicit; - -template -using hip_exec_explicit_async = policy::hip::hip_exec_explicit; - -template -using hip_exec = policy::hip::hip_exec_explicit; +using policy::hip::hip_exec; template -using hip_exec_async = policy::hip::hip_exec_explicit; - -using policy::hip::hip_work_explicit; +using hip_exec_async = policy::hip::hip_exec; -template -using hip_work = policy::hip::hip_work_explicit; +using policy::hip::hip_work; template -using hip_work_async = policy::hip::hip_work_explicit; +using 
hip_work_async = policy::hip::hip_work; using policy::hip::hip_atomic; using policy::hip::hip_atomic_explicit; @@ -271,9 +259,7 @@ using policy::hip::hip_synchronize; namespace expt { - // num_threads defaults to 1, but not expected to be used in kernel launch - template - using hip_launch_t = policy::hip::expt::hip_launch_explicit_t; + using policy::hip::hip_launch_t; } /*! diff --git a/include/RAJA/policy/hip/scan.hpp b/include/RAJA/policy/hip/scan.hpp index 999837586c..85bc494abb 100644 --- a/include/RAJA/policy/hip/scan.hpp +++ b/include/RAJA/policy/hip/scan.hpp @@ -47,12 +47,12 @@ namespace scan \brief explicit inclusive inplace scan given range, function, and initial value */ -template +template RAJA_INLINE resources::EventProxy inclusive_inplace( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, InputIter begin, InputIter end, Function binary_op) @@ -116,7 +116,6 @@ inclusive_inplace( initial value */ template exclusive_inplace( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, InputIter begin, InputIter end, Function binary_op, @@ -193,7 +192,6 @@ exclusive_inplace( initial value */ template inclusive( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, InputIter begin, InputIter end, OutputIter out, @@ -266,7 +264,6 @@ inclusive( initial value */ template exclusive( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp index 13b1151ca7..9090721ff5 100644 --- a/include/RAJA/policy/hip/sort.hpp +++ b/include/RAJA/policy/hip/sort.hpp @@ -73,7 +73,7 @@ namespace detail /*! \brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -83,7 +83,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, Iter, Iter, Compare) @@ -102,13 +102,13 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, Iter begin, Iter end, operators::less>) @@ -190,13 +190,13 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, Iter begin, Iter end, operators::greater>) @@ -279,7 +279,7 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -289,7 +289,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, Iter, Iter, Compare) @@ -308,13 +308,13 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - hip_exec_explicit p, + hip_exec p, Iter begin, Iter end, operators::less> comp) @@ -325,13 +325,13 @@ unstable( /*! \brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - hip_exec_explicit p, + hip_exec p, Iter begin, Iter end, operators::greater> comp) @@ -343,7 +343,7 @@ unstable( /*! 
\brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, KeyIter, KeyIter, ValIter, @@ -379,7 +379,7 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -387,7 +387,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -483,7 +483,7 @@ stable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -491,7 +491,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -588,7 +588,7 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, KeyIter, KeyIter, ValIter, @@ -624,7 +624,7 @@ unstable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -632,7 +632,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - hip_exec_explicit p, + hip_exec p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -644,7 +644,7 @@ unstable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -652,7 +652,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - hip_exec_explicit p, + hip_exec p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/policy/hip/teams.hpp b/include/RAJA/policy/hip/teams.hpp index d6e8ea28f1..f3dd62d560 100644 --- a/include/RAJA/policy/hip/teams.hpp +++ b/include/RAJA/policy/hip/teams.hpp @@ -32,7 +32,7 @@ namespace expt { template -__global__ static void launch_global_fcn(LaunchContext ctx, BODY body_in) +__global__ void launch_global_fcn(LaunchContext ctx, BODY body_in) { using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -41,8 +41,7 @@ __global__ static void launch_global_fcn(LaunchContext ctx, BODY body_in) } template -struct LaunchExecute> { -// hip_launch_t num_threads set to 1, but not used in launch of kernel +struct LaunchExecute> { template static void exec(LaunchContext const &ctx, BODY_IN &&body_in) @@ -152,9 +151,8 @@ struct LaunchExecute> { }; -// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT -template -__launch_bounds__(num_threads, (num_threads * BLOCKS_PER_SM)/32) __global__ +template +__launch_bounds__(num_threads, 1) __global__ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) { using RAJA::internal::thread_privatize; @@ -163,15 +161,15 @@ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) body(ctx); } -template -struct LaunchExecute> { +template +struct LaunchExecute> { template static void exec(LaunchContext const &ctx, BODY_IN &&body_in) { using BODY = camp::decay; - auto func = launch_global_fcn_fixed; + auto func = launch_global_fcn_fixed; resources::Hip hip_res = resources::Hip::get_default(); @@ -224,7 +222,7 @@ struct LaunchExecute; - auto func = 
launch_global_fcn_fixed; + auto func = launch_global_fcn; resources::Hip hip_res = res.get(); diff --git a/include/RAJA/util/resource.hpp b/include/RAJA/util/resource.hpp index 7fca8332bd..40d8f973ac 100644 --- a/include/RAJA/util/resource.hpp +++ b/include/RAJA/util/resource.hpp @@ -99,16 +99,6 @@ namespace RAJA struct get_resource>>{ using type = camp::resources::Hip; }; - - template - struct get_resource>{ - using type = camp::resources::Hip; - }; - - template - struct get_resource>>{ - using type = camp::resources::Hip; - }; #endif #if defined(RAJA_ENABLE_SYCL) diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index 27d4624049..dfc42dd746 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -134,8 +134,7 @@ using CudaForallAtomicExecPols = CudaForallExecPols; #if defined(RAJA_ENABLE_HIP) using HipForallExecPols = camp::list< RAJA::hip_exec<128>, - RAJA::hip_exec<256>, - RAJA::hip_exec_explicit<256,2> >; + RAJA::hip_exec<256> >; using HipForallReduceExecPols = HipForallExecPols; diff --git a/test/include/RAJA_test-teams-execpol.hpp b/test/include/RAJA_test-teams-execpol.hpp index 0311191790..37f163d09d 100644 --- a/test/include/RAJA_test-teams-execpol.hpp +++ b/test/include/RAJA_test-teams-execpol.hpp @@ -38,14 +38,8 @@ using seq_hip_policies = camp::list< RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy>; -using seq_hip_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, - RAJA::expt::LoopPolicy, - RAJA::expt::LoopPolicy>; - using Sequential_launch_policies = camp::list< - seq_hip_policies, - seq_hip_explicit_policies + seq_hip_policies >; #else using Sequential_launch_policies = camp::list< @@ -85,15 +79,8 @@ using omp_hip_policies = camp::list< RAJA::expt::LoopPolicy >; -using omp_hip_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, - RAJA::expt::LoopPolicy, - RAJA::expt::LoopPolicy - >; - using OpenMP_launch_policies = camp::list< - omp_hip_policies, - omp_hip_explicit_policies + omp_hip_policies >; #else using OpenMP_launch_policies = camp::list< @@ -119,10 +106,8 @@ using Cuda_launch_policies = camp::list< #if defined(RAJA_ENABLE_HIP) using Hip_launch_policies = camp::list< seq_hip_policies - , seq_hip_explicit_policies #if defined(RAJA_ENABLE_OPENMP) , omp_hip_policies - , omp_hip_explicit_policies #endif >; #endif // RAJA_ENABLE_HIP diff --git a/test/include/RAJA_test-workgroup.hpp b/test/include/RAJA_test-workgroup.hpp index e2f44039ee..43be1b98e7 100644 --- a/test/include/RAJA_test-workgroup.hpp +++ b/test/include/RAJA_test-workgroup.hpp @@ -384,8 +384,7 @@ using HipExecPolicyList = #if defined(RAJA_TEST_EXHAUSTIVE) RAJA::hip_work<256>, #endif - RAJA::hip_work<1024>, - RAJA::hip_work_explicit<256, 2> + RAJA::hip_work<1024> >; using HipOrderedPolicyList = SequentialOrderedPolicyList; using HipOrderPolicyList = diff --git a/test/old-tests/unit/test-kernel.cpp b/test/old-tests/unit/test-kernel.cpp index 69d3de8171..6b00050aa4 100644 --- a/test/old-tests/unit/test-kernel.cpp +++ b/test/old-tests/unit/test-kernel.cpp @@ -3181,7 +3181,7 @@ GPU_TEST(Kernel_gpu, HipExec1c) // Loop Fusion using Pol = KernelPolicy< - HipKernelExt, + HipKernelExt, statement::Tile<2, tile_fixed<2>, hip_block_z_loop, For<0, hip_block_x_loop, For<1, hip_block_y_loop, diff --git a/test/unit/algorithm/tests/test-algorithm-sort.hpp b/test/unit/algorithm/tests/test-algorithm-sort.hpp index c0f584b80a..b0eb6d391b 100644 --- a/test/unit/algorithm/tests/test-algorithm-sort.hpp +++ 
b/test/unit/algorithm/tests/test-algorithm-sort.hpp @@ -127,8 +127,7 @@ using CudaSortSorters = using HipSortSorters = camp::list< PolicySort>, - PolicySortPairs>, - PolicySort> + PolicySortPairs> >; #endif diff --git a/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp b/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp index 1001d12970..2ec301faaf 100644 --- a/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp +++ b/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp @@ -127,8 +127,7 @@ using CudaStableSortSorters = using HipStableSortSorters = camp::list< PolicyStableSort>, - PolicyStableSortPairs>, - PolicyStableSort> + PolicyStableSortPairs> >; #endif From cfa84ae19c4fbdd9e30ab91788f142e9a2974f05 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Wed, 5 Jan 2022 16:45:16 -0800 Subject: [PATCH 17/21] More HIP undos, and formatting. --- include/RAJA/policy/hip/kernel/HipKernel.hpp | 12 +----------- include/RAJA/policy/hip/policy.hpp | 8 ++++---- include/RAJA/policy/hip/teams.hpp | 4 ++-- test/include/RAJA_test-forall-execpol.hpp | 2 +- 4 files changed, 8 insertions(+), 18 deletions(-) diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 6acef19089..3a905ca723 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -75,7 +75,7 @@ using hip_explicit_launch = hip_launch; * If num_threads is 0 then num_threads is chosen at runtime. */ template -using hip_occ_calc_launch = hip_launch; +using hip_occ_calc_launch = hip_launch; namespace statement { @@ -139,16 +139,6 @@ using HipKernelFixed = HipKernelExt, EnclosedStmts...>; -/*! - * A RAJA::kernel statement that launches a HIP kernel with a fixed - * number of threads (specified by num_threads) and min blocks per sm. - * The kernel launch is synchronous. - */ -template -using HipKernelFixedSM = - HipKernelExt, - EnclosedStmts...>; - /*! 
 * A RAJA::kernel statement that launches a HIP kernel with a fixed
 * number of threads (specified by num_threads)

diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp
index 18364c20a2..d9b8d62fff 100644
--- a/include/RAJA/policy/hip/policy.hpp
+++ b/include/RAJA/policy/hip/policy.hpp
@@ -80,10 +80,10 @@ struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t<
 
 template
 struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::region,
-                       detail::get_launch::value,
-                       RAJA::Platform::hip> {
+                         RAJA::Policy::hip,
+                         RAJA::Pattern::region,
+                         detail::get_launch::value,
+                         RAJA::Platform::hip> {
 };
 
diff --git a/include/RAJA/policy/hip/teams.hpp b/include/RAJA/policy/hip/teams.hpp
index f3dd62d560..5e5cc39d4a 100644
--- a/include/RAJA/policy/hip/teams.hpp
+++ b/include/RAJA/policy/hip/teams.hpp
@@ -162,7 +162,7 @@ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in)
 }
 
 template
-struct LaunchExecute> {
+struct LaunchExecute> {
 
   template
   static void exec(LaunchContext const &ctx, BODY_IN &&body_in)
@@ -222,7 +222,7 @@ struct LaunchExecute> {
   {
     using BODY = camp::decay;
 
-    auto func = launch_global_fcn;
+    auto func = launch_global_fcn;
 
     resources::Hip hip_res = res.get();
 
diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp
index dfc42dd746..fd2cdc18e8 100644
--- a/test/include/RAJA_test-forall-execpol.hpp
+++ b/test/include/RAJA_test-forall-execpol.hpp
@@ -134,7 +134,7 @@ using CudaForallAtomicExecPols = CudaForallExecPols;
 #if defined(RAJA_ENABLE_HIP)
 using HipForallExecPols = camp::list<
   RAJA::hip_exec<128>,
-  RAJA::hip_exec<256> >;
+  RAJA::hip_exec<256> >;
 
 using HipForallReduceExecPols = HipForallExecPols;
 
From f263497a3044683e8628c9d96874ca5004476f6a Mon Sep 17 00:00:00 2001
From: Robert Chen
Date: Thu, 6 Jan 2022 14:36:02 -0800
Subject: [PATCH 18/21] Fix unrelated typo.

---
 docs/sphinx/user_guide/feature/policies.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst
index d66a392b3b..b24094ea09 100644
--- a/docs/sphinx/user_guide/feature/policies.rst
+++ b/docs/sphinx/user_guide/feature/policies.rst
@@ -684,7 +684,7 @@ used and the OpenMP version of the atomic operation is applied.
 
 Here is an example illustrating use of the ``auto_atomic`` policy::
 
-  RAJA::forall< RAJA::cuda_execBLOCK_SIZE> >(RAJA::RangeSegment seg(0, N),
+  RAJA::forall< RAJA::cuda_exec<BLOCK_SIZE> >(RAJA::RangeSegment seg(0, N),
     [=] RAJA_DEVICE (RAJA::Index_type i) {
 
       RAJA::atomicAdd< RAJA::auto_atomic >(&sum, 1);
 
From 96a1635247a58ffd03ff256c14b9e2f4ffe60a69 Mon Sep 17 00:00:00 2001
From: Robert Chen
Date: Thu, 6 Jan 2022 14:59:54 -0800
Subject: [PATCH 19/21] Documentation for cuda_exec_explicit.

---
 docs/sphinx/user_guide/feature/resource.rst     |  1 +
 docs/sphinx/user_guide/tutorial/add_vectors.rst | 11 +++++++++++
 examples/tut_add-vectors.cpp                    | 12 ++++++++++++
 3 files changed, 24 insertions(+)

diff --git a/docs/sphinx/user_guide/feature/resource.rst b/docs/sphinx/user_guide/feature/resource.rst
index 69842465b6..1a28720553 100644
--- a/docs/sphinx/user_guide/feature/resource.rst
+++ b/docs/sphinx/user_guide/feature/resource.rst
@@ -130,6 +130,7 @@ execution policy support.
 ======== ==============================
 Cuda     | cuda_exec
          | cuda_exec_async
+         | cuda_exec_explicit
 Hip      | hip_exec
          | hip_exec_async
 Omp*     | omp_target_parallel_for_exec

diff --git a/docs/sphinx/user_guide/tutorial/add_vectors.rst b/docs/sphinx/user_guide/tutorial/add_vectors.rst
index c615dbf4bf..73e7ed13d9 100644
--- a/docs/sphinx/user_guide/tutorial/add_vectors.rst
+++ b/docs/sphinx/user_guide/tutorial/add_vectors.rst
@@ -96,6 +96,17 @@ Note that the CUDA execution policy type accepts a template argument
 ``CUDA_BLOCK_SIZE``, which specifies that each CUDA thread block launched
 to execute the kernel will have the given number of threads in the block.
 
+For performance tuning, the ``RAJA::cuda_exec_explicit`` policy is also
+provided. This allows the user to specify the number of blocks allocated
+per streaming multiprocessor (SM) to enable additional block-level
+parallelism. Note that the third boolean argument representing asynchronous
+execution can be omitted, and is ``false`` by default:
+
+.. literalinclude:: ../../../../examples/tut_add-vectors.cpp
+   :start-after: _rajacuda_explicit_vector_add_start
+   :end-before: _rajacuda_explicit_vector_add_end
+   :language: C++
+
 Since the lambda defining the loop body will be passed to a device kernel,
 it must be decorated with the ``__device__`` attribute when it is defined.
 This can be done directly or by using the ``RAJA_DEVICE`` macro.

diff --git a/examples/tut_add-vectors.cpp b/examples/tut_add-vectors.cpp
index cbc993cb03..c64ee4ab32 100644
--- a/examples/tut_add-vectors.cpp
+++ b/examples/tut_add-vectors.cpp
@@ -168,6 +168,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   checkResult(c, N);
 //printResult(c, N);
+
+  std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n";
+
+  // _rajacuda_explicit_vector_add_start
+  RAJA::forall<RAJA::cuda_exec_explicit<CUDA_BLOCK_SIZE, 2>>(RAJA::RangeSegment(0, N),
+  [=] RAJA_DEVICE (int i) {
+    c[i] = a[i] + b[i];
+  });
+  // _rajacuda_explicit_vector_add_end
+
+  checkResult(c, N);
+//printResult(c, N);
 #endif
 
 //----------------------------------------------------------------------------//

From 878b113829cab6d276bcd695364a5ea3681c394e Mon Sep 17 00:00:00 2001
From: Robert Chen
Date: Thu, 6 Jan 2022 15:38:33 -0800
Subject: [PATCH 20/21] Clarify async in example.

---
 examples/tut_add-vectors.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/tut_add-vectors.cpp b/examples/tut_add-vectors.cpp
index c64ee4ab32..b9330fdeee 100644
--- a/examples/tut_add-vectors.cpp
+++ b/examples/tut_add-vectors.cpp
@@ -169,10 +169,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   checkResult(c, N);
 //printResult(c, N);
 
+  bool Asynchronous = false;
   std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n";
 
   // _rajacuda_explicit_vector_add_start
-  RAJA::forall<RAJA::cuda_exec_explicit<CUDA_BLOCK_SIZE, 2>>(RAJA::RangeSegment(0, N),
+  RAJA::forall<RAJA::cuda_exec_explicit<CUDA_BLOCK_SIZE, 2, Asynchronous>>(RAJA::RangeSegment(0, N),
   [=] RAJA_DEVICE (int i) {
     c[i] = a[i] + b[i];
   });

From 0090cadf7f3bf1cbe8dc3698bbc64e94a0b9a3ee Mon Sep 17 00:00:00 2001
From: Robert Chen
Date: Thu, 6 Jan 2022 17:30:03 -0800
Subject: [PATCH 21/21] Satisfy NVCC const.
--- examples/tut_add-vectors.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tut_add-vectors.cpp b/examples/tut_add-vectors.cpp index b9330fdeee..9d77468276 100644 --- a/examples/tut_add-vectors.cpp +++ b/examples/tut_add-vectors.cpp @@ -169,11 +169,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(c, N); //printResult(c, N); - bool Asynchronous = false; + const bool Asynchronous = false; std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; // _rajacuda_explicit_vector_add_start - RAJA::forall>(RAJA::RangeSegment(0, N), + RAJA::forall>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { c[i] = a[i] + b[i]; });
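
Taken together, patches 19-21 leave the tutorial example launching the vector
addition through RAJA::cuda_exec_explicit with two blocks per SM and a const
async flag. For readers who want to try the policy without applying the
series, the following is a minimal, self-contained sketch of that final
state. Only the RAJA::forall call, the policy name, and the
"const bool Asynchronous" flag come from the patches; the file name, the
block size of 256, the problem size, the managed-memory setup, and the result
check are illustrative assumptions, not code from the series.

// add_vectors_explicit.cpp -- illustrative sketch; scaffolding is assumed,
// not taken from the patch series.
#include "RAJA/RAJA.hpp"
#include <cuda_runtime.h>
#include <iostream>

int main()
{
  constexpr int CUDA_BLOCK_SIZE = 256;  // illustrative block size
  constexpr int N = 1 << 20;            // illustrative problem size

  // Managed memory keeps the sketch short; the real tutorial uses its own
  // memory-manager helpers.
  int *a, *b, *c;
  cudaMallocManaged(&a, N * sizeof(int));
  cudaMallocManaged(&b, N * sizeof(int));
  cudaMallocManaged(&c, N * sizeof(int));
  for (int i = 0; i < N; ++i) { a[i] = -i; b[i] = 2 * i; }

  // Patch 21: NVCC requires the flag used as a template argument to be const.
  const bool Asynchronous = false;

  // cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>: 256-thread blocks,
  // 2 blocks per SM; the third argument may be omitted and defaults to false.
  RAJA::forall<RAJA::cuda_exec_explicit<CUDA_BLOCK_SIZE, 2, Asynchronous>>(
      RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) {
        c[i] = a[i] + b[i];
      });

  // The policy is synchronous here (Asynchronous == false), so the result
  // is ready to check on the host as soon as forall returns.
  bool correct = true;
  for (int i = 0; i < N; ++i) { correct = correct && (c[i] == i); }
  std::cout << (correct ? "PASS" : "FAIL") << std::endl;

  cudaFree(a); cudaFree(b); cudaFree(c);
  return correct ? 0 : 1;
}

This assumes a CUDA-enabled RAJA install and compilation with nvcc using
--expt-extended-lambda, which RAJA requires for device lambdas.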