LLNL · rchen20 · Jan 7, 2022 · Nov 19, 2021 · Nov 19, 2021 · Nov 19, 2021
diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst
@@ -684,7 +684,7 @@ used and the OpenMP version of the atomic operation is applied.
 
 Here is an example illustrating use of the ``auto_atomic`` policy::
 
-  RAJA::forall< RAJA::cuda_execBLOCK_SIZE> >(RAJA::RangeSegment seg(0, N),
+  RAJA::forall< RAJA::cuda_exec<BLOCK_SIZE> >(RAJA::RangeSegment seg(0, N),
     [=] RAJA_DEVICE (RAJA::Index_type i) {
 
     RAJA::atomicAdd< RAJA::auto_atomic >(&sum, 1);

diff --git a/docs/sphinx/user_guide/feature/resource.rst b/docs/sphinx/user_guide/feature/resource.rst
@@ -130,6 +130,7 @@ execution policy suport.
  ======== ==============================
  Cuda     | cuda_exec
           | cuda_exec_async
+          | cuda_exec_explicit
  Hip      | hip_exec
           | hip_exec_async
  Omp*     | omp_target_parallel_for_exec

diff --git a/docs/sphinx/user_guide/tutorial/add_vectors.rst b/docs/sphinx/user_guide/tutorial/add_vectors.rst
@@ -96,6 +96,17 @@ Note that the CUDA execution policy type accepts a template argument
 ``CUDA_BLOCK_SIZE``, which specifies that each CUDA thread block launched 
 to execute the kernel will have the given number threads in the block.
 
+For performance tuning, the ``RAJA::cuda_exec_explicit`` policy is also
+provided. This allows the user to specify the number of blocks allocated
+per streaming multiprocessor (SM) to allow additional block level
+parallelism. Note that the third boolean argument representing asynchronous
+execution can be omitted, and is ``false`` by default:
+
+.. literalinclude:: ../../../../examples/tut_add-vectors.cpp
+   :start-after: _rajacuda_explicit_vector_add_start
+   :end-before: _rajacuda_explicit_vector_add_end
+   :language: C++ 
+
 Since the lambda defining the loop body will be passed to a device kernel, 
 it must be decorated with the ``__device__`` attribute when it is defined. 
 This can be done directly or by using the ``RAJA_DEVICE`` macro.

diff --git a/examples/tut_add-vectors.cpp b/examples/tut_add-vectors.cpp
@@ -168,6 +168,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   checkResult(c, N);
 //printResult(c, N);
+
+  std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n";
+
+  // _rajacuda_explicit_vector_add_start
+  RAJA::forall<RAJA::cuda_exec_explicit<CUDA_BLOCK_SIZE/2, 2, false>>(RAJA::RangeSegment(0, N), 
+    [=] RAJA_DEVICE (int i) { 
+    c[i] = a[i] + b[i]; 
+  });    
+  // _rajacuda_explicit_vector_add_end
+
+  checkResult(c, N);
+//printResult(c, N);
 #endif
 
 //----------------------------------------------------------------------------//

diff --git a/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp b/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp
@@ -98,8 +98,8 @@ inline typename Vtable_T::call_sig get_cached_Vtable_cuda_device_call()
 * Populate and return a Vtable object where the
 * call operator is a device function
 */
-template < typename T, typename Vtable_T, size_t BLOCK_SIZE, bool Async >
-inline const Vtable_T* get_Vtable(cuda_work<BLOCK_SIZE, Async> const&)
+template < typename T, typename Vtable_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async >
+inline const Vtable_T* get_Vtable(cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async> const&)
 {
   static Vtable_T vtable{
         &Vtable_T::template move_construct_destroy<T>,

diff --git a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp
@@ -36,27 +36,27 @@ namespace detail
  * Runs work in a storage container in order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
           typename ALLOCATOR_T,
           typename INDEX_T,
           typename ... Args>
 struct WorkRunner<
-        RAJA::cuda_work<BLOCK_SIZE, Async>,
+        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
         RAJA::ordered,
         ALLOCATOR_T,
         INDEX_T,
         Args...>
     : WorkRunnerForallOrdered<
-        RAJA::cuda_exec_async<BLOCK_SIZE>,
-        RAJA::cuda_work<BLOCK_SIZE, Async>,
+        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
         RAJA::ordered,
         ALLOCATOR_T,
         INDEX_T,
         Args...>
 {
   using base = WorkRunnerForallOrdered<
-        RAJA::cuda_exec_async<BLOCK_SIZE>,
-        RAJA::cuda_work<BLOCK_SIZE, Async>,
+        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
         RAJA::ordered,
         ALLOCATOR_T,
         INDEX_T,
@@ -91,27 +91,27 @@ struct WorkRunner<
  * Runs work in a storage container in reverse order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
           typename ALLOCATOR_T,
           typename INDEX_T,
           typename ... Args>
 struct WorkRunner<
-        RAJA::cuda_work<BLOCK_SIZE, Async>,
+        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
         RAJA::reverse_ordered,
         ALLOCATOR_T,
         INDEX_T,
         Args...>
     : WorkRunnerForallReverse<
-        RAJA::cuda_exec_async<BLOCK_SIZE>,
-        RAJA::cuda_work<BLOCK_SIZE, Async>,
+        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
         RAJA::reverse_ordered,
         ALLOCATOR_T,
         INDEX_T,
         Args...>
 {
   using base = WorkRunnerForallReverse<
-        RAJA::cuda_exec_async<BLOCK_SIZE>,
-        RAJA::cuda_work<BLOCK_SIZE, Async>,
+        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
         RAJA::reverse_ordered,
         ALLOCATOR_T,
         INDEX_T,
@@ -177,11 +177,12 @@ struct HoldCudaDeviceXThreadblockLoop
 };
 
 template < size_t BLOCK_SIZE,
+           size_t BLOCKS_PER_SM,
            typename StorageIter,
            typename value_type,
            typename index_type,
            typename ... Args >
-__launch_bounds__(BLOCK_SIZE, 1) __global__
+__launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__
     void cuda_unordered_y_block_global(StorageIter iter, Args... args)
 {
   const index_type i_loop = blockIdx.y;
@@ -197,24 +198,24 @@ __launch_bounds__(BLOCK_SIZE, 1) __global__
  * the x direction, with the number of threads in the x dimension determined
  * by the average number of iterates per loop
  */
-template <size_t BLOCK_SIZE, bool Async,
+template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
           typename ALLOCATOR_T,
           typename INDEX_T,
           typename ... Args>
 struct WorkRunner<
-        RAJA::cuda_work<BLOCK_SIZE, Async>,
+        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
         RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average,
         ALLOCATOR_T,
         INDEX_T,
         Args...>
 {
-  using exec_policy = RAJA::cuda_work<BLOCK_SIZE, Async>;
+  using exec_policy = RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
   using order_policy = RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average;
   using Allocator = ALLOCATOR_T;
   using index_type = INDEX_T;
   using resource_type = resources::Cuda;
 
-  using vtable_type = Vtable<RAJA::cuda_work<BLOCK_SIZE, true>, Args...>;
+  using vtable_type = Vtable<RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>, Args...>;
 
   WorkRunner() = default;
 
@@ -290,7 +291,7 @@ struct WorkRunner<
 
     per_run_storage run_storage{};
 
-    auto func = cuda_unordered_y_block_global<BLOCK_SIZE, Iterator, value_type, index_type, Args...>;
+    auto func = cuda_unordered_y_block_global<BLOCK_SIZE, BLOCKS_PER_SM, Iterator, value_type, index_type, Args...>;
 
     //
     // Compute the requested iteration space size

diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp
@@ -130,10 +130,11 @@ __device__ __forceinline__ unsigned int getGlobalNumThreads_3D_3D()
  ******************************************************************************
  */
 template <size_t BlockSize,
+          size_t BlocksPerSM,
           typename Iterator,
           typename LOOP_BODY,
           typename IndexType>
-__launch_bounds__(BlockSize, 1) __global__
+__launch_bounds__(BlockSize, BlocksPerSM) __global__
     void forall_cuda_kernel(LOOP_BODY loop_body,
                             const Iterator idx,
                             IndexType length)
@@ -157,17 +158,17 @@ __launch_bounds__(BlockSize, 1) __global__
 ////////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async>
+template <typename Iterable, typename LoopBody, size_t BlockSize, size_t BlocksPerSM, bool Async>
 RAJA_INLINE resources::EventProxy<resources::Cuda> forall_impl(resources::Cuda cuda_res,
-                                                    cuda_exec<BlockSize, Async>,
+                                                    cuda_exec_explicit<BlockSize, BlocksPerSM, Async>,
                                                     Iterable&& iter,
                                                     LoopBody&& loop_body)
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
   using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
 
-  auto func = impl::forall_cuda_kernel<BlockSize, Iterator, LOOP_BODY, IndexType>;
+  auto func = impl::forall_cuda_kernel<BlockSize, BlocksPerSM, Iterator, LOOP_BODY, IndexType>;
 
   //
   // Compute the requested iteration space size
@@ -238,11 +239,12 @@ RAJA_INLINE resources::EventProxy<resources::Cuda> forall_impl(resources::Cuda c
  */
 template <typename LoopBody,
           size_t BlockSize,
+          size_t BlocksPerSM,
           bool Async,
           typename... SegmentTypes>
 RAJA_INLINE resources::EventProxy<resources::Cuda>
 forall_impl(resources::Cuda r,
-            ExecPolicy<seq_segit, cuda_exec<BlockSize, Async>>,
+            ExecPolicy<seq_segit, cuda_exec_explicit<BlockSize, BlocksPerSM, Async>>,
             const TypedIndexSet<SegmentTypes...>& iset,
             LoopBody&& loop_body)
 {
@@ -251,7 +253,7 @@ forall_impl(resources::Cuda r,
     iset.segmentCall(r,
                      isi,
                      detail::CallForall(),
-                     cuda_exec<BlockSize, true>(),
+                     cuda_exec_explicit<BlockSize, BlocksPerSM, true>(),
                      loop_body);
   }  // iterate over segments of index set
 

diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
@@ -45,28 +45,30 @@ namespace RAJA
 
 /*!
  * CUDA kernel launch policy where the user may specify the number of physical
- * thread blocks and threads per block.
+ * thread blocks, threads per block, and blocks per SM.
  * If num_blocks is 0 and num_threads is non-zero then num_blocks is chosen at
  * runtime.
  * Num_blocks is chosen to maximize the number of blocks running concurrently.
- * If num_threads and num_blocks are both 0 then num_threads and num_blocks are
- * chosen at runtime.
- * Num_threads and num_blocks are determined by the CUDA occupancy calculator.
- * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at
- * runtime.
- * Num_threads is 1024, which may not be appropriate for all kernels.
+ * Blocks per SM must be chosen by the user.
  */
 template <bool async0, size_t num_blocks, size_t num_threads, size_t blocks_per_sm>
-struct cuda_launch {};
+struct cuda_explicit_launch {};
 
 /*!
  * CUDA kernel launch policy where the user specifies the number of physical
  * thread blocks and threads per block.
  * If num_blocks is 0 then num_blocks is chosen at runtime.
  * Num_blocks is chosen to maximize the number of blocks running concurrently.
+ * If num_threads and num_blocks are both 0 then num_threads and num_blocks are
+ * chosen at runtime.
+ * Num_threads and num_blocks are determined by the CUDA occupancy calculator.
+ * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at
+ * runtime.
+ * Num_threads is 1024, which may not be appropriate for all kernels.
+ * Blocks per SM defaults to 1.
  */
-template <bool async0, size_t num_blocks, size_t num_threads, size_t blocks_per_sm>
-using cuda_explicit_launch = cuda_launch<async0, num_blocks, num_threads, blocks_per_sm>;
+template <bool async0, size_t num_blocks, size_t num_threads>
+using cuda_launch = cuda_explicit_launch<async0, num_blocks, num_threads, policy::cuda::MIN_BLOCKS_PER_SM>;
 
 
 /*!
@@ -75,19 +77,19 @@ using cuda_explicit_launch = cuda_launch<async0, num_blocks, num_threads, blocks
  * If num_threads is 0 then num_threads is chosen at runtime.
  */
 template <size_t num_threads0, bool async0>
-using cuda_occ_calc_launch = cuda_launch<async0, 0, num_threads0, 0>;
+using cuda_occ_calc_launch = cuda_explicit_launch<async0, 0, num_threads0, policy::cuda::MIN_BLOCKS_PER_SM>;
 
 namespace statement
 {
 
 /*!
  * A RAJA::kernel statement that launches a CUDA kernel.
- *
- *
+ * Note - Statement requires a placeholder cuda_exec policy for the sake of 
+ * object oriented inheritance.
  */
 template <typename LaunchConfig, typename... EnclosedStmts>
 struct CudaKernelExt
-    : public internal::Statement<cuda_exec<0>, EnclosedStmts...> {
+    : public internal::Statement<cuda_exec_explicit<0, 0>, EnclosedStmts...> {
 };
 
 
@@ -97,19 +99,19 @@ struct CudaKernelExt
  * calculator determine the unspecified values.
  * The kernel launch is synchronous.
  */
-template <size_t num_blocks, size_t num_threads, size_t blocks_per_sm, typename... EnclosedStmts>
+template <size_t num_blocks, size_t num_threads, typename... EnclosedStmts>
 using CudaKernelExp =
-    CudaKernelExt<cuda_launch<false, num_blocks, num_threads, blocks_per_sm>, EnclosedStmts...>;
+    CudaKernelExt<cuda_launch<false, num_blocks, num_threads>, EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a CUDA kernel with the flexibility
  * to fix the number of threads and/or blocks and let the CUDA occupancy
  * calculator determine the unspecified values.
  * The kernel launch is asynchronous.
  */
-template <size_t num_blocks, size_t num_threads, size_t blocks_per_sm, typename... EnclosedStmts>
+template <size_t num_blocks, size_t num_threads, typename... EnclosedStmts>
 using CudaKernelExpAsync =
-    CudaKernelExt<cuda_launch<true, num_blocks, num_threads, blocks_per_sm>, EnclosedStmts...>;
+    CudaKernelExt<cuda_launch<true, num_blocks, num_threads>, EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a CUDA kernel using the
@@ -136,7 +138,7 @@ using CudaKernelOccAsync =
  */
 template <size_t num_threads, typename... EnclosedStmts>
 using CudaKernelFixed =
-    CudaKernelExt<cuda_explicit_launch<false, operators::limits<size_t>::max(), num_threads, 1>,
+    CudaKernelExt<cuda_launch<false, operators::limits<size_t>::max(), num_threads>,
                   EnclosedStmts...>;
 
 /*!
@@ -156,7 +158,7 @@ using CudaKernelFixedSM =
  */
 template <size_t num_threads, typename... EnclosedStmts>
 using CudaKernelFixedAsync =
-    CudaKernelExt<cuda_explicit_launch<true, operators::limits<size_t>::max(), num_threads, 1>,
+    CudaKernelExt<cuda_launch<true, operators::limits<size_t>::max(), num_threads>,
                   EnclosedStmts...>;
 
 /*!
@@ -271,7 +273,7 @@ struct CudaLaunchHelper;
  * determined at runtime using the CUDA occupancy calculator.
  */
 template<bool async0, size_t num_blocks, size_t num_threads, size_t blocks_per_sm, typename StmtList, typename Data, typename Types>
-struct CudaLaunchHelper<cuda_launch<async0, num_blocks, num_threads, blocks_per_sm>,StmtList,Data,Types>
+struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, blocks_per_sm>,StmtList,Data,Types>
 {
   using Self = CudaLaunchHelper;