From b225a2cefd44b919058ac0fde12393fb4d2bc07e Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Thu, 18 Nov 2021 19:23:49 -0800 Subject: [PATCH 01/21] CUDA and HIP forall, with tests. --- include/RAJA/policy/cuda/forall.hpp | 14 +++--- include/RAJA/policy/cuda/policy.hpp | 16 +++++-- include/RAJA/policy/cuda/scan.hpp | 13 +++-- include/RAJA/policy/cuda/sort.hpp | 48 +++++++++---------- include/RAJA/policy/hip/forall.hpp | 14 +++--- include/RAJA/policy/hip/policy.hpp | 16 +++++-- include/RAJA/policy/hip/scan.hpp | 13 +++-- include/RAJA/policy/hip/sort.hpp | 48 +++++++++---------- include/RAJA/util/resource.hpp | 20 ++++++++ test/include/RAJA_test-forall-execpol.hpp | 6 ++- .../algorithm/tests/test-algorithm-sort.hpp | 6 ++- .../tests/test-algorithm-stable-sort.hpp | 6 ++- 12 files changed, 134 insertions(+), 86 deletions(-) diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index 626b506796..aedfdf2e27 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -130,10 +130,11 @@ __device__ __forceinline__ unsigned int getGlobalNumThreads_3D_3D() ****************************************************************************** */ template -__launch_bounds__(BlockSize, 1) __global__ +__launch_bounds__(BlockSize, BlocksPerSM) __global__ void forall_cuda_kernel(LOOP_BODY loop_body, const Iterator idx, IndexType length) @@ -157,9 +158,9 @@ __launch_bounds__(BlockSize, 1) __global__ //////////////////////////////////////////////////////////////////////// // -template +template RAJA_INLINE resources::EventProxy forall_impl(resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, Iterable&& iter, LoopBody&& loop_body) { @@ -167,7 +168,7 @@ RAJA_INLINE resources::EventProxy forall_impl(resources::Cuda c using LOOP_BODY = camp::decay; using IndexType = camp::decay; - auto func = impl::forall_cuda_kernel; + auto func = impl::forall_cuda_kernel; // // Compute the requested iteration space size @@ -238,11 +239,12 @@ RAJA_INLINE resources::EventProxy forall_impl(resources::Cuda c */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Cuda r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -251,7 +253,7 @@ forall_impl(resources::Cuda r, iset.segmentCall(r, isi, detail::CallForall(), - cuda_exec(), + cuda_exec_explicit(), loop_body); } // iterate over segments of index set diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 855d59deb5..9dedf5cc89 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -74,8 +74,8 @@ namespace policy namespace cuda { -template -struct cuda_exec : public RAJA::make_policy_pattern_launch_platform_t< +template +struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::forall, detail::get_launch::value, @@ -220,10 +220,16 @@ struct cuda_synchronize : make_policy_pattern_launch_t -using cuda_exec_async = policy::cuda::cuda_exec; +template +using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit; + +template +using cuda_exec = policy::cuda::cuda_exec_explicit; + +template +using cuda_exec_async = policy::cuda::cuda_exec_explicit; using policy::cuda::cuda_work; diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp index 978a1cb0a1..d7ed28c39c 100644 --- a/include/RAJA/policy/cuda/scan.hpp +++ b/include/RAJA/policy/cuda/scan.hpp @@ -42,12 +42,12 @@ namespace scan \brief explicit inclusive 
inplace scan given range, function, and initial value */ -template +template RAJA_INLINE resources::EventProxy inclusive_inplace( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, InputIter begin, InputIter end, Function binary_op) @@ -90,6 +90,7 @@ inclusive_inplace( initial value */ template exclusive_inplace( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, InputIter begin, InputIter end, Function binary_op, @@ -144,6 +145,7 @@ exclusive_inplace( initial value */ template inclusive( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, InputIter begin, InputIter end, OutputIter out, @@ -196,6 +198,7 @@ inclusive( initial value */ template exclusive( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/cuda/sort.hpp b/include/RAJA/policy/cuda/sort.hpp index e91831bbc8..53c8661ec1 100644 --- a/include/RAJA/policy/cuda/sort.hpp +++ b/include/RAJA/policy/cuda/sort.hpp @@ -44,7 +44,7 @@ namespace sort /*! \brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -54,7 +54,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, Iter, Iter, Compare) @@ -75,13 +75,13 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, Iter begin, Iter end, operators::less>) @@ -143,13 +143,13 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, Iter begin, Iter end, operators::greater>) @@ -212,7 +212,7 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -222,7 +222,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, Iter, Iter, Compare) @@ -243,13 +243,13 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Cuda cuda_res, - cuda_exec p, + cuda_exec_explicit p, Iter begin, Iter end, operators::less> comp) @@ -260,13 +260,13 @@ unstable( /*! \brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Cuda cuda_res, - cuda_exec p, + cuda_exec_explicit p, Iter begin, Iter end, operators::greater> comp) @@ -278,7 +278,7 @@ unstable( /*! \brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, KeyIter, KeyIter, ValIter, @@ -314,7 +314,7 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -322,7 +322,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -396,7 +396,7 @@ stable_pairs( /*! 
\brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -404,7 +404,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -479,7 +479,7 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Cuda cuda_res, - cuda_exec, + cuda_exec_explicit, KeyIter, KeyIter, ValIter, @@ -515,7 +515,7 @@ unstable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -523,7 +523,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Cuda cuda_res, - cuda_exec p, + cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -535,7 +535,7 @@ unstable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -543,7 +543,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Cuda cuda_res, - cuda_exec p, + cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index 99e1b8a63a..6e643459f5 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -128,10 +128,11 @@ __device__ __forceinline__ unsigned int getGlobalNumThreads_3D_3D() ****************************************************************************** */ template -__launch_bounds__(BlockSize, 1) __global__ +__launch_bounds__(BlockSize, BlocksPerSM) __global__ void forall_hip_kernel(LOOP_BODY loop_body, const Iterator idx, IndexType length) @@ -155,9 +156,9 @@ __launch_bounds__(BlockSize, 1) __global__ //////////////////////////////////////////////////////////////////////// // -template +template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip_res, - hip_exec, + hip_exec_explicit, Iterable&& iter, LoopBody&& loop_body) { @@ -165,7 +166,7 @@ RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip using LOOP_BODY = camp::decay; using IndexType = camp::decay; - auto func = impl::forall_hip_kernel; + auto func = impl::forall_hip_kernel; // // Compute the requested iteration space size @@ -235,11 +236,12 @@ RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -248,7 +250,7 @@ forall_impl(resources::Hip r, iset.segmentCall(r, isi, detail::CallForall(), - hip_exec(), + hip_exec_explicit(), loop_body); } // iterate over segments of index set diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index d9b8d62fff..8d0db60ca1 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -70,8 +70,8 @@ namespace policy namespace hip { -template -struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< +template +struct hip_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::forall, detail::get_launch::value, @@ -222,10 +222,16 @@ using hip_atomic = hip_atomic_explicit; } // end namespace hip } // end namespace policy -using policy::hip::hip_exec; +using 
policy::hip::hip_exec_explicit; -template -using hip_exec_async = policy::hip::hip_exec; +template +using hip_exec_explicit_async = policy::hip::hip_exec_explicit; + +template +using hip_exec = policy::hip::hip_exec_explicit; + +template +using hip_exec_async = policy::hip::hip_exec_explicit; using policy::hip::hip_work; diff --git a/include/RAJA/policy/hip/scan.hpp b/include/RAJA/policy/hip/scan.hpp index 85bc494abb..999837586c 100644 --- a/include/RAJA/policy/hip/scan.hpp +++ b/include/RAJA/policy/hip/scan.hpp @@ -47,12 +47,12 @@ namespace scan \brief explicit inclusive inplace scan given range, function, and initial value */ -template +template RAJA_INLINE resources::EventProxy inclusive_inplace( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, InputIter begin, InputIter end, Function binary_op) @@ -116,6 +116,7 @@ inclusive_inplace( initial value */ template exclusive_inplace( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, InputIter begin, InputIter end, Function binary_op, @@ -192,6 +193,7 @@ exclusive_inplace( initial value */ template inclusive( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, InputIter begin, InputIter end, OutputIter out, @@ -264,6 +266,7 @@ inclusive( initial value */ template exclusive( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp index 9090721ff5..05537342cb 100644 --- a/include/RAJA/policy/hip/sort.hpp +++ b/include/RAJA/policy/hip/sort.hpp @@ -73,7 +73,7 @@ namespace detail /*! \brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -83,7 +83,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, Iter, Iter, Compare) @@ -102,13 +102,13 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, Iter begin, Iter end, operators::less>) @@ -190,13 +190,13 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, Iter begin, Iter end, operators::greater>) @@ -279,7 +279,7 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -289,7 +289,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, Iter, Iter, Compare) @@ -308,13 +308,13 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - hip_exec p, + hip_exec_explicit p, Iter begin, Iter end, operators::less> comp) @@ -325,13 +325,13 @@ unstable( /*! \brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - hip_exec p, + hip_exec_explicit p, Iter begin, Iter end, operators::greater> comp) @@ -343,7 +343,7 @@ unstable( /*! 
\brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, KeyIter, KeyIter, ValIter, @@ -379,7 +379,7 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -387,7 +387,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -483,7 +483,7 @@ stable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -491,7 +491,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -588,7 +588,7 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Hip hip_res, - hip_exec, + hip_exec_explicit, KeyIter, KeyIter, ValIter, @@ -624,7 +624,7 @@ unstable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -632,7 +632,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - hip_exec p, + hip_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -644,7 +644,7 @@ unstable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -652,7 +652,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - hip_exec p, + hip_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/util/resource.hpp b/include/RAJA/util/resource.hpp index dc50c2ea83..7fca8332bd 100644 --- a/include/RAJA/util/resource.hpp +++ b/include/RAJA/util/resource.hpp @@ -72,6 +72,16 @@ namespace RAJA struct get_resource>>{ using type = camp::resources::Cuda; }; + + template + struct get_resource>{ + using type = camp::resources::Cuda; + }; + + template + struct get_resource>>{ + using type = camp::resources::Cuda; + }; #endif #if defined(RAJA_HIP_ACTIVE) @@ -89,6 +99,16 @@ namespace RAJA struct get_resource>>{ using type = camp::resources::Hip; }; + + template + struct get_resource>{ + using type = camp::resources::Hip; + }; + + template + struct get_resource>>{ + using type = camp::resources::Hip; + }; #endif #if defined(RAJA_ENABLE_SYCL) diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index ce2039e248..27d4624049 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -123,7 +123,8 @@ using OpenMPTargetForallAtomicExecPols = OpenMPTargetForallExecPols; #if defined(RAJA_ENABLE_CUDA) using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>, - RAJA::cuda_exec<256> >; + RAJA::cuda_exec<256>, + RAJA::cuda_exec_explicit<256,2> >; using CudaForallReduceExecPols = CudaForallExecPols; @@ -133,7 +134,8 @@ using CudaForallAtomicExecPols = CudaForallExecPols; #if defined(RAJA_ENABLE_HIP) using HipForallExecPols = camp::list< RAJA::hip_exec<128>, - RAJA::hip_exec<256> >; + RAJA::hip_exec<256>, + RAJA::hip_exec_explicit<256,2> >; using HipForallReduceExecPols = HipForallExecPols; 
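A minimal sketch of how the new forall policy from this patch can be exercised in user code. Illustrative only: N, a, x, and y are hypothetical device-visible data, and 256 threads with 2 blocks per SM simply mirrors the test configuration above; the plain cuda_exec alias now forwards to cuda_exec_explicit with a default blocks-per-SM, so existing code is unchanged.

    // Hypothetical daxpy using the explicit forall policy added in this patch.
    // (Assumes #include "RAJA/RAJA.hpp" and a CUDA build.)
    // The second template argument is forwarded to __launch_bounds__(256, 2)
    // on forall_cuda_kernel; it only adjusts the occupancy hint, not results.
    RAJA::forall<RAJA::cuda_exec_explicit<256, 2>>(
        RAJA::RangeSegment(0, N),
        [=] RAJA_DEVICE (int i) {
          y[i] += a * x[i];
        });

The hip_exec_explicit counterpart is used the same way, and the _async aliases skip the trailing synchronize.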
diff --git a/test/unit/algorithm/tests/test-algorithm-sort.hpp b/test/unit/algorithm/tests/test-algorithm-sort.hpp index 046ed493ef..c0f584b80a 100644 --- a/test/unit/algorithm/tests/test-algorithm-sort.hpp +++ b/test/unit/algorithm/tests/test-algorithm-sort.hpp @@ -116,7 +116,8 @@ using TBBSortSorters = using CudaSortSorters = camp::list< PolicySort>, - PolicySortPairs> + PolicySortPairs>, + PolicySort> >; #endif @@ -126,7 +127,8 @@ using CudaSortSorters = using HipSortSorters = camp::list< PolicySort>, - PolicySortPairs> + PolicySortPairs>, + PolicySort> >; #endif diff --git a/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp b/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp index 00f65fae86..1001d12970 100644 --- a/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp +++ b/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp @@ -116,7 +116,8 @@ using TBBStableSortSorters = using CudaStableSortSorters = camp::list< PolicyStableSort>, - PolicyStableSortPairs> + PolicyStableSortPairs>, + PolicyStableSort> >; #endif @@ -126,7 +127,8 @@ using CudaStableSortSorters = using HipStableSortSorters = camp::list< PolicyStableSort>, - PolicyStableSortPairs> + PolicyStableSortPairs>, + PolicyStableSort> >; #endif From f0722347434550e30b76050abf7e5a75e6f2431f Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Fri, 19 Nov 2021 09:42:09 -0800 Subject: [PATCH 02/21] Missing comma. --- include/RAJA/policy/hip/sort.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp index 05537342cb..13b1151ca7 100644 --- a/include/RAJA/policy/hip/sort.hpp +++ b/include/RAJA/policy/hip/sort.hpp @@ -83,7 +83,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Hip hip_res, - hip_exec_explicit, + hip_exec_explicit, Iter, Iter, Compare) From a8f424e13dc6b0df4b1c9664bf5fbead42ecbad8 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Fri, 19 Nov 2021 16:11:00 -0800 Subject: [PATCH 03/21] Fix HIP, cleanup kernel. --- .../RAJA/policy/cuda/kernel/CudaKernel.hpp | 40 +++++------ include/RAJA/policy/hip/forall.hpp | 2 +- include/RAJA/policy/hip/kernel/HipKernel.hpp | 68 ++++++++++++------- 3 files changed, 67 insertions(+), 43 deletions(-) diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index 276b2e9553..2655ee630b 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -45,28 +45,30 @@ namespace RAJA /*! * CUDA kernel launch policy where the user may specify the number of physical - * thread blocks and threads per block. + * thread blocks, threads per block, and blocks per SM. * If num_blocks is 0 and num_threads is non-zero then num_blocks is chosen at * runtime. * Num_blocks is chosen to maximize the number of blocks running concurrently. - * If num_threads and num_blocks are both 0 then num_threads and num_blocks are - * chosen at runtime. - * Num_threads and num_blocks are determined by the CUDA occupancy calculator. - * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at - * runtime. - * Num_threads is 1024, which may not be appropriate for all kernels. + * Blocks per SM must be chosen by the user. */ template -struct cuda_launch {}; +struct cuda_explicit_launch {}; /*! * CUDA kernel launch policy where the user specifies the number of physical * thread blocks and threads per block. * If num_blocks is 0 then num_blocks is chosen at runtime. 
* Num_blocks is chosen to maximize the number of blocks running concurrently. + * If num_threads and num_blocks are both 0 then num_threads and num_blocks are + * chosen at runtime. + * Num_threads and num_blocks are determined by the CUDA occupancy calculator. + * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at + * runtime. + * Num_threads is 1024, which may not be appropriate for all kernels. + * Blocks per SM defaults to 1. */ -template -using cuda_explicit_launch = cuda_launch; +template +using cuda_launch = cuda_explicit_launch; /*! @@ -75,7 +77,7 @@ using cuda_explicit_launch = cuda_launch -using cuda_occ_calc_launch = cuda_launch; +using cuda_occ_calc_launch = cuda_explicit_launch; namespace statement { @@ -87,7 +89,7 @@ namespace statement */ template struct CudaKernelExt - : public internal::Statement, EnclosedStmts...> { + : public internal::Statement, EnclosedStmts...> { }; @@ -97,9 +99,9 @@ struct CudaKernelExt * calculator determine the unspecified values. * The kernel launch is synchronous. */ -template +template using CudaKernelExp = - CudaKernelExt, EnclosedStmts...>; + CudaKernelExt, EnclosedStmts...>; /*! * A RAJA::kernel statement that launches a CUDA kernel with the flexibility @@ -107,9 +109,9 @@ using CudaKernelExp = * calculator determine the unspecified values. * The kernel launch is asynchronous. */ -template +template using CudaKernelExpAsync = - CudaKernelExt, EnclosedStmts...>; + CudaKernelExt, EnclosedStmts...>; /*! * A RAJA::kernel statement that launches a CUDA kernel using the @@ -136,7 +138,7 @@ using CudaKernelOccAsync = */ template using CudaKernelFixed = - CudaKernelExt::max(), num_threads, 1>, + CudaKernelExt::max(), num_threads>, EnclosedStmts...>; /*! @@ -156,7 +158,7 @@ using CudaKernelFixedSM = */ template using CudaKernelFixedAsync = - CudaKernelExt::max(), num_threads, 1>, + CudaKernelExt::max(), num_threads>, EnclosedStmts...>; /*! @@ -271,7 +273,7 @@ struct CudaLaunchHelper; * determined at runtime using the CUDA occupancy calculator. */ template -struct CudaLaunchHelper,StmtList,Data,Types> +struct CudaLaunchHelper,StmtList,Data,Types> { using Self = CudaLaunchHelper; diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index 6e643459f5..0ec9c5c1e9 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -158,7 +158,7 @@ __launch_bounds__(BlockSize, BlocksPerSM) __global__ template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip_res, - hip_exec_explicit, + hip_exec_explicit, Iterable&& iter, LoopBody&& loop_body) { diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 3a905ca723..70b1afe88f 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -45,28 +45,30 @@ namespace RAJA /*! * HIP kernel launch policy where the user may specify the number of physical - * thread blocks and threads per block. + * thread blocks, threads per block, and blocks per SM. * If num_blocks is 0 and num_threads is non-zero then num_blocks is chosen at * runtime. * Num_blocks is chosen to maximize the number of blocks running concurrently. - * If num_threads and num_blocks are both 0 then num_threads and num_blocks are - * chosen at runtime. - * Num_threads and num_blocks are determined by the HIP occupancy calculator. - * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at - * runtime. 
- * Num_threads is 1024, which may not be appropriate for all kernels. + * Blocks per SM must be chosen by the user. */ -template -struct hip_launch {}; +template +struct hip_explicit_launch {}; /*! * HIP kernel launch policy where the user specifies the number of physical * thread blocks and threads per block. * If num_blocks is 0 then num_blocks is chosen at runtime. * Num_blocks is chosen to maximize the number of blocks running concurrently. + * If num_threads and num_blocks are both 0 then num_threads and num_blocks are + * chosen at runtime. + * Num_threads and num_blocks are determined by the HIP occupancy calculator. + * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at + * runtime. + * Num_threads is 1024, which may not be appropriate for all kernels. + * Blocks per SM defaults to 1. */ template -using hip_explicit_launch = hip_launch; +using hip_launch = hip_explicit_launch; /*! @@ -75,7 +77,7 @@ using hip_explicit_launch = hip_launch; * If num_threads is 0 then num_threads is chosen at runtime. */ template -using hip_occ_calc_launch = hip_launch; +using hip_occ_calc_launch = hip_explicit_launch; namespace statement { @@ -87,7 +89,7 @@ namespace statement */ template struct HipKernelExt - : public internal::Statement, EnclosedStmts...> { + : public internal::Statement, EnclosedStmts...> { }; @@ -136,7 +138,17 @@ using HipKernelOccAsync = */ template using HipKernelFixed = - HipKernelExt, + HipKernelExt, + EnclosedStmts...>; + +/*! + * A RAJA::kernel statement that launches a HIP kernel with a fixed + * number of threads (specified by num_threads) and min blocks per sm. + * The kernel launch is synchronous. + */ +template +using HipKernelFixedSM = + HipKernelExt, EnclosedStmts...>; /*! @@ -146,7 +158,17 @@ using HipKernelFixed = */ template using HipKernelFixedAsync = - HipKernelExt, EnclosedStmts...>; + HipKernelExt, EnclosedStmts...>; + +/*! + * A RAJA::kernel statement that launches a HIP kernel with a fixed + * number of threads (specified by num_threads) and min blocks per sm. + * The kernel launch is asynchronous. + */ +template +using HipKernelFixedSMAsync = + HipKernelExt, + EnclosedStmts...>; /*! * A RAJA::kernel statement that launches a HIP kernel with 1024 threads @@ -189,8 +211,8 @@ __global__ void HipKernelLauncher(Data data) * * This launcher is used by the HipKerelFixed policies. */ -template -__launch_bounds__(BlockSize, 1) __global__ +template +__launch_bounds__(BlockSize, BlocksPerSM) __global__ void HipKernelLauncherFixed(Data data) { @@ -210,13 +232,13 @@ __launch_bounds__(BlockSize, 1) __global__ * The default case handles BlockSize != 0 and gets the fixed max block size * version of the kernel. */ -template +template struct HipKernelLauncherGetter { - using type = camp::decay)>; + using type = camp::decay)>; static constexpr type get() noexcept { - return internal::HipKernelLauncherFixed; + return internal::HipKernelLauncherFixed; } }; @@ -225,7 +247,7 @@ struct HipKernelLauncherGetter * block size version of the kernel. */ template -struct HipKernelLauncherGetter<0, Data, executor_t> +struct HipKernelLauncherGetter<0, 0, Data, executor_t> { using type = camp::decay)>; static constexpr type get() noexcept @@ -249,8 +271,8 @@ struct HipLaunchHelper; * The user may specify the number of threads and blocks or let one or both be * determined at runtime using the HIP occupancy calculator. 
*/ -template -struct HipLaunchHelper,StmtList,Data,Types> +template +struct HipLaunchHelper,StmtList,Data,Types> { using Self = HipLaunchHelper; @@ -258,7 +280,7 @@ struct HipLaunchHelper,StmtList,Data using executor_t = internal::hip_statement_list_executor_t; - using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, Data, executor_t>; + using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, (blocks_per_sm <= 0) ? 0 : blocks_per_sm, Data, executor_t>; inline static void recommended_blocks_threads(int shmem_size, int &recommended_blocks, int &recommended_threads) From 04c18b60b8c5f579316b0fab577fd628173843fa Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Fri, 19 Nov 2021 16:16:28 -0800 Subject: [PATCH 04/21] Update Hip kernel test. --- test/old-tests/unit/test-kernel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/old-tests/unit/test-kernel.cpp b/test/old-tests/unit/test-kernel.cpp index 6b00050aa4..69d3de8171 100644 --- a/test/old-tests/unit/test-kernel.cpp +++ b/test/old-tests/unit/test-kernel.cpp @@ -3181,7 +3181,7 @@ GPU_TEST(Kernel_gpu, HipExec1c) // Loop Fusion using Pol = KernelPolicy< - HipKernelExt, + HipKernelExt, statement::Tile<2, tile_fixed<2>, hip_block_z_loop, For<0, hip_block_x_loop, For<1, hip_block_y_loop, From 5e00d9b1277731595caefce7d8bf9a9c8c6cc9d7 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 22 Nov 2021 16:42:49 -0800 Subject: [PATCH 05/21] Explicit Teams policies. --- include/RAJA/policy/cuda/policy.hpp | 15 +-- include/RAJA/policy/cuda/teams.hpp | 121 ++++++++++++++++++++++- include/RAJA/policy/hip/policy.hpp | 15 +-- include/RAJA/policy/hip/teams.hpp | 118 +++++++++++++++++++++- test/include/RAJA_test-teams-execpol.hpp | 38 ++++++- 5 files changed, 283 insertions(+), 24 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 9dedf5cc89..94817479c9 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -82,12 +82,12 @@ struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Platform::cuda> { }; -template -struct cuda_launch_t : public RAJA::make_policy_pattern_launch_platform_t< - RAJA::Policy::cuda, - RAJA::Pattern::region, - detail::get_launch::value, - RAJA::Platform::cuda> { +template +struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< + RAJA::Policy::cuda, + RAJA::Pattern::region, + detail::get_launch::value, + RAJA::Platform::cuda> { }; @@ -261,7 +261,8 @@ using policy::cuda::cuda_synchronize; namespace expt { - using policy::cuda::cuda_launch_t; + template + using cuda_launch_t = policy::cuda::cuda_launch_explicit_t; } diff --git a/include/RAJA/policy/cuda/teams.hpp b/include/RAJA/policy/cuda/teams.hpp index 164c574c08..c5d5b74ef2 100644 --- a/include/RAJA/policy/cuda/teams.hpp +++ b/include/RAJA/policy/cuda/teams.hpp @@ -155,8 +155,8 @@ struct LaunchExecute> { } }; -template -__launch_bounds__(num_threads, 1) __global__ +template +__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) { using RAJA::internal::thread_privatize; @@ -174,7 +174,122 @@ struct LaunchExecute> { { using BODY = camp::decay; - auto func = launch_global_fcn_fixed; + auto func = launch_global_fcn_fixed; + + resources::Cuda cuda_res = resources::Cuda::get_default(); + + // + // Compute the number of blocks and threads + // + + cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), 
+ static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr cuda_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::cuda::make_launch_body( + gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); + } + + RAJA_FT_END; + } + + } + + template + static resources::EventProxy + exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn; + + /*Get the concrete resource */ + resources::Cuda cuda_res = res.get(); + + // + // Compute the number of blocks and threads + // + + cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr cuda_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::cuda::make_launch_body( + gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + { + RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); + } + } + + RAJA_FT_END; + } + + return resources::EventProxy(res); + } + +}; + + +template +struct LaunchExecute> { + + template + static void exec(LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn_fixed; resources::Cuda cuda_res = resources::Cuda::get_default(); diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 8d0db60ca1..8cb14de4b0 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -78,12 +78,12 @@ struct hip_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Platform::hip> { }; -template -struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t< - RAJA::Policy::hip, - RAJA::Pattern::region, - detail::get_launch::value, - RAJA::Platform::hip> { +template +struct hip_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< + RAJA::Policy::hip, + RAJA::Pattern::region, + detail::get_launch::value, + RAJA::Platform::hip> { }; @@ -265,7 +265,8 @@ using policy::hip::hip_synchronize; namespace expt { - using policy::hip::hip_launch_t; + template + using hip_launch_t = policy::hip::hip_launch_explicit_t; } /*! 
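The explicit launch policy introduced in this patch feeds the same (num_threads, blocks-per-SM) pair into launch_global_fcn_fixed via __launch_bounds__, much as patch 03 did for the kernel policies (CudaKernelFixedSM / HipKernelFixedSM). A rough sketch of a Teams launch using it follows, under stated assumptions: the policy composition mirrors the test lists later in this patch, the grid shape is arbitrary, the template-parameter order (async, num_threads, blocks per SM) is inferred from the LaunchExecute specializations above, and the Grid/Teams/Threads helpers follow the expt Teams interface of this period, whose names shifted between releases.

    // Host fallback plus an explicit CUDA device launch: synchronous,
    // 256 threads per team, 2 blocks per SM.
    using launch_policy = RAJA::expt::LaunchPolicy<
        RAJA::expt::seq_launch_t,
        RAJA::policy::cuda::cuda_launch_explicit_t<false, 256, 2>>;

    RAJA::expt::launch<launch_policy>(
        RAJA::expt::DEVICE,
        RAJA::expt::Grid(RAJA::expt::Teams(64),     // named Resources in some
                         RAJA::expt::Threads(256)), // releases of this era
        [=] RAJA_HOST_DEVICE (RAJA::expt::LaunchContext ctx) {
          // team/thread loops go here via RAJA::expt::loop<...>(ctx, ...)
        });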
diff --git a/include/RAJA/policy/hip/teams.hpp b/include/RAJA/policy/hip/teams.hpp index d881e61a62..75aaf3d62d 100644 --- a/include/RAJA/policy/hip/teams.hpp +++ b/include/RAJA/policy/hip/teams.hpp @@ -151,8 +151,8 @@ struct LaunchExecute> { }; -template -__launch_bounds__(num_threads, 1) __global__ +template +__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) { using RAJA::internal::thread_privatize; @@ -170,7 +170,119 @@ struct LaunchExecute> { { using BODY = camp::decay; - auto func = launch_global_fcn_fixed; + auto func = launch_global_fcn_fixed; + + resources::Hip hip_res = resources::Hip::get_default(); + + // + // Compute the number of blocks and threads + // + + hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr hip_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::hip::make_launch_body( + gridSize, blockSize, shmem, hip_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); + } + + RAJA_FT_END; + } + + } + + template + static resources::EventProxy + exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn; + + resources::Hip hip_res = res.get(); + + // + // Compute the number of blocks and threads + // + + hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr hip_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::hip::make_launch_body( + gridSize, blockSize, shmem, hip_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); + } + + RAJA_FT_END; + } + + return resources::EventProxy(res); + } + +}; + + +template +struct LaunchExecute> { + + template + static void exec(LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn_fixed; resources::Hip hip_res = resources::Hip::get_default(); diff --git a/test/include/RAJA_test-teams-execpol.hpp b/test/include/RAJA_test-teams-execpol.hpp index 60b3e5c76b..c15773005e 100644 --- a/test/include/RAJA_test-teams-execpol.hpp +++ 
b/test/include/RAJA_test-teams-execpol.hpp @@ -22,8 +22,14 @@ using seq_cuda_policies = camp::list< RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy>; +using seq_cuda_explicit_policies = camp::list< + RAJA::expt::LaunchPolicy>, + RAJA::expt::LoopPolicy, + RAJA::expt::LoopPolicy>; + using Sequential_launch_policies = camp::list< - seq_cuda_policies + seq_cuda_policies, + seq_cuda_explicit_policies >; #elif defined(RAJA_ENABLE_HIP) @@ -32,8 +38,14 @@ using seq_hip_policies = camp::list< RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy>; +using seq_hip_explicit_policies = camp::list< + RAJA::expt::LaunchPolicy>, + RAJA::expt::LoopPolicy, + RAJA::expt::LoopPolicy>; + using Sequential_launch_policies = camp::list< - seq_hip_policies + seq_hip_policies, + seq_hip_explicit_policies >; #else using Sequential_launch_policies = camp::list< @@ -54,8 +66,15 @@ using omp_cuda_policies = camp::list< RAJA::expt::LoopPolicy >; +using omp_cuda_explicit_policies = camp::list< + RAJA::expt::LaunchPolicy>, + RAJA::expt::LoopPolicy, + RAJA::expt::LoopPolicy + >; + using OpenMP_launch_policies = camp::list< - omp_cuda_policies + omp_cuda_policies, + omp_cuda_explicit_policies >; #elif defined(RAJA_ENABLE_HIP) @@ -66,8 +85,15 @@ using omp_hip_policies = camp::list< RAJA::expt::LoopPolicy >; +using omp_hip_explicit_policies = camp::list< + RAJA::expt::LaunchPolicy>, + RAJA::expt::LoopPolicy, + RAJA::expt::LoopPolicy + >; + using OpenMP_launch_policies = camp::list< - omp_hip_policies + omp_hip_policies, + omp_hip_explicit_policies >; #else using OpenMP_launch_policies = camp::list< @@ -82,8 +108,10 @@ using OpenMP_launch_policies = camp::list< #if defined(RAJA_ENABLE_CUDA) using Cuda_launch_policies = camp::list< seq_cuda_policies + , seq_cuda_explicit_policies #if defined(RAJA_ENABLE_OPENMP) , omp_cuda_policies + , omp_cuda_explicit_policies #endif >; #endif // RAJA_ENABLE_CUDA @@ -91,8 +119,10 @@ using Cuda_launch_policies = camp::list< #if defined(RAJA_ENABLE_HIP) using Hip_launch_policies = camp::list< seq_hip_policies + , seq_hip_explicit_policies #if defined(RAJA_ENABLE_OPENMP) , omp_hip_policies + , omp_hip_explicit_policies #endif >; #endif // RAJA_ENABLE_HIP From 6817a3f402ab0b9ad2cde2b570b3467a1abe07fd Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 16:36:43 -0800 Subject: [PATCH 06/21] WorkGroup modifications for CUDA and HIP. --- include/RAJA/policy/cuda/WorkGroup/Vtable.hpp | 17 ++ .../RAJA/policy/cuda/WorkGroup/WorkRunner.hpp | 273 +++++++++++++++++- include/RAJA/policy/cuda/policy.hpp | 11 +- include/RAJA/policy/hip/WorkGroup/Vtable.hpp | 17 ++ .../RAJA/policy/hip/WorkGroup/WorkRunner.hpp | 271 ++++++++++++++++- include/RAJA/policy/hip/policy.hpp | 11 +- test/include/RAJA_test-workgroup.hpp | 12 +- 7 files changed, 599 insertions(+), 13 deletions(-) diff --git a/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp b/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp index ba03b10aae..772ca6b9ce 100644 --- a/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp +++ b/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp @@ -110,6 +110,23 @@ inline const Vtable_T* get_Vtable(cuda_work const&) return &vtable; } +/*! +* Explicit BLOCKS_PER_SM version. 
+* Populate and return a Vtable object where the +* call operator is a device function +*/ +template < typename T, typename Vtable_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async > +inline const Vtable_T* get_Vtable(cuda_work_explicit const&) +{ + static Vtable_T vtable{ + &Vtable_T::template move_construct_destroy, + get_cached_Vtable_cuda_device_call(), + &Vtable_T::template destroy, + sizeof(T) + }; + return &vtable; +} + } // namespace detail } // namespace RAJA diff --git a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp index 37cc4c1e1b..67efaf23c6 100644 --- a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp @@ -87,6 +87,62 @@ struct WorkRunner< } }; +/*! + * Explicit BLOCKS_PER_SM version. + * Runs work in a storage container in order + * and returns any per run resources + */ +template +struct WorkRunner< + RAJA::cuda_work_explicit, + RAJA::ordered, + ALLOCATOR_T, + INDEX_T, + Args...> + : WorkRunnerForallOrdered< + RAJA::cuda_exec_explicit_async, + RAJA::cuda_work_explicit, + RAJA::ordered, + ALLOCATOR_T, + INDEX_T, + Args...> +{ + using base = WorkRunnerForallOrdered< + RAJA::cuda_exec_explicit_async, + RAJA::cuda_work_explicit, + RAJA::ordered, + ALLOCATOR_T, + INDEX_T, + Args...>; + using base::base; + using IndexType = INDEX_T; + using per_run_storage = typename base::per_run_storage; + + /// + /// run the loops in the given work container in order using forall + /// run all loops asynchronously and synchronize after is necessary + /// + template < typename WorkContainer > + per_run_storage run(WorkContainer const& storage, + typename base::resource_type r, Args... args) const + { + per_run_storage run_storage = + base::run(storage, r, std::forward(args)...); + + IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); + + // Only synchronize if we had something to iterate over + if (num_loops > 0 && BLOCK_SIZE > 0) { + if (!Async) { RAJA::cuda::synchronize(r); } + } + + return run_storage; + } +}; + /*! * Runs work in a storage container in reverse order * and returns any per run resources @@ -143,6 +199,63 @@ struct WorkRunner< }; +/*! + * Explicit BLOCKS_PER_SM version. + * Runs work in a storage container in reverse order + * and returns any per run resources + */ +template +struct WorkRunner< + RAJA::cuda_work_explicit, + RAJA::reverse_ordered, + ALLOCATOR_T, + INDEX_T, + Args...> + : WorkRunnerForallReverse< + RAJA::cuda_exec_explicit_async, + RAJA::cuda_work_explicit, + RAJA::reverse_ordered, + ALLOCATOR_T, + INDEX_T, + Args...> +{ + using base = WorkRunnerForallReverse< + RAJA::cuda_exec_explicit_async, + RAJA::cuda_work_explicit, + RAJA::reverse_ordered, + ALLOCATOR_T, + INDEX_T, + Args...>; + using base::base; + using IndexType = INDEX_T; + using per_run_storage = typename base::per_run_storage; + + /// + /// run the loops in the given work container in reverse order using forall + /// run all loops asynchronously and synchronize after is necessary + /// + template < typename WorkContainer > + per_run_storage run(WorkContainer const& storage, + typename base::resource_type r, Args... 
args) const + { + per_run_storage run_storage = + base::run(storage, r, std::forward(args)...); + + IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); + + // Only synchronize if we had something to iterate over + if (num_loops > 0 && BLOCK_SIZE > 0) { + if (!Async) { RAJA::cuda::synchronize(r); } + } + + return run_storage; + } +}; + + /*! * A body and segment holder for storing loops that will be executed * on the device @@ -177,11 +290,12 @@ struct HoldCudaDeviceXThreadblockLoop }; template < size_t BLOCK_SIZE, + size_t BLOCKS_PER_SM, typename StorageIter, typename value_type, typename index_type, typename ... Args > -__launch_bounds__(BLOCK_SIZE, 1) __global__ +__launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__ void cuda_unordered_y_block_global(StorageIter iter, Args... args) { const index_type i_loop = blockIdx.y; @@ -290,7 +404,162 @@ struct WorkRunner< per_run_storage run_storage{}; - auto func = cuda_unordered_y_block_global; + auto func = cuda_unordered_y_block_global; + + // + // Compute the requested iteration space size + // + Iterator begin = std::begin(storage); + Iterator end = std::end(storage); + IndexType num_loops = std::distance(begin, end); + + // Only launch kernel if we have something to iterate over + if (num_loops > 0 && BLOCK_SIZE > 0) { + + index_type average_iterations = m_total_iterations / static_cast(num_loops); + + // + // Compute the number of blocks + // + constexpr index_type block_size = static_cast(BLOCK_SIZE); + cuda_dim_t blockSize{static_cast(block_size), 1, 1}; + cuda_dim_t gridSize{static_cast((average_iterations + block_size - 1) / block_size), + static_cast(num_loops), + 1}; + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Launch the kernel + // + void* func_args[] = { (void*)&begin, (void*)&args... }; + RAJA::cuda::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async); + } + + RAJA_FT_END; + } + + return run_storage; + } + + // clear any state so ready to be destroyed or reused + void clear() + { + m_total_iterations = 0; + } + +private: + index_type m_total_iterations = 0; +}; + + +/*! + * Explicit BLOCKS_PER_SM version. 
+ * Runs work in a storage container out of order with loops mapping to + * cuda blocks in the y direction and iterations mapping to threads in + * the x direction, with the number of threads in the x dimension determined + * by the average number of iterates per loop + */ +template +struct WorkRunner< + RAJA::cuda_work_explicit, + RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average, + ALLOCATOR_T, + INDEX_T, + Args...> +{ + using exec_policy = RAJA::cuda_work_explicit; + using order_policy = RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average; + using Allocator = ALLOCATOR_T; + using index_type = INDEX_T; + using resource_type = resources::Cuda; + + using vtable_type = Vtable, Args...>; + + WorkRunner() = default; + + WorkRunner(WorkRunner const&) = delete; + WorkRunner& operator=(WorkRunner const&) = delete; + + WorkRunner(WorkRunner && o) + : m_total_iterations(o.m_total_iterations) + { + o.m_total_iterations = 0; + } + WorkRunner& operator=(WorkRunner && o) + { + m_total_iterations = o.m_total_iterations; + + o.m_total_iterations = 0; + return *this; + } + + // The type that will hold the segment and loop body in work storage + template < typename ITERABLE, typename LOOP_BODY > + using holder_type = HoldCudaDeviceXThreadblockLoop; + + // The policy indicating where the call function is invoked + // in this case the values are called on the device + using vtable_exec_policy = exec_policy; + + // runner interfaces with storage to enqueue so the runner can get + // information from the segment and loop at enqueue time + template < typename WorkContainer, typename Iterable, typename LoopBody > + inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body) + { + using Iterator = camp::decay; + using LOOP_BODY = camp::decay; + using ITERABLE = camp::decay; + using IndexType = camp::decay; + + using holder = holder_type; + + // using true_value_type = typename WorkContainer::template true_value_type; + + Iterator begin = std::begin(iter); + Iterator end = std::end(iter); + IndexType len = std::distance(begin, end); + + // Only launch kernel if we have something to iterate over + if (len > 0 && BLOCK_SIZE > 0) { + + m_total_iterations += len; + + // + // TODO: Privatize the loop_body, using make_launch_body to setup reductions + // + // LOOP_BODY body = RAJA::cuda::make_launch_body( + // gridSize, blockSize, shmem, stream, std::forward(loop_body)); + + storage.template emplace( + get_Vtable(vtable_exec_policy{}), + std::forward(iter), std::forward(loop_body)); + } + } + + // no extra storage required here + using per_run_storage = int; + + template < typename WorkContainer > + per_run_storage run(WorkContainer const& storage, resource_type r, Args... 
args) const + { + using Iterator = camp::decay; + using IndexType = camp::decay; + using value_type = typename WorkContainer::value_type; + + per_run_storage run_storage{}; + + auto func = cuda_unordered_y_block_global; // // Compute the requested iteration space size diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 94817479c9..e28db8109f 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -101,8 +101,8 @@ struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform /// /// WorkGroup execution policies /// -template -struct cuda_work : public RAJA::make_policy_pattern_launch_platform_t< +template +struct cuda_work_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::workgroup_exec, detail::get_launch::value, @@ -231,10 +231,13 @@ using cuda_exec = policy::cuda::cuda_exec_explicit; template using cuda_exec_async = policy::cuda::cuda_exec_explicit; -using policy::cuda::cuda_work; +using policy::cuda::cuda_work_explicit; + +template +using cuda_work = policy::cuda::cuda_work_explicit; template -using cuda_work_async = policy::cuda::cuda_work; +using cuda_work_async = policy::cuda::cuda_work_explicit; using policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average; diff --git a/include/RAJA/policy/hip/WorkGroup/Vtable.hpp b/include/RAJA/policy/hip/WorkGroup/Vtable.hpp index e4ce5212bf..fceac6c102 100644 --- a/include/RAJA/policy/hip/WorkGroup/Vtable.hpp +++ b/include/RAJA/policy/hip/WorkGroup/Vtable.hpp @@ -109,6 +109,23 @@ inline const Vtable_T* get_Vtable(hip_work const&) return &vtable; } +/*! +* Explicit BLOCKS_PER_SM version. +* Populate and return a Vtable object where the +* call operator is a device function +*/ +template < typename T, typename Vtable_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async > +inline const Vtable_T* get_Vtable(hip_work_explicit const&) +{ + static Vtable_T vtable{ + &Vtable_T::template move_construct_destroy, + get_cached_Vtable_hip_device_call(), + &Vtable_T::template destroy, + sizeof(T) + }; + return &vtable; +} + #endif } // namespace detail diff --git a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp index 0b1e060a6d..a42577f643 100644 --- a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp @@ -87,6 +87,62 @@ struct WorkRunner< } }; +/*! + * Explicit BLOCKS_PER_SM version. + * Runs work in a storage container in order + * and returns any per run resources + */ +template +struct WorkRunner< + RAJA::hip_work_explicit, + RAJA::ordered, + ALLOCATOR_T, + INDEX_T, + Args...> + : WorkRunnerForallOrdered< + RAJA::hip_exec_explicit_async, + RAJA::hip_work_explicit, + RAJA::ordered, + ALLOCATOR_T, + INDEX_T, + Args...> +{ + using base = WorkRunnerForallOrdered< + RAJA::hip_exec_explicit_async, + RAJA::hip_work_explicit, + RAJA::ordered, + ALLOCATOR_T, + INDEX_T, + Args...>; + using base::base; + using IndexType = INDEX_T; + using per_run_storage = typename base::per_run_storage; + + /// + /// run the loops in the given work container in order using forall + /// run all loops asynchronously and synchronize after is necessary + /// + template < typename WorkContainer > + per_run_storage run(WorkContainer const& storage, + typename base::resource_type r, Args... 
args) const + { + per_run_storage run_storage = + base::run(storage, r, std::forward(args)...); + + IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); + + // Only synchronize if we had something to iterate over + if (num_loops > 0 && BLOCK_SIZE > 0) { + if (!Async) { RAJA::hip::synchronize(r); } + } + + return run_storage; + } +}; + /*! * Runs work in a storage container in reverse order * and returns any per run resources @@ -143,6 +199,63 @@ struct WorkRunner< }; +/*! + * Explicit BLOCKS_PER_SM version. + * Runs work in a storage container in reverse order + * and returns any per run resources + */ +template +struct WorkRunner< + RAJA::hip_work_explicit, + RAJA::reverse_ordered, + ALLOCATOR_T, + INDEX_T, + Args...> + : WorkRunnerForallReverse< + RAJA::hip_exec_explicit_async, + RAJA::hip_work_explicit, + RAJA::reverse_ordered, + ALLOCATOR_T, + INDEX_T, + Args...> +{ + using base = WorkRunnerForallReverse< + RAJA::hip_exec_explicit_async, + RAJA::hip_work_explicit, + RAJA::reverse_ordered, + ALLOCATOR_T, + INDEX_T, + Args...>; + using base::base; + using IndexType = INDEX_T; + using per_run_storage = typename base::per_run_storage; + + /// + /// run the loops in the given work container in reverse order using forall + /// run all loops asynchronously and synchronize after is necessary + /// + template < typename WorkContainer > + per_run_storage run(WorkContainer const& storage, + typename base::resource_type r, Args... args) const + { + per_run_storage run_storage = + base::run(storage, r, std::forward(args)...); + + IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); + + // Only synchronize if we had something to iterate over + if (num_loops > 0 && BLOCK_SIZE > 0) { + if (!Async) { RAJA::hip::synchronize(r); } + } + + return run_storage; + } +}; + + #if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) /*! @@ -179,11 +292,12 @@ struct HoldHipDeviceXThreadblockLoop }; template < size_t BLOCK_SIZE, + size_t BLOCKS_PER_SM, typename StorageIter, typename value_type, typename index_type, typename ... Args > -__launch_bounds__(BLOCK_SIZE, 1) __global__ +__launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__ void hip_unordered_y_block_global(StorageIter iter, Args... args) { const index_type i_loop = blockIdx.y; @@ -346,6 +460,161 @@ struct WorkRunner< index_type m_total_iterations = 0; }; +/*! + * Explicit BLOCKS_PER_SM version. 
+ * Runs work in a storage container out of order with loops mapping to + * hip blocks in the y direction and iterations mapping to threads in + * the x direction, with the number of threads in the x dimension determined + * by the average number of iterates per loop + */ +template +struct WorkRunner< + RAJA::hip_work_explicit, + RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average, + ALLOCATOR_T, + INDEX_T, + Args...> +{ + using exec_policy = RAJA::hip_work_explicit; + using order_policy = RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average; + using Allocator = ALLOCATOR_T; + using index_type = INDEX_T; + using resource_type = resources::Hip; + + using vtable_type = Vtable, Args...>; + + WorkRunner() = default; + + WorkRunner(WorkRunner const&) = delete; + WorkRunner& operator=(WorkRunner const&) = delete; + + WorkRunner(WorkRunner && o) + : m_total_iterations(o.m_total_iterations) + { + o.m_total_iterations = 0; + } + WorkRunner& operator=(WorkRunner && o) + { + m_total_iterations = o.m_total_iterations; + + o.m_total_iterations = 0; + return *this; + } + + // The type that will hold the segment and loop body in work storage + template < typename ITERABLE, typename LOOP_BODY > + using holder_type = HoldHipDeviceXThreadblockLoop; + + // The policy indicating where the call function is invoked + // in this case the values are called on the device + using vtable_exec_policy = exec_policy; + + // runner interfaces with storage to enqueue so the runner can get + // information from the segment and loop at enqueue time + template < typename WorkContainer, typename Iterable, typename LoopBody > + inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body) + { + using Iterator = camp::decay; + using LOOP_BODY = camp::decay; + using ITERABLE = camp::decay; + using IndexType = camp::decay; + + using holder = holder_type; + + // using true_value_type = typename WorkContainer::template true_value_type; + + Iterator begin = std::begin(iter); + Iterator end = std::end(iter); + IndexType len = std::distance(begin, end); + + // Only launch kernel if we have something to iterate over + if (len > 0 && BLOCK_SIZE > 0) { + + m_total_iterations += len; + + // + // TODO: Privatize the loop_body, using make_launch_body to setup reductions + // + // LOOP_BODY body = RAJA::hip::make_launch_body( + // gridSize, blockSize, shmem, stream, std::forward(loop_body)); + + storage.template emplace( + get_Vtable(vtable_exec_policy{}), + std::forward(iter), std::forward(loop_body)); + } + } + + // no extra storage required here + using per_run_storage = int; + + template < typename WorkContainer > + per_run_storage run(WorkContainer const& storage, resource_type r, Args... 
args) const + { + using Iterator = camp::decay; + using IndexType = camp::decay; + using value_type = typename WorkContainer::value_type; + + per_run_storage run_storage{}; + + auto func = hip_unordered_y_block_global; + + // + // Compute the requested iteration space size + // + Iterator begin = std::begin(storage); + Iterator end = std::end(storage); + IndexType num_loops = std::distance(begin, end); + + // Only launch kernel if we have something to iterate over + if (num_loops > 0 && BLOCK_SIZE > 0) { + + index_type average_iterations = m_total_iterations / static_cast(num_loops); + + // + // Compute the number of blocks + // + constexpr index_type block_size = static_cast(BLOCK_SIZE); + hip_dim_t blockSize{static_cast(block_size), 1, 1}; + hip_dim_t gridSize{static_cast((average_iterations + block_size - 1) / block_size), + static_cast(num_loops), + 1}; + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Launch the kernel + // + void* func_args[] = { (void*)&begin, (void*)&args... }; + RAJA::hip::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async); + } + + RAJA_FT_END; + } + + return run_storage; + } + + // clear any state so ready to be destroyed or reused + void clear() + { + m_total_iterations = 0; + } + +private: + index_type m_total_iterations = 0; +}; + + #endif } // namespace detail diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 8cb14de4b0..34b23bf51e 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -94,8 +94,8 @@ struct hip_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_ /// /// WorkGroup execution policies /// -template -struct hip_work : public RAJA::make_policy_pattern_launch_platform_t< +template +struct hip_work_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::workgroup_exec, detail::get_launch::value, @@ -233,10 +233,13 @@ using hip_exec = policy::hip::hip_exec_explicit; template using hip_exec_async = policy::hip::hip_exec_explicit; -using policy::hip::hip_work; +using policy::hip::hip_work_explicit; + +template +using hip_work = policy::hip::hip_work_explicit; template -using hip_work_async = policy::hip::hip_work; +using hip_work_async = policy::hip::hip_work_explicit; using policy::hip::hip_atomic; using policy::hip::hip_atomic_explicit; diff --git a/test/include/RAJA_test-workgroup.hpp b/test/include/RAJA_test-workgroup.hpp index bc8e59ff25..e2f44039ee 100644 --- a/test/include/RAJA_test-workgroup.hpp +++ b/test/include/RAJA_test-workgroup.hpp @@ -360,8 +360,13 @@ using OpenMPTargetStoragePolicyList = SequentialStoragePolicyList; #if defined(RAJA_ENABLE_CUDA) using CudaExecPolicyList = camp::list< + #if defined(RAJA_TEST_EXHAUSTIVE) + // avoid compilation error: + // tpl/camp/include/camp/camp.hpp(104): error #456: excessive recursion at instantiation of class RAJA::cuda_work<256>, - RAJA::cuda_work<1024> + #endif + RAJA::cuda_work<1024>, + RAJA::cuda_work_explicit<256, 2> >; using CudaOrderedPolicyList = SequentialOrderedPolicyList; using CudaOrderPolicyList = @@ -376,8 +381,11 @@ using CudaStoragePolicyList = SequentialStoragePolicyList; #if defined(RAJA_ENABLE_HIP) using HipExecPolicyList = camp::list< + #if defined(RAJA_TEST_EXHAUSTIVE) RAJA::hip_work<256>, - RAJA::hip_work<1024> + #endif + RAJA::hip_work<1024>, + RAJA::hip_work_explicit<256, 2> >; using HipOrderedPolicyList = SequentialOrderedPolicyList; using HipOrderPolicyList = From 
454edc11569a334404bcb65f792d215685bf1cbf Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 17:58:03 -0800 Subject: [PATCH 07/21] Remove old redundant policies. Hide _launch_explicit_t under expt. --- include/RAJA/policy/cuda/WorkGroup/Vtable.hpp | 17 -- .../RAJA/policy/cuda/WorkGroup/WorkRunner.hpp | 268 ------------------ include/RAJA/policy/cuda/policy.hpp | 8 +- include/RAJA/policy/cuda/teams.hpp | 244 +--------------- include/RAJA/policy/hip/WorkGroup/Vtable.hpp | 17 -- .../RAJA/policy/hip/WorkGroup/WorkRunner.hpp | 267 ----------------- include/RAJA/policy/hip/policy.hpp | 5 +- include/RAJA/policy/hip/teams.hpp | 237 +--------------- test/include/RAJA_test-teams-execpol.hpp | 8 +- 9 files changed, 17 insertions(+), 1054 deletions(-) diff --git a/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp b/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp index 772ca6b9ce..7f76961d3e 100644 --- a/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp +++ b/include/RAJA/policy/cuda/WorkGroup/Vtable.hpp @@ -98,23 +98,6 @@ inline typename Vtable_T::call_sig get_cached_Vtable_cuda_device_call() * Populate and return a Vtable object where the * call operator is a device function */ -template < typename T, typename Vtable_T, size_t BLOCK_SIZE, bool Async > -inline const Vtable_T* get_Vtable(cuda_work const&) -{ - static Vtable_T vtable{ - &Vtable_T::template move_construct_destroy, - get_cached_Vtable_cuda_device_call(), - &Vtable_T::template destroy, - sizeof(T) - }; - return &vtable; -} - -/*! -* Explicit BLOCKS_PER_SM version. -* Populate and return a Vtable object where the -* call operator is a device function -*/ template < typename T, typename Vtable_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async > inline const Vtable_T* get_Vtable(cuda_work_explicit const&) { diff --git a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp index 67efaf23c6..46e109c974 100644 --- a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp @@ -36,62 +36,6 @@ namespace detail * Runs work in a storage container in order * and returns any per run resources */ -template -struct WorkRunner< - RAJA::cuda_work, - RAJA::ordered, - ALLOCATOR_T, - INDEX_T, - Args...> - : WorkRunnerForallOrdered< - RAJA::cuda_exec_async, - RAJA::cuda_work, - RAJA::ordered, - ALLOCATOR_T, - INDEX_T, - Args...> -{ - using base = WorkRunnerForallOrdered< - RAJA::cuda_exec_async, - RAJA::cuda_work, - RAJA::ordered, - ALLOCATOR_T, - INDEX_T, - Args...>; - using base::base; - using IndexType = INDEX_T; - using per_run_storage = typename base::per_run_storage; - - /// - /// run the loops in the given work container in order using forall - /// run all loops asynchronously and synchronize after is necessary - /// - template < typename WorkContainer > - per_run_storage run(WorkContainer const& storage, - typename base::resource_type r, Args... args) const - { - per_run_storage run_storage = - base::run(storage, r, std::forward(args)...); - - IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); - - // Only synchronize if we had something to iterate over - if (num_loops > 0 && BLOCK_SIZE > 0) { - if (!Async) { RAJA::cuda::synchronize(r); } - } - - return run_storage; - } -}; - -/*! - * Explicit BLOCKS_PER_SM version. 
- * Runs work in a storage container in order - * and returns any per run resources - */ template -struct WorkRunner< - RAJA::cuda_work, - RAJA::reverse_ordered, - ALLOCATOR_T, - INDEX_T, - Args...> - : WorkRunnerForallReverse< - RAJA::cuda_exec_async, - RAJA::cuda_work, - RAJA::reverse_ordered, - ALLOCATOR_T, - INDEX_T, - Args...> -{ - using base = WorkRunnerForallReverse< - RAJA::cuda_exec_async, - RAJA::cuda_work, - RAJA::reverse_ordered, - ALLOCATOR_T, - INDEX_T, - Args...>; - using base::base; - using IndexType = INDEX_T; - using per_run_storage = typename base::per_run_storage; - - /// - /// run the loops in the given work container in reverse order using forall - /// run all loops asynchronously and synchronize after is necessary - /// - template < typename WorkContainer > - per_run_storage run(WorkContainer const& storage, - typename base::resource_type r, Args... args) const - { - per_run_storage run_storage = - base::run(storage, r, std::forward(args)...); - - IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); - - // Only synchronize if we had something to iterate over - if (num_loops > 0 && BLOCK_SIZE > 0) { - if (!Async) { RAJA::cuda::synchronize(r); } - } - - return run_storage; - } -}; - - -/*! - * Explicit BLOCKS_PER_SM version. - * Runs work in a storage container in reverse order - * and returns any per run resources - */ template -struct WorkRunner< - RAJA::cuda_work, - RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average, - ALLOCATOR_T, - INDEX_T, - Args...> -{ - using exec_policy = RAJA::cuda_work; - using order_policy = RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average; - using Allocator = ALLOCATOR_T; - using index_type = INDEX_T; - using resource_type = resources::Cuda; - - using vtable_type = Vtable, Args...>; - - WorkRunner() = default; - - WorkRunner(WorkRunner const&) = delete; - WorkRunner& operator=(WorkRunner const&) = delete; - - WorkRunner(WorkRunner && o) - : m_total_iterations(o.m_total_iterations) - { - o.m_total_iterations = 0; - } - WorkRunner& operator=(WorkRunner && o) - { - m_total_iterations = o.m_total_iterations; - - o.m_total_iterations = 0; - return *this; - } - - // The type that will hold the segment and loop body in work storage - template < typename ITERABLE, typename LOOP_BODY > - using holder_type = HoldCudaDeviceXThreadblockLoop; - - // The policy indicating where the call function is invoked - // in this case the values are called on the device - using vtable_exec_policy = exec_policy; - - // runner interfaces with storage to enqueue so the runner can get - // information from the segment and loop at enqueue time - template < typename WorkContainer, typename Iterable, typename LoopBody > - inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body) - { - using Iterator = camp::decay; - using LOOP_BODY = camp::decay; - using ITERABLE = camp::decay; - using IndexType = camp::decay; - - using holder = holder_type; - - // using true_value_type = typename WorkContainer::template true_value_type; - - Iterator begin = std::begin(iter); - Iterator end = std::end(iter); - IndexType len = std::distance(begin, end); - - // Only launch kernel if we have something to iterate over - if (len > 0 && BLOCK_SIZE > 0) { - - m_total_iterations += len; - - // - // TODO: Privatize the loop_body, using make_launch_body to setup reductions - // - // LOOP_BODY body = RAJA::cuda::make_launch_body( - // gridSize, blockSize, shmem, stream, std::forward(loop_body)); - - 
storage.template emplace( - get_Vtable(vtable_exec_policy{}), - std::forward(iter), std::forward(loop_body)); - } - } - - // no extra storage required here - using per_run_storage = int; - - template < typename WorkContainer > - per_run_storage run(WorkContainer const& storage, resource_type r, Args... args) const - { - using Iterator = camp::decay; - using IndexType = camp::decay; - using value_type = typename WorkContainer::value_type; - - per_run_storage run_storage{}; - - auto func = cuda_unordered_y_block_global; - - // - // Compute the requested iteration space size - // - Iterator begin = std::begin(storage); - Iterator end = std::end(storage); - IndexType num_loops = std::distance(begin, end); - - // Only launch kernel if we have something to iterate over - if (num_loops > 0 && BLOCK_SIZE > 0) { - - index_type average_iterations = m_total_iterations / static_cast(num_loops); - - // - // Compute the number of blocks - // - constexpr index_type block_size = static_cast(BLOCK_SIZE); - cuda_dim_t blockSize{static_cast(block_size), 1, 1}; - cuda_dim_t gridSize{static_cast((average_iterations + block_size - 1) / block_size), - static_cast(num_loops), - 1}; - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Launch the kernel - // - void* func_args[] = { (void*)&begin, (void*)&args... }; - RAJA::cuda::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async); - } - - RAJA_FT_END; - } - - return run_storage; - } - - // clear any state so ready to be destroyed or reused - void clear() - { - m_total_iterations = 0; - } - -private: - index_type m_total_iterations = 0; -}; - - -/*! - * Explicit BLOCKS_PER_SM version. - * Runs work in a storage container out of order with loops mapping to - * cuda blocks in the y direction and iterations mapping to threads in - * the x direction, with the number of threads in the x dimension determined - * by the average number of iterates per loop - */ template { }; +namespace expt +{ template struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, @@ -89,7 +91,7 @@ struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform detail::get_launch::value, RAJA::Platform::cuda> { }; - +} @@ -228,7 +230,7 @@ using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit using cuda_exec = policy::cuda::cuda_exec_explicit; -template +template using cuda_exec_async = policy::cuda::cuda_exec_explicit; using policy::cuda::cuda_work_explicit; @@ -265,7 +267,7 @@ using policy::cuda::cuda_synchronize; namespace expt { template - using cuda_launch_t = policy::cuda::cuda_launch_explicit_t; + using cuda_launch_t = policy::cuda::expt::cuda_launch_explicit_t; } diff --git a/include/RAJA/policy/cuda/teams.hpp b/include/RAJA/policy/cuda/teams.hpp index c5d5b74ef2..adc900f93d 100644 --- a/include/RAJA/policy/cuda/teams.hpp +++ b/include/RAJA/policy/cuda/teams.hpp @@ -31,130 +31,6 @@ namespace RAJA namespace expt { -template -__global__ void launch_global_fcn(LaunchContext ctx, BODY body_in) -{ - using RAJA::internal::thread_privatize; - auto privatizer = thread_privatize(body_in); - auto& body = privatizer.get_priv(); - body(ctx); -} - -template -struct LaunchExecute> { - - template - static void exec(LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn; - - resources::Cuda cuda_res = resources::Cuda::get_default(); - - // - // Compute the number of blocks and threads - // - - cuda_dim_t 
gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr cuda_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::cuda::make_launch_body( - gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - { - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); - } - } - - RAJA_FT_END; - } - - } - - template - static resources::EventProxy - exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn; - - /*Get the concrete resource */ - resources::Cuda cuda_res = res.get(); - - // - // Compute the number of blocks and threads - // - - cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr cuda_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::cuda::make_launch_body( - gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - { - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); - } - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } -}; - template __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) @@ -165,124 +41,8 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ body(ctx); } - -template -struct LaunchExecute> { - - template - static void exec(LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn_fixed; - - resources::Cuda cuda_res = resources::Cuda::get_default(); - - // - // Compute the number of blocks and threads - // - - cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr cuda_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, 
using make_launch_body to setup reductions - // - BODY body = RAJA::cuda::make_launch_body( - gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); - } - - RAJA_FT_END; - } - - } - - template - static resources::EventProxy - exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn; - - /*Get the concrete resource */ - resources::Cuda cuda_res = res.get(); - - // - // Compute the number of blocks and threads - // - - cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr cuda_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::cuda::make_launch_body( - gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - { - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); - } - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } - -}; - - template -struct LaunchExecute> { +struct LaunchExecute> { template static void exec(LaunchContext const &ctx, BODY_IN &&body_in) @@ -342,7 +102,7 @@ struct LaunchExecute; - auto func = launch_global_fcn; + auto func = launch_global_fcn_fixed; /*Get the concrete resource */ resources::Cuda cuda_res = res.get(); diff --git a/include/RAJA/policy/hip/WorkGroup/Vtable.hpp b/include/RAJA/policy/hip/WorkGroup/Vtable.hpp index fceac6c102..f2369b23c9 100644 --- a/include/RAJA/policy/hip/WorkGroup/Vtable.hpp +++ b/include/RAJA/policy/hip/WorkGroup/Vtable.hpp @@ -97,23 +97,6 @@ inline typename Vtable_T::call_sig get_cached_Vtable_hip_device_call() * Populate and return a Vtable object where the * call operator is a device function */ -template < typename T, typename Vtable_T, size_t BLOCK_SIZE, bool Async > -inline const Vtable_T* get_Vtable(hip_work const&) -{ - static Vtable_T vtable{ - &Vtable_T::template move_construct_destroy, - get_cached_Vtable_hip_device_call(), - &Vtable_T::template destroy, - sizeof(T) - }; - return &vtable; -} - -/*! -* Explicit BLOCKS_PER_SM version. 
-* Populate and return a Vtable object where the -* call operator is a device function -*/ template < typename T, typename Vtable_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async > inline const Vtable_T* get_Vtable(hip_work_explicit const&) { diff --git a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp index a42577f643..a494ca6299 100644 --- a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp @@ -36,62 +36,6 @@ namespace detail * Runs work in a storage container in order * and returns any per run resources */ -template -struct WorkRunner< - RAJA::hip_work, - RAJA::ordered, - ALLOCATOR_T, - INDEX_T, - Args...> - : WorkRunnerForallOrdered< - RAJA::hip_exec_async, - RAJA::hip_work, - RAJA::ordered, - ALLOCATOR_T, - INDEX_T, - Args...> -{ - using base = WorkRunnerForallOrdered< - RAJA::hip_exec_async, - RAJA::hip_work, - RAJA::ordered, - ALLOCATOR_T, - INDEX_T, - Args...>; - using base::base; - using IndexType = INDEX_T; - using per_run_storage = typename base::per_run_storage; - - /// - /// run the loops in the given work container in order using forall - /// run all loops asynchronously and synchronize after is necessary - /// - template < typename WorkContainer > - per_run_storage run(WorkContainer const& storage, - typename base::resource_type r, Args... args) const - { - per_run_storage run_storage = - base::run(storage, r, std::forward(args)...); - - IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); - - // Only synchronize if we had something to iterate over - if (num_loops > 0 && BLOCK_SIZE > 0) { - if (!Async) { RAJA::hip::synchronize(r); } - } - - return run_storage; - } -}; - -/*! - * Explicit BLOCKS_PER_SM version. - * Runs work in a storage container in order - * and returns any per run resources - */ template -struct WorkRunner< - RAJA::hip_work, - RAJA::reverse_ordered, - ALLOCATOR_T, - INDEX_T, - Args...> - : WorkRunnerForallReverse< - RAJA::hip_exec_async, - RAJA::hip_work, - RAJA::reverse_ordered, - ALLOCATOR_T, - INDEX_T, - Args...> -{ - using base = WorkRunnerForallReverse< - RAJA::hip_exec_async, - RAJA::hip_work, - RAJA::reverse_ordered, - ALLOCATOR_T, - INDEX_T, - Args...>; - using base::base; - using IndexType = INDEX_T; - using per_run_storage = typename base::per_run_storage; - - /// - /// run the loops in the given work container in reverse order using forall - /// run all loops asynchronously and synchronize after is necessary - /// - template < typename WorkContainer > - per_run_storage run(WorkContainer const& storage, - typename base::resource_type r, Args... args) const - { - per_run_storage run_storage = - base::run(storage, r, std::forward(args)...); - - IndexType num_loops = std::distance(std::begin(storage), std::end(storage)); - - // Only synchronize if we had something to iterate over - if (num_loops > 0 && BLOCK_SIZE > 0) { - if (!Async) { RAJA::hip::synchronize(r); } - } - - return run_storage; - } -}; - - -/*! - * Explicit BLOCKS_PER_SM version. 
- * Runs work in a storage container in reverse order - * and returns any per run resources - */ template -struct WorkRunner< - RAJA::hip_work, - RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average, - ALLOCATOR_T, - INDEX_T, - Args...> -{ - using exec_policy = RAJA::hip_work; - using order_policy = RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average; - using Allocator = ALLOCATOR_T; - using index_type = INDEX_T; - using resource_type = resources::Hip; - - using vtable_type = Vtable, Args...>; - - WorkRunner() = default; - - WorkRunner(WorkRunner const&) = delete; - WorkRunner& operator=(WorkRunner const&) = delete; - - WorkRunner(WorkRunner && o) - : m_total_iterations(o.m_total_iterations) - { - o.m_total_iterations = 0; - } - WorkRunner& operator=(WorkRunner && o) - { - m_total_iterations = o.m_total_iterations; - - o.m_total_iterations = 0; - return *this; - } - - // The type that will hold the segment and loop body in work storage - template < typename ITERABLE, typename LOOP_BODY > - using holder_type = HoldHipDeviceXThreadblockLoop; - - // The policy indicating where the call function is invoked - // in this case the values are called on the device - using vtable_exec_policy = exec_policy; - - // runner interfaces with storage to enqueue so the runner can get - // information from the segment and loop at enqueue time - template < typename WorkContainer, typename Iterable, typename LoopBody > - inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body) - { - using Iterator = camp::decay; - using LOOP_BODY = camp::decay; - using ITERABLE = camp::decay; - using IndexType = camp::decay; - - using holder = holder_type; - - // using true_value_type = typename WorkContainer::template true_value_type; - - Iterator begin = std::begin(iter); - Iterator end = std::end(iter); - IndexType len = std::distance(begin, end); - - // Only launch kernel if we have something to iterate over - if (len > 0 && BLOCK_SIZE > 0) { - - m_total_iterations += len; - - // - // TODO: Privatize the loop_body, using make_launch_body to setup reductions - // - // LOOP_BODY body = RAJA::hip::make_launch_body( - // gridSize, blockSize, shmem, stream, std::forward(loop_body)); - - storage.template emplace( - get_Vtable(vtable_exec_policy{}), - std::forward(iter), std::forward(loop_body)); - } - } - - // no extra storage required here - using per_run_storage = int; - - template < typename WorkContainer > - per_run_storage run(WorkContainer const& storage, resource_type r, Args... 
args) const - { - using Iterator = camp::decay; - using IndexType = camp::decay; - using value_type = typename WorkContainer::value_type; - - per_run_storage run_storage{}; - - auto func = hip_unordered_y_block_global; - - // - // Compute the requested iteration space size - // - Iterator begin = std::begin(storage); - Iterator end = std::end(storage); - IndexType num_loops = std::distance(begin, end); - - // Only launch kernel if we have something to iterate over - if (num_loops > 0 && BLOCK_SIZE > 0) { - - index_type average_iterations = m_total_iterations / static_cast(num_loops); - - // - // Compute the number of blocks - // - constexpr index_type block_size = static_cast(BLOCK_SIZE); - hip_dim_t blockSize{static_cast(block_size), 1, 1}; - hip_dim_t gridSize{static_cast((average_iterations + block_size - 1) / block_size), - static_cast(num_loops), - 1}; - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Launch the kernel - // - void* func_args[] = { (void*)&begin, (void*)&args... }; - RAJA::hip::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async); - } - - RAJA_FT_END; - } - - return run_storage; - } - - // clear any state so ready to be destroyed or reused - void clear() - { - m_total_iterations = 0; - } - -private: - index_type m_total_iterations = 0; -}; - -/*! - * Explicit BLOCKS_PER_SM version. - * Runs work in a storage container out of order with loops mapping to - * hip blocks in the y direction and iterations mapping to threads in - * the x direction, with the number of threads in the x dimension determined - * by the average number of iterates per loop - */ template { }; +namespace expt +{ template struct hip_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, @@ -85,6 +87,7 @@ struct hip_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_ detail::get_launch::value, RAJA::Platform::hip> { }; +} // @@ -230,7 +233,7 @@ using hip_exec_explicit_async = policy::hip::hip_exec_explicit using hip_exec = policy::hip::hip_exec_explicit; -template +template using hip_exec_async = policy::hip::hip_exec_explicit; using policy::hip::hip_work_explicit; diff --git a/include/RAJA/policy/hip/teams.hpp b/include/RAJA/policy/hip/teams.hpp index 75aaf3d62d..d07a37e7d7 100644 --- a/include/RAJA/policy/hip/teams.hpp +++ b/include/RAJA/policy/hip/teams.hpp @@ -31,126 +31,6 @@ namespace RAJA namespace expt { -template -__global__ void launch_global_fcn(LaunchContext ctx, BODY body_in) -{ - using RAJA::internal::thread_privatize; - auto privatizer = thread_privatize(body_in); - auto& body = privatizer.get_priv(); - body(ctx); -} - -template -struct LaunchExecute> { - - template - static void exec(LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn; - - resources::Hip hip_res = resources::Hip::get_default(); - - // - // Compute the number of blocks and threads - // - - hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr hip_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers 
- // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::hip::make_launch_body( - gridSize, blockSize, shmem, hip_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); - } - - RAJA_FT_END; - } - - } - - template - static resources::EventProxy - exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn; - - resources::Hip hip_res = res.get(); - - // - // Compute the number of blocks and threads - // - - hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr hip_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::hip::make_launch_body( - gridSize, blockSize, shmem, hip_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } - -}; - template __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) @@ -161,121 +41,8 @@ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) body(ctx); } - -template -struct LaunchExecute> { - - template - static void exec(LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn_fixed; - - resources::Hip hip_res = resources::Hip::get_default(); - - // - // Compute the number of blocks and threads - // - - hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr hip_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::hip::make_launch_body( - gridSize, blockSize, shmem, hip_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); - } - - RAJA_FT_END; - } - - } - - template - static resources::EventProxy - exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) - { - using BODY = camp::decay; - - auto func = launch_global_fcn; - - resources::Hip hip_res = 
res.get(); - - // - // Compute the number of blocks and threads - // - - hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), - static_cast(ctx.teams.value[1]), - static_cast(ctx.teams.value[2]) }; - - hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), - static_cast(ctx.threads.value[1]), - static_cast(ctx.threads.value[2]) }; - - // Only launch kernel if we have something to iterate over - constexpr hip_dim_member_t zero = 0; - if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { - - RAJA_FT_BEGIN; - - // - // Setup shared memory buffers - // - size_t shmem = 0; - - { - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::hip::make_launch_body( - gridSize, blockSize, shmem, hip_res, std::forward(body_in)); - - // - // Launch the kernel - // - void *args[] = {(void*)&ctx, (void*)&body}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } - -}; - - template -struct LaunchExecute> { +struct LaunchExecute> { template static void exec(LaunchContext const &ctx, BODY_IN &&body_in) @@ -335,7 +102,7 @@ struct LaunchExecute; - auto func = launch_global_fcn; + auto func = launch_global_fcn_fixed; resources::Hip hip_res = res.get(); diff --git a/test/include/RAJA_test-teams-execpol.hpp b/test/include/RAJA_test-teams-execpol.hpp index c15773005e..318d9d3cad 100644 --- a/test/include/RAJA_test-teams-execpol.hpp +++ b/test/include/RAJA_test-teams-execpol.hpp @@ -23,7 +23,7 @@ using seq_cuda_policies = camp::list< RAJA::expt::LoopPolicy>; using seq_cuda_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, + RAJA::expt::LaunchPolicy>, RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy>; @@ -39,7 +39,7 @@ using seq_hip_policies = camp::list< RAJA::expt::LoopPolicy>; using seq_hip_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, + RAJA::expt::LaunchPolicy>, RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy>; @@ -67,7 +67,7 @@ using omp_cuda_policies = camp::list< >; using omp_cuda_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, + RAJA::expt::LaunchPolicy>, RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy >; @@ -86,7 +86,7 @@ using omp_hip_policies = camp::list< >; using omp_hip_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, + RAJA::expt::LaunchPolicy>, RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy >; From 58f143a2535f445a54071bfb1ca6db813c10bf1e Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 18:20:10 -0800 Subject: [PATCH 08/21] Fix Hip namespace. --- include/RAJA/policy/hip/policy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 7169fabb96..75c5003ee9 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -272,7 +272,7 @@ using policy::hip::hip_synchronize; namespace expt { template - using hip_launch_t = policy::hip::hip_launch_explicit_t; + using hip_launch_t = policy::hip::expt::hip_launch_explicit_t; } /*! From 189de7c2494c1177bd77155ddd90a26aeb70629d Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 19:02:05 -0800 Subject: [PATCH 09/21] Fix HIP min warps per execution unit. 
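
On HIP the second argument to __launch_bounds__ is interpreted as the
minimum number of warps resident per execution unit, not as a minimum
block count per multiprocessor as on CUDA, so the changes below convert
the requested blocks-per-SM into a warp count. A minimal sketch of the
arithmetic, assuming the warp width of 32 implied by the divisor in the
diff (the BlockSize/BlocksPerSM values here are illustrative, not taken
from any particular caller):

    #include <cstddef>

    constexpr std::size_t BlockSize   = 256; // threads per block
    constexpr std::size_t BlocksPerSM = 2;   // requested resident blocks per SM
    // HIP expects MIN_WARPS_PER_EXECUTION_UNIT as the second bound:
    constexpr std::size_t MinWarpsPerEU = (BlockSize * BlocksPerSM) / 32; // = 16
    static_assert(MinWarpsPerEU == 16,
                  "value passed as the second __launch_bounds__ argument");
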
--- include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp | 3 ++- include/RAJA/policy/hip/forall.hpp | 3 ++- include/RAJA/policy/hip/kernel/HipKernel.hpp | 3 ++- include/RAJA/policy/hip/teams.hpp | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp index a494ca6299..2b0ceda8e8 100644 --- a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp @@ -178,13 +178,14 @@ struct HoldHipDeviceXThreadblockLoop LoopBody m_body; }; +// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template < size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename StorageIter, typename value_type, typename index_type, typename ... Args > -__launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__ +__launch_bounds__(BLOCK_SIZE, (BLOCK_SIZE * BLOCKS_PER_SM)/32) __global__ void hip_unordered_y_block_global(StorageIter iter, Args... args) { const index_type i_loop = blockIdx.y; diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index 0ec9c5c1e9..db5226524b 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -127,12 +127,13 @@ __device__ __forceinline__ unsigned int getGlobalNumThreads_3D_3D() * ****************************************************************************** */ +// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template -__launch_bounds__(BlockSize, BlocksPerSM) __global__ +__launch_bounds__(BlockSize, (BlockSize * BlocksPerSM)/32) __global__ void forall_hip_kernel(LOOP_BODY loop_body, const Iterator idx, IndexType length) diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 70b1afe88f..d5896a5948 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -211,8 +211,9 @@ __global__ void HipKernelLauncher(Data data) * * This launcher is used by the HipKerelFixed policies. */ +// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template -__launch_bounds__(BlockSize, BlocksPerSM) __global__ +__launch_bounds__(BlockSize, (BlockSize * BlocksPerSM)/32) __global__ void HipKernelLauncherFixed(Data data) { diff --git a/include/RAJA/policy/hip/teams.hpp b/include/RAJA/policy/hip/teams.hpp index d07a37e7d7..09e25049f8 100644 --- a/include/RAJA/policy/hip/teams.hpp +++ b/include/RAJA/policy/hip/teams.hpp @@ -31,8 +31,9 @@ namespace RAJA namespace expt { +// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template -__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ +__launch_bounds__(num_threads, (num_threads * BLOCKS_PER_SM)/32) __global__ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) { using RAJA::internal::thread_privatize; From b8115fd1def8d91e8004b551ff45d3458904e9b7 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 19:38:16 -0800 Subject: [PATCH 10/21] Non-zero HIP threads and blocks. 
--- test/include/RAJA_test-teams-execpol.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/include/RAJA_test-teams-execpol.hpp b/test/include/RAJA_test-teams-execpol.hpp index 318d9d3cad..0311191790 100644 --- a/test/include/RAJA_test-teams-execpol.hpp +++ b/test/include/RAJA_test-teams-execpol.hpp @@ -39,7 +39,7 @@ using seq_hip_policies = camp::list< RAJA::expt::LoopPolicy>; using seq_hip_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, + RAJA::expt::LaunchPolicy>, RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy>; @@ -86,7 +86,7 @@ using omp_hip_policies = camp::list< >; using omp_hip_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, + RAJA::expt::LaunchPolicy>, RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy >; From eed727d80b2e7170cfe30a62ffdefc89bc9e2406 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 22:27:33 -0800 Subject: [PATCH 11/21] Default HIP blocks per SM to 1. --- include/RAJA/policy/hip/policy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 75c5003ee9..4876dafbc9 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -272,7 +272,7 @@ using policy::hip::hip_synchronize; namespace expt { template - using hip_launch_t = policy::hip::expt::hip_launch_explicit_t; + using hip_launch_t = policy::hip::expt::hip_launch_explicit_t; } /*! From cd4ef2e465e1df82864d545aeda3afc838fa6328 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Mon, 29 Nov 2021 22:35:56 -0800 Subject: [PATCH 12/21] HIP math. --- include/RAJA/policy/hip/policy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 4876dafbc9..88583313b9 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -271,7 +271,7 @@ using policy::hip::hip_synchronize; namespace expt { - template + template using hip_launch_t = policy::hip::expt::hip_launch_explicit_t; } From dfc6b9e4988463fd76abe46811a7841cf355b67e Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Tue, 30 Nov 2021 09:54:20 -0800 Subject: [PATCH 13/21] Comment on placeholder cuda_exec. --- include/RAJA/policy/cuda/kernel/CudaKernel.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index 2655ee630b..2d6bcd66d4 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -84,8 +84,8 @@ namespace statement /*! * A RAJA::kernel statement that launches a CUDA kernel. - * - * + * Note - Statement requires a placeholder cuda_exec policy for the sake of + * object oriented inheritance. */ template struct CudaKernelExt From 490a3f6626f0899a827c0237e580f2d89d056e50 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Tue, 30 Nov 2021 19:18:16 -0800 Subject: [PATCH 14/21] CUDA default blocks per SM. --- include/RAJA/policy/cuda/kernel/CudaKernel.hpp | 4 ++-- include/RAJA/policy/cuda/policy.hpp | 15 +++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index 2d6bcd66d4..2335ebad0e 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -68,7 +68,7 @@ struct cuda_explicit_launch {}; * Blocks per SM defaults to 1. 
*/ template -using cuda_launch = cuda_explicit_launch; +using cuda_launch = cuda_explicit_launch; /*! @@ -77,7 +77,7 @@ using cuda_launch = cuda_explicit_launch; * If num_threads is 0 then num_threads is chosen at runtime. */ template -using cuda_occ_calc_launch = cuda_explicit_launch; +using cuda_occ_calc_launch = cuda_explicit_launch; namespace statement { diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index b35b8cb1eb..918b138927 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -74,6 +74,9 @@ namespace policy namespace cuda { +constexpr const size_t MIN_BLOCKS_PER_SM = 1; +constexpr const size_t MAX_BLOCKS_PER_SM = 32; + template struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, @@ -84,7 +87,7 @@ struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< namespace expt { -template +template struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::region, @@ -228,18 +231,18 @@ template using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit; template -using cuda_exec = policy::cuda::cuda_exec_explicit; +using cuda_exec = policy::cuda::cuda_exec_explicit; template -using cuda_exec_async = policy::cuda::cuda_exec_explicit; +using cuda_exec_async = policy::cuda::cuda_exec_explicit; using policy::cuda::cuda_work_explicit; template -using cuda_work = policy::cuda::cuda_work_explicit; +using cuda_work = policy::cuda::cuda_work_explicit; template -using cuda_work_async = policy::cuda::cuda_work_explicit; +using cuda_work_async = policy::cuda::cuda_work_explicit; using policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average; @@ -267,7 +270,7 @@ using policy::cuda::cuda_synchronize; namespace expt { template - using cuda_launch_t = policy::cuda::expt::cuda_launch_explicit_t; + using cuda_launch_t = policy::cuda::expt::cuda_launch_explicit_t; } From 7e48b0c2487f83488f5b8f8467a0649b8a0f138b Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Thu, 2 Dec 2021 14:22:42 -0800 Subject: [PATCH 15/21] Adding Teams default launch execute policy back. 
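
For context on what "default" means here: the re-added specialization
below (LaunchExecute over the default cuda_launch_t) takes its grid and
block dimensions from the LaunchContext at run time and launches
launch_global_fcn, which carries no __launch_bounds__ annotation, while
the explicit path launches launch_global_fcn_fixed with
__launch_bounds__(num_threads, BLOCKS_PER_SM) baked in at compile time.
A sketch of the two policy spellings, assuming the alias signatures used
in this series (the 256/2 values are illustrative, mirroring the
cuda_work_explicit<256, 2> used in the tests):

    // Default: block size chosen at run time; num_threads placeholder unused.
    using dynamic_launch = RAJA::expt::cuda_launch_t<false /*async*/>;

    // Explicit: compile-time bounds fed straight into __launch_bounds__.
    using fixed_launch =
        RAJA::policy::cuda::expt::cuda_launch_explicit_t<false, 256, 2>;
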
--- include/RAJA/policy/cuda/policy.hpp | 3 +- include/RAJA/policy/cuda/teams.hpp | 124 ++++++++++++++++++++++++++++ include/RAJA/policy/hip/policy.hpp | 1 + include/RAJA/policy/hip/teams.hpp | 121 +++++++++++++++++++++++++++ 4 files changed, 248 insertions(+), 1 deletion(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 918b138927..1497141968 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -269,7 +269,8 @@ using policy::cuda::cuda_synchronize; namespace expt { - template + // num_threads defaults to 1, but not expected to be used in kernel launch + template using cuda_launch_t = policy::cuda::expt::cuda_launch_explicit_t; } diff --git a/include/RAJA/policy/cuda/teams.hpp b/include/RAJA/policy/cuda/teams.hpp index adc900f93d..326757e506 100644 --- a/include/RAJA/policy/cuda/teams.hpp +++ b/include/RAJA/policy/cuda/teams.hpp @@ -31,6 +31,130 @@ namespace RAJA namespace expt { +template +__global__ void launch_global_fcn(LaunchContext ctx, BODY body_in) +{ + using RAJA::internal::thread_privatize; + auto privatizer = thread_privatize(body_in); + auto& body = privatizer.get_priv(); + body(ctx); +} + +template +struct LaunchExecute> { +// cuda_launch_t num_threads set to 1, but not used in launch of kernel + + template + static void exec(LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn; + + resources::Cuda cuda_res = resources::Cuda::get_default(); + + // + // Compute the number of blocks and threads + // + + cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr cuda_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::cuda::make_launch_body( + gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); + } + + RAJA_FT_END; + } + + } + + template + static resources::EventProxy + exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn; + + /*Get the concrete resource */ + resources::Cuda cuda_res = res.get(); + + // + // Compute the number of blocks and threads + // + + cuda_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + cuda_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr cuda_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, 
using make_launch_body to setup reductions + // + BODY body = RAJA::cuda::make_launch_body( + gridSize, blockSize, shmem, cuda_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + { + RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, shmem, cuda_res, async, ctx.kernel_name); + } + } + + RAJA_FT_END; + } + + return resources::EventProxy(res); + } + +}; + template __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 88583313b9..f0886b37bd 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -271,6 +271,7 @@ using policy::hip::hip_synchronize; namespace expt { + // num_threads defaults to 1, but not expected to be used in kernel launch template using hip_launch_t = policy::hip::expt::hip_launch_explicit_t; } diff --git a/include/RAJA/policy/hip/teams.hpp b/include/RAJA/policy/hip/teams.hpp index 09e25049f8..d6e8ea28f1 100644 --- a/include/RAJA/policy/hip/teams.hpp +++ b/include/RAJA/policy/hip/teams.hpp @@ -31,6 +31,127 @@ namespace RAJA namespace expt { +template +__global__ static void launch_global_fcn(LaunchContext ctx, BODY body_in) +{ + using RAJA::internal::thread_privatize; + auto privatizer = thread_privatize(body_in); + auto& body = privatizer.get_priv(); + body(ctx); +} + +template +struct LaunchExecute> { +// hip_launch_t num_threads set to 1, but not used in launch of kernel + + template + static void exec(LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn; + + resources::Hip hip_res = resources::Hip::get_default(); + + // + // Compute the number of blocks and threads + // + + hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr hip_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::hip::make_launch_body( + gridSize, blockSize, shmem, hip_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); + } + + RAJA_FT_END; + } + + } + + template + static resources::EventProxy + exec(RAJA::resources::Resource res, LaunchContext const &ctx, BODY_IN &&body_in) + { + using BODY = camp::decay; + + auto func = launch_global_fcn; + + resources::Hip hip_res = res.get(); + + // + // Compute the number of blocks and threads + // + + hip_dim_t gridSize{ static_cast(ctx.teams.value[0]), + static_cast(ctx.teams.value[1]), + static_cast(ctx.teams.value[2]) }; + + hip_dim_t blockSize{ static_cast(ctx.threads.value[0]), + static_cast(ctx.threads.value[1]), + static_cast(ctx.threads.value[2]) }; + + // Only launch kernel if we have something to iterate over + constexpr hip_dim_member_t zero = 0; + if ( gridSize.x > zero && gridSize.y > zero && 
gridSize.z > zero && + blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) { + + RAJA_FT_BEGIN; + + // + // Setup shared memory buffers + // + size_t shmem = 0; + + { + // + // Privatize the loop_body, using make_launch_body to setup reductions + // + BODY body = RAJA::hip::make_launch_body( + gridSize, blockSize, shmem, hip_res, std::forward(body_in)); + + // + // Launch the kernel + // + void *args[] = {(void*)&ctx, (void*)&body}; + RAJA::hip::launch((const void*)func, gridSize, blockSize, args, shmem, hip_res, async, ctx.kernel_name); + } + + RAJA_FT_END; + } + + return resources::EventProxy(res); + } + +}; + // HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template __launch_bounds__(num_threads, (num_threads * BLOCKS_PER_SM)/32) __global__ From e517a78dc7b696ce2d063f0c2c83609b5e0dfaff Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Wed, 5 Jan 2022 16:38:59 -0800 Subject: [PATCH 16/21] Undo HIP changes, awaiting further guidance from AMD on launch_bounds. --- include/RAJA/policy/hip/WorkGroup/Vtable.hpp | 4 +- .../RAJA/policy/hip/WorkGroup/WorkRunner.hpp | 38 ++++++------ include/RAJA/policy/hip/forall.hpp | 15 ++--- include/RAJA/policy/hip/kernel/HipKernel.hpp | 59 ++++++++----------- include/RAJA/policy/hip/policy.hpp | 36 ++++------- include/RAJA/policy/hip/scan.hpp | 13 ++-- include/RAJA/policy/hip/sort.hpp | 48 +++++++-------- include/RAJA/policy/hip/teams.hpp | 18 +++--- include/RAJA/util/resource.hpp | 10 ---- test/include/RAJA_test-forall-execpol.hpp | 3 +- test/include/RAJA_test-teams-execpol.hpp | 19 +----- test/include/RAJA_test-workgroup.hpp | 3 +- test/old-tests/unit/test-kernel.cpp | 2 +- .../algorithm/tests/test-algorithm-sort.hpp | 3 +- .../tests/test-algorithm-stable-sort.hpp | 3 +- 15 files changed, 104 insertions(+), 170 deletions(-) diff --git a/include/RAJA/policy/hip/WorkGroup/Vtable.hpp b/include/RAJA/policy/hip/WorkGroup/Vtable.hpp index f2369b23c9..e4ce5212bf 100644 --- a/include/RAJA/policy/hip/WorkGroup/Vtable.hpp +++ b/include/RAJA/policy/hip/WorkGroup/Vtable.hpp @@ -97,8 +97,8 @@ inline typename Vtable_T::call_sig get_cached_Vtable_hip_device_call() * Populate and return a Vtable object where the * call operator is a device function */ -template < typename T, typename Vtable_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async > -inline const Vtable_T* get_Vtable(hip_work_explicit const&) +template < typename T, typename Vtable_T, size_t BLOCK_SIZE, bool Async > +inline const Vtable_T* get_Vtable(hip_work const&) { static Vtable_T vtable{ &Vtable_T::template move_construct_destroy, diff --git a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp index 2b0ceda8e8..29571812c7 100644 --- a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp @@ -36,27 +36,27 @@ namespace detail * Runs work in a storage container in order * and returns any per run resources */ -template struct WorkRunner< - RAJA::hip_work_explicit, + RAJA::hip_work, RAJA::ordered, ALLOCATOR_T, INDEX_T, Args...> : WorkRunnerForallOrdered< - RAJA::hip_exec_explicit_async, - RAJA::hip_work_explicit, + RAJA::hip_exec_async, + RAJA::hip_work, RAJA::ordered, ALLOCATOR_T, INDEX_T, Args...> { using base = WorkRunnerForallOrdered< - RAJA::hip_exec_explicit_async, - RAJA::hip_work_explicit, + RAJA::hip_exec_async, + RAJA::hip_work, RAJA::ordered, ALLOCATOR_T, INDEX_T, @@ -91,27 +91,27 @@ struct WorkRunner< * Runs work in a storage container in reverse order * and 
returns any per run resources */ -template struct WorkRunner< - RAJA::hip_work_explicit, + RAJA::hip_work, RAJA::reverse_ordered, ALLOCATOR_T, INDEX_T, Args...> : WorkRunnerForallReverse< - RAJA::hip_exec_explicit_async, - RAJA::hip_work_explicit, + RAJA::hip_exec_async, + RAJA::hip_work, RAJA::reverse_ordered, ALLOCATOR_T, INDEX_T, Args...> { using base = WorkRunnerForallReverse< - RAJA::hip_exec_explicit_async, - RAJA::hip_work_explicit, + RAJA::hip_exec_async, + RAJA::hip_work, RAJA::reverse_ordered, ALLOCATOR_T, INDEX_T, @@ -178,14 +178,12 @@ struct HoldHipDeviceXThreadblockLoop LoopBody m_body; }; -// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template < size_t BLOCK_SIZE, - size_t BLOCKS_PER_SM, typename StorageIter, typename value_type, typename index_type, typename ... Args > -__launch_bounds__(BLOCK_SIZE, (BLOCK_SIZE * BLOCKS_PER_SM)/32) __global__ +__launch_bounds__(BLOCK_SIZE, 1) __global__ void hip_unordered_y_block_global(StorageIter iter, Args... args) { const index_type i_loop = blockIdx.y; @@ -201,24 +199,24 @@ __launch_bounds__(BLOCK_SIZE, (BLOCK_SIZE * BLOCKS_PER_SM)/32) __global__ * the x direction, with the number of threads in the x dimension determined * by the average number of iterates per loop */ -template struct WorkRunner< - RAJA::hip_work_explicit, + RAJA::hip_work, RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average, ALLOCATOR_T, INDEX_T, Args...> { - using exec_policy = RAJA::hip_work_explicit; + using exec_policy = RAJA::hip_work; using order_policy = RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average; using Allocator = ALLOCATOR_T; using index_type = INDEX_T; using resource_type = resources::Hip; - using vtable_type = Vtable, Args...>; + using vtable_type = Vtable, Args...>; WorkRunner() = default; @@ -294,7 +292,7 @@ struct WorkRunner< per_run_storage run_storage{}; - auto func = hip_unordered_y_block_global; + auto func = hip_unordered_y_block_global; // // Compute the requested iteration space size diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index db5226524b..99e1b8a63a 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -127,13 +127,11 @@ __device__ __forceinline__ unsigned int getGlobalNumThreads_3D_3D() * ****************************************************************************** */ -// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT template -__launch_bounds__(BlockSize, (BlockSize * BlocksPerSM)/32) __global__ +__launch_bounds__(BlockSize, 1) __global__ void forall_hip_kernel(LOOP_BODY loop_body, const Iterator idx, IndexType length) @@ -157,9 +155,9 @@ __launch_bounds__(BlockSize, (BlockSize * BlocksPerSM)/32) __global__ //////////////////////////////////////////////////////////////////////// // -template +template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip_res, - hip_exec_explicit, + hip_exec, Iterable&& iter, LoopBody&& loop_body) { @@ -167,7 +165,7 @@ RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip using LOOP_BODY = camp::decay; using IndexType = camp::decay; - auto func = impl::forall_hip_kernel; + auto func = impl::forall_hip_kernel; // // Compute the requested iteration space size @@ -237,12 +235,11 @@ RAJA_INLINE resources::EventProxy forall_impl(resources::Hip hip */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -251,7 
+248,7 @@ forall_impl(resources::Hip r, iset.segmentCall(r, isi, detail::CallForall(), - hip_exec_explicit(), + hip_exec(), loop_body); } // iterate over segments of index set diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index d5896a5948..6acef19089 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -45,30 +45,28 @@ namespace RAJA /*! * HIP kernel launch policy where the user may specify the number of physical - * thread blocks, threads per block, and blocks per SM. + * thread blocks and threads per block. * If num_blocks is 0 and num_threads is non-zero then num_blocks is chosen at * runtime. * Num_blocks is chosen to maximize the number of blocks running concurrently. - * Blocks per SM must be chosen by the user. + * If num_threads and num_blocks are both 0 then num_threads and num_blocks are + * chosen at runtime. + * Num_threads and num_blocks are determined by the HIP occupancy calculator. + * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at + * runtime. + * Num_threads is 1024, which may not be appropriate for all kernels. */ -template -struct hip_explicit_launch {}; +template +struct hip_launch {}; /*! * HIP kernel launch policy where the user specifies the number of physical * thread blocks and threads per block. * If num_blocks is 0 then num_blocks is chosen at runtime. * Num_blocks is chosen to maximize the number of blocks running concurrently. - * If num_threads and num_blocks are both 0 then num_threads and num_blocks are - * chosen at runtime. - * Num_threads and num_blocks are determined by the HIP occupancy calculator. - * If num_threads is 0 and num_blocks is non-zero then num_threads is chosen at - * runtime. - * Num_threads is 1024, which may not be appropriate for all kernels. - * Blocks per SM defaults to 1. */ template -using hip_launch = hip_explicit_launch; +using hip_explicit_launch = hip_launch; /*! @@ -77,7 +75,7 @@ using hip_launch = hip_explicit_launch; * If num_threads is 0 then num_threads is chosen at runtime. */ template -using hip_occ_calc_launch = hip_explicit_launch; +using hip_occ_calc_launch = hip_launch; namespace statement { @@ -89,7 +87,7 @@ namespace statement */ template struct HipKernelExt - : public internal::Statement, EnclosedStmts...> { + : public internal::Statement, EnclosedStmts...> { }; @@ -138,7 +136,7 @@ using HipKernelOccAsync = */ template using HipKernelFixed = - HipKernelExt, + HipKernelExt, EnclosedStmts...>; /*! @@ -158,17 +156,7 @@ using HipKernelFixedSM = */ template using HipKernelFixedAsync = - HipKernelExt, EnclosedStmts...>; - -/*! - * A RAJA::kernel statement that launches a HIP kernel with a fixed - * number of threads (specified by num_threads) and min blocks per sm. - * The kernel launch is asynchronous. - */ -template -using HipKernelFixedSMAsync = - HipKernelExt, - EnclosedStmts...>; + HipKernelExt, EnclosedStmts...>; /*! * A RAJA::kernel statement that launches a HIP kernel with 1024 threads @@ -211,9 +199,8 @@ __global__ void HipKernelLauncher(Data data) * * This launcher is used by the HipKerelFixed policies. 
*/ -// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT -template -__launch_bounds__(BlockSize, (BlockSize * BlocksPerSM)/32) __global__ +template +__launch_bounds__(BlockSize, 1) __global__ void HipKernelLauncherFixed(Data data) { @@ -233,13 +220,13 @@ __launch_bounds__(BlockSize, (BlockSize * BlocksPerSM)/32) __global__ * The default case handles BlockSize != 0 and gets the fixed max block size * version of the kernel. */ -template +template struct HipKernelLauncherGetter { - using type = camp::decay)>; + using type = camp::decay)>; static constexpr type get() noexcept { - return internal::HipKernelLauncherFixed; + return internal::HipKernelLauncherFixed; } }; @@ -248,7 +235,7 @@ struct HipKernelLauncherGetter * block size version of the kernel. */ template -struct HipKernelLauncherGetter<0, 0, Data, executor_t> +struct HipKernelLauncherGetter<0, Data, executor_t> { using type = camp::decay)>; static constexpr type get() noexcept @@ -272,8 +259,8 @@ struct HipLaunchHelper; * The user may specify the number of threads and blocks or let one or both be * determined at runtime using the HIP occupancy calculator. */ -template -struct HipLaunchHelper,StmtList,Data,Types> +template +struct HipLaunchHelper,StmtList,Data,Types> { using Self = HipLaunchHelper; @@ -281,7 +268,7 @@ struct HipLaunchHelper; - using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, (blocks_per_sm <= 0) ? 0 : blocks_per_sm, Data, executor_t>; + using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, Data, executor_t>; inline static void recommended_blocks_threads(int shmem_size, int &recommended_blocks, int &recommended_threads) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index f0886b37bd..18364c20a2 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -70,24 +70,21 @@ namespace policy namespace hip { -template -struct hip_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< +template +struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::forall, detail::get_launch::value, RAJA::Platform::hip> { }; -namespace expt -{ -template -struct hip_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< +template +struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::region, detail::get_launch::value, RAJA::Platform::hip> { }; -} // @@ -97,8 +94,8 @@ struct hip_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_ /// /// WorkGroup execution policies /// -template -struct hip_work_explicit : public RAJA::make_policy_pattern_launch_platform_t< +template +struct hip_work : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::workgroup_exec, detail::get_launch::value, @@ -225,24 +222,15 @@ using hip_atomic = hip_atomic_explicit; } // end namespace hip } // end namespace policy -using policy::hip::hip_exec_explicit; - -template -using hip_exec_explicit_async = policy::hip::hip_exec_explicit; - -template -using hip_exec = policy::hip::hip_exec_explicit; +using policy::hip::hip_exec; template -using hip_exec_async = policy::hip::hip_exec_explicit; - -using policy::hip::hip_work_explicit; +using hip_exec_async = policy::hip::hip_exec; -template -using hip_work = policy::hip::hip_work_explicit; +using policy::hip::hip_work; template -using hip_work_async = policy::hip::hip_work_explicit; +using 
hip_work_async = policy::hip::hip_work; using policy::hip::hip_atomic; using policy::hip::hip_atomic_explicit; @@ -271,9 +259,7 @@ using policy::hip::hip_synchronize; namespace expt { - // num_threads defaults to 1, but not expected to be used in kernel launch - template - using hip_launch_t = policy::hip::expt::hip_launch_explicit_t; + using policy::hip::hip_launch_t; } /*! diff --git a/include/RAJA/policy/hip/scan.hpp b/include/RAJA/policy/hip/scan.hpp index 999837586c..85bc494abb 100644 --- a/include/RAJA/policy/hip/scan.hpp +++ b/include/RAJA/policy/hip/scan.hpp @@ -47,12 +47,12 @@ namespace scan \brief explicit inclusive inplace scan given range, function, and initial value */ -template +template RAJA_INLINE resources::EventProxy inclusive_inplace( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, InputIter begin, InputIter end, Function binary_op) @@ -116,7 +116,6 @@ inclusive_inplace( initial value */ template exclusive_inplace( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, InputIter begin, InputIter end, Function binary_op, @@ -193,7 +192,6 @@ exclusive_inplace( initial value */ template inclusive( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, InputIter begin, InputIter end, OutputIter out, @@ -266,7 +264,6 @@ inclusive( initial value */ template exclusive( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp index 13b1151ca7..9090721ff5 100644 --- a/include/RAJA/policy/hip/sort.hpp +++ b/include/RAJA/policy/hip/sort.hpp @@ -73,7 +73,7 @@ namespace detail /*! \brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -83,7 +83,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, Iter, Iter, Compare) @@ -102,13 +102,13 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, Iter begin, Iter end, operators::less>) @@ -190,13 +190,13 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, Iter begin, Iter end, operators::greater>) @@ -279,7 +279,7 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -289,7 +289,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, Iter, Iter, Compare) @@ -308,13 +308,13 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - hip_exec_explicit p, + hip_exec p, Iter begin, Iter end, operators::less> comp) @@ -325,13 +325,13 @@ unstable( /*! \brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - hip_exec_explicit p, + hip_exec p, Iter begin, Iter end, operators::greater> comp) @@ -343,7 +343,7 @@ unstable( /*! 
\brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, KeyIter, KeyIter, ValIter, @@ -379,7 +379,7 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -387,7 +387,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -483,7 +483,7 @@ stable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -491,7 +491,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -588,7 +588,7 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Hip hip_res, - hip_exec_explicit, + hip_exec, KeyIter, KeyIter, ValIter, @@ -624,7 +624,7 @@ unstable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -632,7 +632,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - hip_exec_explicit p, + hip_exec p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -644,7 +644,7 @@ unstable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -652,7 +652,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - hip_exec_explicit p, + hip_exec p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/policy/hip/teams.hpp b/include/RAJA/policy/hip/teams.hpp index d6e8ea28f1..f3dd62d560 100644 --- a/include/RAJA/policy/hip/teams.hpp +++ b/include/RAJA/policy/hip/teams.hpp @@ -32,7 +32,7 @@ namespace expt { template -__global__ static void launch_global_fcn(LaunchContext ctx, BODY body_in) +__global__ void launch_global_fcn(LaunchContext ctx, BODY body_in) { using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -41,8 +41,7 @@ __global__ static void launch_global_fcn(LaunchContext ctx, BODY body_in) } template -struct LaunchExecute> { -// hip_launch_t num_threads set to 1, but not used in launch of kernel +struct LaunchExecute> { template static void exec(LaunchContext const &ctx, BODY_IN &&body_in) @@ -152,9 +151,8 @@ struct LaunchExecute> { }; -// HIP BLOCKS_PER_SM calculation is actually MIN_WARPS_PER_EXECUTION_UNIT -template -__launch_bounds__(num_threads, (num_threads * BLOCKS_PER_SM)/32) __global__ +template +__launch_bounds__(num_threads, 1) __global__ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) { using RAJA::internal::thread_privatize; @@ -163,15 +161,15 @@ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in) body(ctx); } -template -struct LaunchExecute> { +template +struct LaunchExecute> { template static void exec(LaunchContext const &ctx, BODY_IN &&body_in) { using BODY = camp::decay; - auto func = launch_global_fcn_fixed; + auto func = launch_global_fcn_fixed; resources::Hip hip_res = resources::Hip::get_default(); @@ -224,7 +222,7 @@ struct LaunchExecute; - auto func = 
launch_global_fcn_fixed; + auto func = launch_global_fcn; resources::Hip hip_res = res.get(); diff --git a/include/RAJA/util/resource.hpp b/include/RAJA/util/resource.hpp index 7fca8332bd..40d8f973ac 100644 --- a/include/RAJA/util/resource.hpp +++ b/include/RAJA/util/resource.hpp @@ -99,16 +99,6 @@ namespace RAJA struct get_resource>>{ using type = camp::resources::Hip; }; - - template - struct get_resource>{ - using type = camp::resources::Hip; - }; - - template - struct get_resource>>{ - using type = camp::resources::Hip; - }; #endif #if defined(RAJA_ENABLE_SYCL) diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index 27d4624049..dfc42dd746 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -134,8 +134,7 @@ using CudaForallAtomicExecPols = CudaForallExecPols; #if defined(RAJA_ENABLE_HIP) using HipForallExecPols = camp::list< RAJA::hip_exec<128>, - RAJA::hip_exec<256>, - RAJA::hip_exec_explicit<256,2> >; + RAJA::hip_exec<256> >; using HipForallReduceExecPols = HipForallExecPols; diff --git a/test/include/RAJA_test-teams-execpol.hpp b/test/include/RAJA_test-teams-execpol.hpp index 0311191790..37f163d09d 100644 --- a/test/include/RAJA_test-teams-execpol.hpp +++ b/test/include/RAJA_test-teams-execpol.hpp @@ -38,14 +38,8 @@ using seq_hip_policies = camp::list< RAJA::expt::LoopPolicy, RAJA::expt::LoopPolicy>; -using seq_hip_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, - RAJA::expt::LoopPolicy, - RAJA::expt::LoopPolicy>; - using Sequential_launch_policies = camp::list< - seq_hip_policies, - seq_hip_explicit_policies + seq_hip_policies >; #else using Sequential_launch_policies = camp::list< @@ -85,15 +79,8 @@ using omp_hip_policies = camp::list< RAJA::expt::LoopPolicy >; -using omp_hip_explicit_policies = camp::list< - RAJA::expt::LaunchPolicy>, - RAJA::expt::LoopPolicy, - RAJA::expt::LoopPolicy - >; - using OpenMP_launch_policies = camp::list< - omp_hip_policies, - omp_hip_explicit_policies + omp_hip_policies >; #else using OpenMP_launch_policies = camp::list< @@ -119,10 +106,8 @@ using Cuda_launch_policies = camp::list< #if defined(RAJA_ENABLE_HIP) using Hip_launch_policies = camp::list< seq_hip_policies - , seq_hip_explicit_policies #if defined(RAJA_ENABLE_OPENMP) , omp_hip_policies - , omp_hip_explicit_policies #endif >; #endif // RAJA_ENABLE_HIP diff --git a/test/include/RAJA_test-workgroup.hpp b/test/include/RAJA_test-workgroup.hpp index e2f44039ee..43be1b98e7 100644 --- a/test/include/RAJA_test-workgroup.hpp +++ b/test/include/RAJA_test-workgroup.hpp @@ -384,8 +384,7 @@ using HipExecPolicyList = #if defined(RAJA_TEST_EXHAUSTIVE) RAJA::hip_work<256>, #endif - RAJA::hip_work<1024>, - RAJA::hip_work_explicit<256, 2> + RAJA::hip_work<1024> >; using HipOrderedPolicyList = SequentialOrderedPolicyList; using HipOrderPolicyList = diff --git a/test/old-tests/unit/test-kernel.cpp b/test/old-tests/unit/test-kernel.cpp index 69d3de8171..6b00050aa4 100644 --- a/test/old-tests/unit/test-kernel.cpp +++ b/test/old-tests/unit/test-kernel.cpp @@ -3181,7 +3181,7 @@ GPU_TEST(Kernel_gpu, HipExec1c) // Loop Fusion using Pol = KernelPolicy< - HipKernelExt, + HipKernelExt, statement::Tile<2, tile_fixed<2>, hip_block_z_loop, For<0, hip_block_x_loop, For<1, hip_block_y_loop, diff --git a/test/unit/algorithm/tests/test-algorithm-sort.hpp b/test/unit/algorithm/tests/test-algorithm-sort.hpp index c0f584b80a..b0eb6d391b 100644 --- a/test/unit/algorithm/tests/test-algorithm-sort.hpp +++ 
b/test/unit/algorithm/tests/test-algorithm-sort.hpp @@ -127,8 +127,7 @@ using CudaSortSorters = using HipSortSorters = camp::list< PolicySort>, - PolicySortPairs>, - PolicySort> + PolicySortPairs> >; #endif diff --git a/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp b/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp index 1001d12970..2ec301faaf 100644 --- a/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp +++ b/test/unit/algorithm/tests/test-algorithm-stable-sort.hpp @@ -127,8 +127,7 @@ using CudaStableSortSorters = using HipStableSortSorters = camp::list< PolicyStableSort>, - PolicyStableSortPairs>, - PolicyStableSort> + PolicyStableSortPairs> >; #endif From cfa84ae19c4fbdd9e30ab91788f142e9a2974f05 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Wed, 5 Jan 2022 16:45:16 -0800 Subject: [PATCH 17/21] More HIP undos, and formatting. --- include/RAJA/policy/hip/kernel/HipKernel.hpp | 12 +----------- include/RAJA/policy/hip/policy.hpp | 8 ++++---- include/RAJA/policy/hip/teams.hpp | 4 ++-- test/include/RAJA_test-forall-execpol.hpp | 2 +- 4 files changed, 8 insertions(+), 18 deletions(-) diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 6acef19089..3a905ca723 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -75,7 +75,7 @@ using hip_explicit_launch = hip_launch; * If num_threads is 0 then num_threads is chosen at runtime. */ template -using hip_occ_calc_launch = hip_launch; +using hip_occ_calc_launch = hip_launch; namespace statement { @@ -139,16 +139,6 @@ using HipKernelFixed = HipKernelExt, EnclosedStmts...>; -/*! - * A RAJA::kernel statement that launches a HIP kernel with a fixed - * number of threads (specified by num_threads) and min blocks per sm. - * The kernel launch is synchronous. - */ -template -using HipKernelFixedSM = - HipKernelExt, - EnclosedStmts...>; - /*! 
 * A RAJA::kernel statement that launches a HIP kernel with a fixed
 * number of threads (specified by num_threads)

diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp
index 18364c20a2..d9b8d62fff 100644
--- a/include/RAJA/policy/hip/policy.hpp
+++ b/include/RAJA/policy/hip/policy.hpp
@@ -80,10 +80,10 @@ struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t<
 
 template
 struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::region,
-                       detail::get_launch::value,
-                       RAJA::Platform::hip> {
+                         RAJA::Policy::hip,
+                         RAJA::Pattern::region,
+                         detail::get_launch::value,
+                         RAJA::Platform::hip> {
 };
 
diff --git a/include/RAJA/policy/hip/teams.hpp b/include/RAJA/policy/hip/teams.hpp
index f3dd62d560..5e5cc39d4a 100644
--- a/include/RAJA/policy/hip/teams.hpp
+++ b/include/RAJA/policy/hip/teams.hpp
@@ -162,7 +162,7 @@ static void launch_global_fcn_fixed(LaunchContext ctx, BODY body_in)
 }
 
 template
-struct LaunchExecute> {
+struct LaunchExecute> {
 
   template
   static void exec(LaunchContext const &ctx, BODY_IN &&body_in)
@@ -222,7 +222,7 @@ struct LaunchExecute> {
   {
     using BODY = camp::decay;
 
-    auto func = launch_global_fcn;
+    auto func = launch_global_fcn;
 
     resources::Hip hip_res = res.get();
 
diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp
index dfc42dd746..fd2cdc18e8 100644
--- a/test/include/RAJA_test-forall-execpol.hpp
+++ b/test/include/RAJA_test-forall-execpol.hpp
@@ -134,7 +134,7 @@ using CudaForallAtomicExecPols = CudaForallExecPols;
 #if defined(RAJA_ENABLE_HIP)
 using HipForallExecPols = camp::list<
   RAJA::hip_exec<128>,
-  RAJA::hip_exec<256> >;
+  RAJA::hip_exec<256> >;
 
 using HipForallReduceExecPols = HipForallExecPols;
 
From f263497a3044683e8628c9d96874ca5004476f6a Mon Sep 17 00:00:00 2001
From: Robert Chen
Date: Thu, 6 Jan 2022 14:36:02 -0800
Subject: [PATCH 18/21] Fix unrelated typo.

---
 docs/sphinx/user_guide/feature/policies.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst
index d66a392b3b..b24094ea09 100644
--- a/docs/sphinx/user_guide/feature/policies.rst
+++ b/docs/sphinx/user_guide/feature/policies.rst
@@ -684,7 +684,7 @@ used and the OpenMP version of the atomic operation is applied.
 
 Here is an example illustrating use of the ``auto_atomic`` policy::
 
-  RAJA::forall< RAJA::cuda_execBLOCK_SIZE> >(RAJA::RangeSegment seg(0, N),
+  RAJA::forall< RAJA::cuda_exec<BLOCK_SIZE> >(RAJA::RangeSegment seg(0, N),
     [=] RAJA_DEVICE (RAJA::Index_type i) {
 
       RAJA::atomicAdd< RAJA::auto_atomic >(&sum, 1);
 
From 96a1635247a58ffd03ff256c14b9e2f4ffe60a69 Mon Sep 17 00:00:00 2001
From: Robert Chen
Date: Thu, 6 Jan 2022 14:59:54 -0800
Subject: [PATCH 19/21] Documentation for cuda_exec_explicit.

---
 docs/sphinx/user_guide/feature/resource.rst     |  1 +
 docs/sphinx/user_guide/tutorial/add_vectors.rst | 11 +++++++++++
 examples/tut_add-vectors.cpp                    | 12 ++++++++++++
 3 files changed, 24 insertions(+)

diff --git a/docs/sphinx/user_guide/feature/resource.rst b/docs/sphinx/user_guide/feature/resource.rst
index 69842465b6..1a28720553 100644
--- a/docs/sphinx/user_guide/feature/resource.rst
+++ b/docs/sphinx/user_guide/feature/resource.rst
@@ -130,6 +130,7 @@ execution policy support.
 ======== ==============================
 Cuda     | cuda_exec
          | cuda_exec_async
+         | cuda_exec_explicit
 Hip      | hip_exec
          | hip_exec_async
 Omp*     | omp_target_parallel_for_exec

diff --git a/docs/sphinx/user_guide/tutorial/add_vectors.rst b/docs/sphinx/user_guide/tutorial/add_vectors.rst
index c615dbf4bf..73e7ed13d9 100644
--- a/docs/sphinx/user_guide/tutorial/add_vectors.rst
+++ b/docs/sphinx/user_guide/tutorial/add_vectors.rst
@@ -96,6 +96,17 @@ Note that the CUDA execution policy type accepts a template argument
 ``CUDA_BLOCK_SIZE``, which specifies that each CUDA thread block launched
 to execute the kernel will have the given number of threads in the block.
 
+For performance tuning, the ``RAJA::cuda_exec_explicit`` policy is also
+provided. This allows the user to specify the number of blocks allocated
+per streaming multiprocessor (SM) to enable additional block-level
+parallelism. Note that the third boolean argument representing asynchronous
+execution can be omitted, and is ``false`` by default:
+
+.. literalinclude:: ../../../../examples/tut_add-vectors.cpp
+   :start-after: _rajacuda_explicit_vector_add_start
+   :end-before: _rajacuda_explicit_vector_add_end
+   :language: C++
+
 Since the lambda defining the loop body will be passed to a device kernel,
 it must be decorated with the ``__device__`` attribute when it is defined.
 This can be done directly or by using the ``RAJA_DEVICE`` macro.

diff --git a/examples/tut_add-vectors.cpp b/examples/tut_add-vectors.cpp
index cbc993cb03..c64ee4ab32 100644
--- a/examples/tut_add-vectors.cpp
+++ b/examples/tut_add-vectors.cpp
@@ -168,6 +168,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   checkResult(c, N);
 //printResult(c, N);
+
+  std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n";
+
+  // _rajacuda_explicit_vector_add_start
+  RAJA::forall<RAJA::cuda_exec_explicit<CUDA_BLOCK_SIZE, 2>>(RAJA::RangeSegment(0, N),
+  [=] RAJA_DEVICE (int i) {
+    c[i] = a[i] + b[i];
+  });
+  // _rajacuda_explicit_vector_add_end
+
+  checkResult(c, N);
+//printResult(c, N);
 #endif
 
 //----------------------------------------------------------------------------//

From 878b113829cab6d276bcd695364a5ea3681c394e Mon Sep 17 00:00:00 2001
From: Robert Chen
Date: Thu, 6 Jan 2022 15:38:33 -0800
Subject: [PATCH 20/21] Clarify async in example.

---
 examples/tut_add-vectors.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/tut_add-vectors.cpp b/examples/tut_add-vectors.cpp
index c64ee4ab32..b9330fdeee 100644
--- a/examples/tut_add-vectors.cpp
+++ b/examples/tut_add-vectors.cpp
@@ -169,10 +169,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   checkResult(c, N);
 //printResult(c, N);
 
+  bool Asynchronous = false;
   std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n";
 
   // _rajacuda_explicit_vector_add_start
-  RAJA::forall<RAJA::cuda_exec_explicit<CUDA_BLOCK_SIZE, 2>>(RAJA::RangeSegment(0, N),
+  RAJA::forall<RAJA::cuda_exec_explicit<CUDA_BLOCK_SIZE, 2, Asynchronous>>(RAJA::RangeSegment(0, N),
   [=] RAJA_DEVICE (int i) {
     c[i] = a[i] + b[i];
   });

From 0090cadf7f3bf1cbe8dc3698bbc64e94a0b9a3ee Mon Sep 17 00:00:00 2001
From: Robert Chen
Date: Thu, 6 Jan 2022 17:30:03 -0800
Subject: [PATCH 21/21] Satisfy NVCC const.
--- examples/tut_add-vectors.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tut_add-vectors.cpp b/examples/tut_add-vectors.cpp index b9330fdeee..9d77468276 100644 --- a/examples/tut_add-vectors.cpp +++ b/examples/tut_add-vectors.cpp @@ -169,11 +169,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) checkResult(c, N); //printResult(c, N); - bool Asynchronous = false; + const bool Asynchronous = false; std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; // _rajacuda_explicit_vector_add_start - RAJA::forall>(RAJA::RangeSegment(0, N), + RAJA::forall>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { c[i] = a[i] + b[i]; });
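
Taken together, patches 19-21 leave the tutorial example launching the vector
addition through RAJA::cuda_exec_explicit with two blocks per SM and a const
async flag. For readers who want to try the policy without applying the
series, the following is a minimal, self-contained sketch of that final
state. Only the RAJA::forall call, the policy name, and the
"const bool Asynchronous" flag come from the patches; the file name, the
block size of 256, the problem size, the managed-memory setup, and the result
check are illustrative assumptions, not code from the series.

// add_vectors_explicit.cpp -- illustrative sketch; scaffolding is assumed,
// not taken from the patch series.
#include "RAJA/RAJA.hpp"
#include <cuda_runtime.h>
#include <iostream>

int main()
{
  constexpr int CUDA_BLOCK_SIZE = 256;  // illustrative block size
  constexpr int N = 1 << 20;            // illustrative problem size

  // Managed memory keeps the sketch short; the real tutorial uses its own
  // memory-manager helpers.
  int *a, *b, *c;
  cudaMallocManaged(&a, N * sizeof(int));
  cudaMallocManaged(&b, N * sizeof(int));
  cudaMallocManaged(&c, N * sizeof(int));
  for (int i = 0; i < N; ++i) { a[i] = -i; b[i] = 2 * i; }

  // Patch 21: NVCC requires the flag used as a template argument to be const.
  const bool Asynchronous = false;

  // cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>: 256-thread blocks,
  // 2 blocks per SM; the third argument may be omitted and defaults to false.
  RAJA::forall<RAJA::cuda_exec_explicit<CUDA_BLOCK_SIZE, 2, Asynchronous>>(
      RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) {
        c[i] = a[i] + b[i];
      });

  // The policy is synchronous here (Asynchronous == false), so the result
  // is ready to check on the host as soon as forall returns.
  bool correct = true;
  for (int i = 0; i < N; ++i) { correct = correct && (c[i] == i); }
  std::cout << (correct ? "PASS" : "FAIL") << std::endl;

  cudaFree(a); cudaFree(b); cudaFree(c);
  return correct ? 0 : 1;
}

This assumes a CUDA-enabled RAJA install and compilation with nvcc using
--expt-extended-lambda, which RAJA requires for device lambdas.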