From 51f416dbada7490f44c79e439dd33e53a452355a Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 1 Dec 2020 02:22:18 +0100 Subject: [PATCH] Simplify cudacompat layer to use a 1-dimensional grid Remove the possibility of changing the grid size used by the cms::cudacompat layer, and make it a constant equal to {1, 1, 1}. This avoids a thread-related problem caused by TBB using worker threads where the grid size had not been initialised. --- .../CUDAUtilities/interface/cudaCompat.h | 48 ++++++------------- .../CUDAUtilities/src/cudaCompat.cc | 17 ------- 2 files changed, 15 insertions(+), 50 deletions(-) delete mode 100644 HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index f9b4b2f8a4c16..8bd51d3fa7959 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -11,21 +11,26 @@ #include #include +// include the CUDA runtime header to define some of the attributes, types and sybols also on the CPU #include +// make sure function are inlined to avoid multiple definition +#undef __global__ +#define __global__ inline __attribute__((always_inline)) + +#undef __forceinline__ +#define __forceinline__ inline __attribute__((always_inline)) + namespace cms { namespace cudacompat { -#ifndef __CUDA_RUNTIME_H__ - struct dim3 { - uint32_t x, y, z; - }; -#endif + // run serially with a single thread + // 1-dimensional block const dim3 threadIdx = {0, 0, 0}; const dim3 blockDim = {1, 1, 1}; - - extern thread_local dim3 blockIdx; - extern thread_local dim3 gridDim; + // 1-dimensional grid + const dim3 blockIdx = {0, 0, 0}; + const dim3 gridDim = {1, 1, 1}; template T1 atomicCAS(T1* address, T1 compare, T2 val) { @@ -78,35 +83,12 @@ namespace cms { return *x; } - inline void resetGrid() { - blockIdx = {0, 0, 0}; - gridDim = {1, 1, 1}; - } - } // namespace cudacompat } // namespace cms -// some not needed as done by cuda runtime... -#ifndef __CUDA_RUNTIME_H__ -#define __host__ -#define __device__ -#define __global__ -#define __shared__ -#define __forceinline__ -#endif - -// make sure function are inlined to avoid multiple definition -#ifndef __CUDA_ARCH__ -#undef __global__ -#define __global__ inline __attribute__((always_inline)) -#undef __forceinline__ -#define __forceinline__ inline __attribute__((always_inline)) -#endif - -#ifndef __CUDA_ARCH__ +// make the cudacompat implementation available in the global namespace using namespace cms::cudacompat; -#endif -#endif +#endif // __CUDACC__ #endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h diff --git a/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc b/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc deleted file mode 100644 index 7b8efda8e3811..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc +++ /dev/null @@ -1,17 +0,0 @@ -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" - -namespace cms { - namespace cudacompat { - thread_local dim3 blockIdx; - thread_local dim3 gridDim; - } // namespace cudacompat -} // namespace cms - -namespace { - struct InitGrid { - InitGrid() { cms::cudacompat::resetGrid(); } - }; - - const InitGrid initGrid; - -} // namespace