From 51f416dbada7490f44c79e439dd33e53a452355a Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 1 Dec 2020 02:22:18 +0100
Subject: [PATCH] Simplify cudacompat layer to use a 1-dimensional grid

Remove the possibility of changing the grid size used by the
cms::cudacompat layer, and make it a constant equal to {1, 1, 1}.

This avoids a thread-related problem caused by TBB using worker threads
where the grid size had not been initialised.
---
 .../CUDAUtilities/interface/cudaCompat.h      | 48 ++++++-------------
 .../CUDAUtilities/src/cudaCompat.cc           | 17 -------
 2 files changed, 15 insertions(+), 50 deletions(-)
 delete mode 100644 HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
index f9b4b2f8a4c16..8bd51d3fa7959 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
@@ -11,21 +11,26 @@
 #include <cstdint>
 #include <cstring>
 
+// include the CUDA runtime header to define some of the attributes, types and sybols also on the CPU
 #include <cuda_runtime.h>
 
+// make sure function are inlined to avoid multiple definition
+#undef __global__
+#define __global__ inline __attribute__((always_inline))
+
+#undef __forceinline__
+#define __forceinline__ inline __attribute__((always_inline))
+
 namespace cms {
   namespace cudacompat {
 
-#ifndef __CUDA_RUNTIME_H__
-    struct dim3 {
-      uint32_t x, y, z;
-    };
-#endif
+    // run serially with a single thread
+    // 1-dimensional block
     const dim3 threadIdx = {0, 0, 0};
     const dim3 blockDim = {1, 1, 1};
-
-    extern thread_local dim3 blockIdx;
-    extern thread_local dim3 gridDim;
+    // 1-dimensional grid
+    const dim3 blockIdx = {0, 0, 0};
+    const dim3 gridDim = {1, 1, 1};
 
     template <typename T1, typename T2>
     T1 atomicCAS(T1* address, T1 compare, T2 val) {
@@ -78,35 +83,12 @@ namespace cms {
       return *x;
     }
 
-    inline void resetGrid() {
-      blockIdx = {0, 0, 0};
-      gridDim = {1, 1, 1};
-    }
-
   }  // namespace cudacompat
 }  // namespace cms
 
-// some  not needed as done by cuda runtime...
-#ifndef __CUDA_RUNTIME_H__
-#define __host__
-#define __device__
-#define __global__
-#define __shared__
-#define __forceinline__
-#endif
-
-// make sure function are inlined to avoid multiple definition
-#ifndef __CUDA_ARCH__
-#undef __global__
-#define __global__ inline __attribute__((always_inline))
-#undef __forceinline__
-#define __forceinline__ inline __attribute__((always_inline))
-#endif
-
-#ifndef __CUDA_ARCH__
+// make the cudacompat implementation available in the global namespace
 using namespace cms::cudacompat;
-#endif
 
-#endif
+#endif  // __CUDACC__
 
 #endif  // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h
diff --git a/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc b/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc
deleted file mode 100644
index 7b8efda8e3811..0000000000000
--- a/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc
+++ /dev/null
@@ -1,17 +0,0 @@
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
-
-namespace cms {
-  namespace cudacompat {
-    thread_local dim3 blockIdx;
-    thread_local dim3 gridDim;
-  }  // namespace cudacompat
-}  // namespace cms
-
-namespace {
-  struct InitGrid {
-    InitGrid() { cms::cudacompat::resetGrid(); }
-  };
-
-  const InitGrid initGrid;
-
-}  // namespace