[RFC] New version of CudaCompat #428

Status: Open. Wants to merge 24 commits into base: CMSSW_11_0_X_Patatrack. Changes shown from 23 commits.
50 changes: 35 additions & 15 deletions CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -5,6 +5,7 @@

#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

@@ -21,15 +22,15 @@ class HeterogeneousSoA {

explicit HeterogeneousSoA(cudautils::device::unique_ptr<T> &&p) : dm_ptr(std::move(p)) {}
explicit HeterogeneousSoA(cudautils::host::unique_ptr<T> &&p) : hm_ptr(std::move(p)) {}
explicit HeterogeneousSoA(std::unique_ptr<T> &&p) : std_ptr(std::move(p)) {}
explicit HeterogeneousSoA(cudautils::cpu::unique_ptr<T> &&p) : cm_ptr(std::move(p)) {}

auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); }

auto const &operator*() const { return *get(); }

auto const *operator-> () const { return get(); }

auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); }

auto &operator*() { return *get(); }

@@ -47,12 +48,15 @@ class HeterogeneousSoA {
// a union won't do it, a variant will not be more efficient
cudautils::device::unique_ptr<T> dm_ptr; //!
cudautils::host::unique_ptr<T> hm_ptr; //!
std::unique_ptr<T> std_ptr; //!
cudautils::cpu::unique_ptr<T> cm_ptr; //!
};

namespace cudaCompat {

struct GPUTraits {
static constexpr const char * name = "GPU";
static constexpr bool runOnDevice = true;

template <typename T>
using unique_ptr = cudautils::device::unique_ptr<T>;

@@ -83,6 +87,9 @@ namespace cudaCompat {
};

struct HostTraits {
static constexpr const char * name = "HOST";
static constexpr bool runOnDevice = false;

template <typename T>
using unique_ptr = cudautils::host::unique_ptr<T>;

@@ -108,32 +115,45 @@ namespace cudaCompat {
};

struct CPUTraits {
static constexpr const char * name = "CPU";
static constexpr bool runOnDevice = false;

template <typename T>
using unique_ptr = cudautils::cpu::unique_ptr<T>;

template <typename T>
static auto make_unique() {
return cudautils::make_cpu_unique<T>(cudaStreamDefault);
}

template <typename T>
using unique_ptr = std::unique_ptr<T>;
static auto make_unique(size_t size) {
return cudautils::make_cpu_unique<T>(size, cudaStreamDefault);
}

template <typename T>
static auto make_unique(cudaStream_t) {
return std::make_unique<T>();
static auto make_unique(cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(stream);
}

template <typename T>
static auto make_unique(size_t size, cudaStream_t) {
return std::make_unique<T>(size);
static auto make_unique(size_t size, cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(size, stream);
}

template <typename T>
static auto make_host_unique(cudaStream_t) {
return std::make_unique<T>();
static auto make_host_unique(cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(cudaStream_t) {
return std::make_unique<T>();
static auto make_device_unique(cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(size_t size, cudaStream_t) {
return std::make_unique<T>(size);
static auto make_device_unique(size_t size, cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(size, stream);
}
};
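As an illustration of how these traits are meant to be consumed (an editor's sketch, not part of the diff: the helper allocateAndTag is hypothetical, and it assumes the headers on this branch together with the two-argument make_unique(size, stream) overload shown above):

// Sketch only: exercising the Traits members added above.
#include <cstddef>
#include <cstdio>
#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"

template <typename Traits>
void allocateAndTag(size_t n, cudaStream_t stream) {
  // name and runOnDevice are the static members introduced in this PR.
  std::printf("allocating %zu floats with %s traits (runOnDevice=%d)\n",
              n, Traits::name, int(Traits::runOnDevice));
  // GPUTraits, HostTraits and CPUTraits expose the same make_unique interface,
  // so callers stay agnostic about device, pinned-host or plain CPU memory.
  auto buffer = Traits::template make_unique<float[]>(n, stream);
  if constexpr (!Traits::runOnDevice) {
    buffer[0] = 42.f;  // host-accessible memory for HostTraits / CPUTraits
  }
}

int main() {
  // CPUTraits needs no CUDA device; GPUTraits and HostTraits would need one.
  allocateAndTag<cudaCompat::CPUTraits>(1024, cudaStreamDefault);
}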

7 changes: 4 additions & 3 deletions CUDADataFormats/Common/interface/HostProduct.h
@@ -2,6 +2,7 @@
#define CUDADataFormatsCommonHostProduct_H

#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"

// a heterogeneous unique pointer...
template <typename T>
@@ -13,17 +14,17 @@ class HostProduct {
HostProduct& operator=(HostProduct&&) = default;

explicit HostProduct(cudautils::host::unique_ptr<T>&& p) : hm_ptr(std::move(p)) {}
explicit HostProduct(std::unique_ptr<T>&& p) : std_ptr(std::move(p)) {}
explicit HostProduct(cudautils::cpu::unique_ptr<T>&& p) : cm_ptr(std::move(p)) {}

auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); }
auto const* get() const { return hm_ptr ? hm_ptr.get() : cm_ptr.get(); }

auto const& operator*() const { return *get(); }

auto const* operator-> () const { return get(); }

private:
cudautils::host::unique_ptr<T> hm_ptr; //!
std::unique_ptr<T> std_ptr; //!
cudautils::cpu::unique_ptr<T> cm_ptr; //!
};

#endif
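A usage sketch for the updated class (editor's illustration, not in the PR; wrapBuffers is a made-up helper, and it assumes make_host_unique and make_cpu_unique take a size plus a stream, as in the headers on this branch):

// Sketch only: HostProduct can now wrap either flavour of host-side memory.
#include <cstdint>
#include "CUDADataFormats/Common/interface/HostProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

void wrapBuffers(cudaStream_t stream) {
  // Pinned host memory, e.g. filled by a copy back from the GPU.
  HostProduct<uint32_t[]> fromPinned(cudautils::make_host_unique<uint32_t[]>(10, stream));
  // Plain malloc'ed memory, e.g. when the whole chain ran on the CPU.
  HostProduct<uint32_t[]> fromCpu(cudautils::make_cpu_unique<uint32_t[]>(10, stream));
  // Both expose the same const access through get() / operator* / operator->.
}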
1 change: 0 additions & 1 deletion CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h
@@ -2,7 +2,6 @@
#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"

#include <cuda_runtime.h>
1 change: 0 additions & 1 deletion CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
@@ -1,7 +1,6 @@
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

template <>
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -72,7 +72,7 @@ namespace cudautils {
inline void launchFinalize(Histo *__restrict__ h,
uint8_t *__restrict__ ws
#ifndef __CUDACC__
= cudaStreamDefault
= nullptr
Review comment: I am curious if using cudaStreamDefault was giving problems?

Author: no, to me nullptr makes more sense for a pointer (even if they are all == 0).

#endif
,
cudaStream_t stream
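For the exchange above, a small illustration (editor's sketch; launchFinalizeLike is a made-up name): cudaStreamDefault is the flag macro 0x00, an integer literal, so it only works as the default of a pointer parameter because a literal zero is a null pointer constant, while nullptr states the intent directly.

// Sketch only: defaulting a pointer parameter.
#include <cstdint>

inline void launchFinalizeLike(uint8_t* ws = nullptr) {  // the new default in this PR
  // ... allocate a scratch workspace here if ws == nullptr ...
}
// inline void launchFinalizeLike(uint8_t* ws = cudaStreamDefault) {}  // old form: compiles only because the macro expands to 0x00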
86 changes: 86 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h
@@ -0,0 +1,86 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h
#define HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h

#include <memory>
#include <functional>

#include <cstdlib>
#include <cuda_runtime.h>
Review comment: from a look at the file, I think #include <cuda_runtime.h> could be removed?


namespace cudautils {
namespace cpu {
namespace impl {
// Additional layer of types to distinguish from device:: and host::unique_ptr
class CPUDeleter {
public:
CPUDeleter() = default;

void operator()(void *ptr) {
::free(ptr);
}
};
} // namespace impl

template <typename T>
using unique_ptr = std::unique_ptr<T, impl::CPUDeleter>;

namespace impl {
template <typename T>
struct make_cpu_unique_selector {
using non_array = cudautils::cpu::unique_ptr<T>;
};
template <typename T>
struct make_cpu_unique_selector<T[]> {
using unbounded_array = cudautils::cpu::unique_ptr<T[]>;
};
template <typename T, size_t N>
struct make_cpu_unique_selector<T[N]> {
struct bounded_array {};
};
} // namespace impl
} // namespace cpu

template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::non_array make_cpu_unique(cudaStream_t) {
Review comment: trying to better understand this: calling make_cpu_unique would be roughly equivalent to C++20's std::make_unique_default_init, plus it sets the deleter to just call free() instead of calling the destructors?

static_assert(std::is_trivially_constructible<T>::value,
"Allocating with non-trivial constructor on the cpu memory is not supported");
void *mem = ::malloc(sizeof(T));
return typename cpu::impl::make_cpu_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
cpu::impl::CPUDeleter()};
}

template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array make_cpu_unique(size_t n, cudaStream_t) {
using element_type = typename std::remove_extent<T>::type;
static_assert(std::is_trivially_constructible<element_type>::value,
"Allocating with non-trivial constructor on the cpu memory is not supported");
void *mem = ::malloc(n * sizeof(element_type));
return typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem),
cpu::impl::CPUDeleter()};
}

template <typename T, typename... Args>
typename cpu::impl::make_cpu_unique_selector<T>::bounded_array make_cpu_unique(Args &&...) = delete;

// No check for the trivial constructor, make it clear in the interface
template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::non_array make_cpu_unique_uninitialized(cudaStream_t) {
void *mem = ::malloc(sizeof(T));
return typename cpu::impl::make_cpu_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
cpu::impl::CPUDeleter()};
}

template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array make_cpu_unique_uninitialized(size_t n, cudaStream_t) {
using element_type = typename std::remove_extent<T>::type;
void *mem = ::malloc(n * sizeof(element_type));
return typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem),
cpu::impl::CPUDeleter()};
}

template <typename T, typename... Args>
typename cpu::impl::make_cpu_unique_selector<T>::bounded_array make_cpu_unique_uninitialized(Args &&...) =
delete;
} // namespace cudautils

#endif
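To make the semantics raised in the review comment concrete (an editor's sketch, not part of the PR, based on a reading of the code above): the memory comes from plain malloc, is left uninitialized, and the deleter calls free() without invoking destructors, which is why the static_assert restricts the helpers to trivially constructible types.

// Sketch only: what make_cpu_unique provides.
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"

struct Hit {  // trivially constructible, so it passes the static_assert
  float x, y, z;
};

int main() {
  // The stream argument only mirrors make_device_unique / make_host_unique;
  // it is ignored for plain CPU allocations.
  auto hits = cudautils::make_cpu_unique<Hit[]>(1024, cudaStreamDefault);
  hits[0] = {1.f, 2.f, 3.f};  // contents start out uninitialized, as with malloc

  // cudautils::make_cpu_unique<std::string>(cudaStreamDefault) would not compile:
  // std::string is not trivially constructible, and CPUDeleter would call free()
  // without running ~string().
}  // free(hits.get()) runs here, with no destructor calls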
18 changes: 12 additions & 6 deletions HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
@@ -6,6 +6,10 @@
*/

#ifndef __CUDACC__
#define CUDA_KERNELS_ON_CPU
#endif

#ifdef CUDA_KERNELS_ON_CPU

#include <algorithm>
#include <cstdint>
@@ -86,18 +90,20 @@ namespace cudaCompat {
#define __forceinline__
#endif

// make sure functions are inlined to avoid multiple definitions
#ifndef __CUDA_ARCH__
using namespace cudaCompat;
#endif

#endif // CUDA_KERNELS_ON_CPU


// make sure functions are inlined to avoid multiple definitions
#ifndef __CUDACC__
#undef __global__
#define __global__ inline __attribute__((always_inline))
#undef __forceinline__
#define __forceinline__ inline __attribute__((always_inline))
#endif

#ifndef __CUDA_ARCH__
using namespace cudaCompat;
#endif

#endif

#endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h
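To show the pattern this macro targets (editor's sketch, not part of the diff; vectorScale is a made-up kernel), a grid-stride kernel written for the GPU also runs serially on the CPU through the cudaCompat stand-ins, where gridDim.x, blockDim.x and threadIdx.x collapse to a single iteration:

// Sketch only: the same source compiles with nvcc as a real __global__ kernel
// and, with CUDA_KERNELS_ON_CPU defined, as an inline host function.
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"

__global__ void vectorScale(float* x, float a, int n) {
  // On the GPU this is the usual grid-stride loop; on the CPU the cudaCompat
  // stand-ins make blockDim.x == gridDim.x == 1 and threadIdx.x == 0, so the
  // loop simply walks over all n elements in a single call.
  int first = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = gridDim.x * blockDim.x;
  for (int i = first; i < n; i += stride) {
    x[i] *= a;
  }
}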
12 changes: 10 additions & 2 deletions HeterogeneousCore/CUDAUtilities/interface/launch.h
@@ -94,10 +94,14 @@ namespace cudautils {
} // namespace detail

// wrappers for cudaLaunchKernel

inline
Review comment (conversation marked as resolved): I will add the inline because it makes sense on its own.

void launch(void (*kernel)(), LaunchParameters config) {
#ifdef CUDA_KERNELS_ON_CPU
Review comment: but I really, really do not want to add a dependency on #ifdefs etc. here.

Author: will find a less intrusive solution.

kernel();
#else
cudaCheck(cudaLaunchKernel(
(const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream));
#endif
}

template <typename F, typename... Args>
@@ -107,6 +111,9 @@ namespace cudautils {
std::enable_if_t<std::is_void<std::result_of_t<F && (Args && ...)> >::value>
#endif
launch(F* kernel, LaunchParameters config, Args&&... args) {
#ifdef CUDA_KERNELS_ON_CPU
kernel(args...);
#else
using function_type = detail::kernel_traits<F>;
typename function_type::argument_type_tuple args_copy(args...);

@@ -116,10 +123,11 @@
detail::pointer_setter<size>()(pointers, args_copy);
cudaCheck(cudaLaunchKernel(
(const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream));
#endif
}

// wrappers for cudaLaunchCooperativeKernel

inline
void launch_cooperative(void (*kernel)(), LaunchParameters config) {
cudaCheck(cudaLaunchCooperativeKernel(
(const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream));
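Usage-wise, the same call site then works in both configurations (editor's sketch, not in the PR; it reuses the hypothetical vectorScale kernel from the cudaCompat sketch above and assumes LaunchParameters can be brace-initialized as {gridDim, blockDim, sharedMem, stream}, matching the fields used in the code above):

// Sketch only: launching a kernel through cudautils::launch.
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"

__global__ void vectorScale(float* x, float a, int n);  // defined elsewhere

void runScale(float* x, float a, int n, cudaStream_t stream) {
  // With CUDA_KERNELS_ON_CPU defined, the wrapper calls vectorScale(x, a, n)
  // directly; otherwise it goes through cudaLaunchKernel with this configuration.
  cudautils::launch(vectorScale, {32, 128, 0, stream}, x, a, n);
}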
13 changes: 13 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h
@@ -3,8 +3,10 @@

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"

#include <type_traits>
#include <cstring>

namespace cudautils {
template <typename T>
@@ -15,6 +17,11 @@ namespace cudautils {
cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream));
}

template <typename T>
inline void memsetAsync(cudautils::cpu::unique_ptr<T>& ptr, T value, cudaStream_t) {
::memset(ptr.get(), value, sizeof(T));
}

/**
* The type of `value` is `int` because `cudaMemsetAsync()` takes
* it as an `int`. Note that `cudaMemsetAsync()` sets the value of
@@ -25,6 +32,12 @@
inline void memsetAsync(cudautils::device::unique_ptr<T[]>& ptr, int value, size_t nelements, cudaStream_t stream) {
cudaCheck(cudaMemsetAsync(ptr.get(), value, nelements * sizeof(T), stream));
}
template <typename T>
inline void memsetAsync(cudautils::cpu::unique_ptr<T[]>& ptr, int value, size_t nelements, cudaStream_t) {
::memset(ptr.get(), value, nelements * sizeof(T));
}


} // namespace cudautils

#endif
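A usage sketch for the overloads added here (editor's illustration; clearCounters is a hypothetical helper): the same call site clears a buffer regardless of where it lives.

// Sketch only: one call site, two memory spaces.
#include <cstddef>
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"

template <typename Ptr>
void clearCounters(Ptr& counters, size_t n, cudaStream_t stream) {
  // Resolves to cudaMemsetAsync for device::unique_ptr<T[]> and to a plain
  // ::memset for cpu::unique_ptr<T[]>; the stream is ignored in the CPU case.
  cudautils::memsetAsync(counters, 0, n, stream);
}
// e.g. clearCounters(deviceCounters, n, stream); or clearCounters(cpuCounters, n, stream);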
14 changes: 13 additions & 1 deletion HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
@@ -7,6 +7,18 @@
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
</bin>

<bin file="Launch_t.cpp Launch_t.cu" name="gpuLaunch_t">
</bin>

<bin file="Launch_t.cpp" name="cpuLaunch_t">
<flags CXXFLAGS="-DCUDA_KERNELS_ON_CPU"/>
</bin>

<bin file="Launch_t.cpp Launch_t.cu" name="mixedLaunch_t">
<flags CXXFLAGS="-DCUDA_KERNELS_ON_CPU -DLaunchInCU"/>
</bin>


<bin file="test_GPUSimpleVector.cu" name="test_GPUSimpleVector">
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
</bin>
@@ -65,7 +77,7 @@
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
</bin>

<bin file="testCatch2Main.cpp,device_unique_ptr_t.cpp,host_unique_ptr_t.cpp,host_noncached_unique_ptr_t.cpp,copyAsync_t.cpp,memsetAsync_t.cpp" name="cudaMemUtils_t">
<bin file="testCatch2Main.cpp,device_unique_ptr_t.cpp,host_unique_ptr_t.cpp,host_noncached_unique_ptr_t.cpp,cpu_unique_ptr_t.cpp,copyAsync_t.cpp,memsetAsync_t.cpp" name="cudaMemUtils_t">
<use name="catch2"/>
</bin>
