[RFC] Add make_device_unique() functions to ScopedContextBase #487

Draft · wants to merge 1 commit into base: CMSSW_11_1_X_Patatrack
6 changes: 2 additions & 4 deletions CUDADataFormats/Common/interface/ProductBase.h
@@ -9,9 +9,7 @@

namespace cms {
namespace cuda {
namespace impl {
class ScopedContextBase;
}
class ScopedContextBase;

/**
* Base class for all instantiations of CUDA<T> to hold the
@@ -59,7 +57,7 @@ namespace cms {
: stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {}

private:
friend class impl::ScopedContextBase;
friend class ScopedContextBase;
friend class ScopedContextProduce;

// The following function is intended to be used only from ScopedContext
36 changes: 3 additions & 33 deletions HeterogeneousCore/CUDACore/interface/ScopedContext.h
@@ -10,6 +10,7 @@
#include "FWCore/Utilities/interface/EDPutToken.h"
#include "FWCore/Utilities/interface/StreamID.h"
#include "HeterogeneousCore/CUDACore/interface/ContextState.h"
#include "HeterogeneousCore/CUDACore/interface/ScopedContextBase.h"
#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
@@ -20,39 +21,8 @@ namespace cms {
}

namespace cuda {

namespace impl {
// This class is intended to be derived by other ScopedContext*, not for general use
class ScopedContextBase {
public:
int device() const { return currentDevice_; }

// cudaStream_t is a pointer to a thread-safe object, for which a
// mutable access is needed even if the ScopedContext itself
// would be const. Therefore it is ok to return a non-const
// pointer from a const method here.
cudaStream_t stream() const { return stream_.get(); }
const SharedStreamPtr& streamPtr() const { return stream_; }

protected:
// The constructors set the current device, but the device
// is not set back to the previous value at the destructor. This
// should be sufficient (and tiny bit faster) as all CUDA API
// functions relying on the current device should be called from
// the scope where this context is. The current device doesn't
// really matter between modules (or across TBB tasks).
explicit ScopedContextBase(edm::StreamID streamID);

explicit ScopedContextBase(const ProductBase& data);

explicit ScopedContextBase(int device, SharedStreamPtr stream);

private:
int currentDevice_;
SharedStreamPtr stream_;
};

class ScopedContextGetterBase : public ScopedContextBase {
class ScopedContextGetterBase : public cms::cuda::ScopedContextBase {
public:
template <typename T>
const T& get(const Product<T>& data) {
@@ -189,7 +159,7 @@ namespace cms {
* - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
* and enforce that those get done in a proper way in RAII fashion.
*/
class ScopedContextTask : public impl::ScopedContextBase {
class ScopedContextTask : public ScopedContextBase {
public:
/// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
62 changes: 62 additions & 0 deletions HeterogeneousCore/CUDACore/interface/ScopedContextBase.h
@@ -0,0 +1,62 @@
#ifndef HeterogeneousCore_CUDACore_ScopedContextBase_h
#define HeterogeneousCore_CUDACore_ScopedContextBase_h

#include "FWCore/Utilities/interface/StreamID.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"

namespace cms {
namespace cuda {
class ProductBase;

class ScopedContextBase {
[Author comment] If this (base) class gets exposed to users, I'm tempted to rename it to just ScopedContext. (Even if I would then have to figure out what to do with the current ScopedContext.h. One option is to merge the headers back together and make it C++14-compatible.)

public:
ScopedContextBase(ScopedContextBase const&) = delete;
ScopedContextBase& operator=(ScopedContextBase const&) = delete;
ScopedContextBase(ScopedContextBase&&) = delete;
ScopedContextBase& operator=(ScopedContextBase&&) = delete;

int device() const { return currentDevice_; }

// cudaStream_t is a pointer to a thread-safe object, for which a
// mutable access is needed even if the ScopedContext itself
// would be const. Therefore it is ok to return a non-const
// pointer from a const method here.
cudaStream_t stream() const { return stream_.get(); }
const SharedStreamPtr& streamPtr() const { return stream_; }

template <typename T>
typename cms::cuda::device::impl::make_device_unique_selector<T>::non_array make_device_unique() {
return cms::cuda::make_device_unique<T>(stream());
}

template <typename T>
typename cms::cuda::device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(size_t n) {
return cms::cuda::make_device_unique<T>(n, stream());
}

template <typename T, typename... Args>
typename cms::cuda::device::impl::make_device_unique_selector<T>::bounded_array make_device_unique(Args&&...) =
delete;

protected:
// The constructors set the current device, but the device
// is not set back to the previous value at the destructor. This
// should be sufficient (and tiny bit faster) as all CUDA API
// functions relying on the current device should be called from
// the scope where this context is. The current device doesn't
// really matter between modules (or across TBB tasks).
explicit ScopedContextBase(edm::StreamID streamID);

explicit ScopedContextBase(const ProductBase& data);

explicit ScopedContextBase(int device, SharedStreamPtr stream);

private:
int currentDevice_;
SharedStreamPtr stream_;
};
} // namespace cuda
} // namespace cms

#endif
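
As a quick illustration of the new interface, below is a minimal usage sketch (not part of this PR; ExampleGPUAlgo, run() and kSize are hypothetical names). Code that takes the context by reference to the common ScopedContextBase can allocate device memory directly from the context instead of passing a cudaStream_t around and calling cms::cuda::make_device_unique() with an explicit stream, which is the same pattern the TestCUDAProducerGPUKernel changes below adopt:

#include <cstddef>

#include "HeterogeneousCore/CUDACore/interface/ScopedContextBase.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

// Hypothetical algorithm class; only the allocation pattern matters here.
class ExampleGPUAlgo {
public:
  // Taking the context by reference to the base class means the same code can
  // be called with a ScopedContextAcquire, ScopedContextProduce or
  // ScopedContextTask.
  cms::cuda::device::unique_ptr<float[]> run(cms::cuda::ScopedContextBase& ctx) const {
    constexpr std::size_t kSize = 1024;

    // Before this PR: cms::cuda::make_device_unique<float[]>(kSize, ctx.stream());
    auto d_buffer = ctx.make_device_unique<float[]>(kSize);

    // ... launch kernels on ctx.stream() that fill d_buffer ...

    return d_buffer;  // ownership of the device allocation moves to the caller
  }
};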
24 changes: 0 additions & 24 deletions HeterogeneousCore/CUDACore/src/ScopedContext.cc
@@ -3,11 +3,8 @@
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "FWCore/Utilities/interface/Exception.h"
#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

#include "chooseDevice.h"

namespace {
struct CallbackData {
edm::WaitingTaskWithArenaHolder holder;
@@ -38,27 +35,6 @@ namespace {

namespace cms::cuda {
namespace impl {
ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) {
cudaCheck(cudaSetDevice(currentDevice_));
stream_ = getStreamCache().get();
}

ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) {
cudaCheck(cudaSetDevice(currentDevice_));
if (data.mayReuseStream()) {
stream_ = data.streamPtr();
} else {
stream_ = getStreamCache().get();
}
}

ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream)
: currentDevice_(device), stream_(std::move(stream)) {
cudaCheck(cudaSetDevice(currentDevice_));
}

////////////////////

void ScopedContextGetterBase::synchronizeStreams(int dataDevice,
cudaStream_t dataStream,
bool available,
27 changes: 27 additions & 0 deletions HeterogeneousCore/CUDACore/src/ScopedContextBase.cc
@@ -0,0 +1,27 @@
#include "CUDADataFormats/Common/interface/ProductBase.h"
#include "HeterogeneousCore/CUDACore/interface/ScopedContextBase.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"

#include "chooseDevice.h"

namespace cms::cuda {
ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) {
cudaCheck(cudaSetDevice(currentDevice_));
stream_ = getStreamCache().get();
}

ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) {
cudaCheck(cudaSetDevice(currentDevice_));
if (data.mayReuseStream()) {
stream_ = data.streamPtr();
} else {
stream_ = getStreamCache().get();
}
}

ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream)
: currentDevice_(device), stream_(std::move(stream)) {
cudaCheck(cudaSetDevice(currentDevice_));
}
} // namespace cms::cuda
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc
@@ -49,7 +49,7 @@ void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, ed
cms::cuda::ScopedContextProduce ctx{in};
cms::cudatest::Thing const& input = ctx.get(in);

ctx.emplace(iEvent, dstToken_, cms::cudatest::Thing{gpuAlgo_.runAlgo(label_, input.get(), ctx.stream())});
ctx.emplace(iEvent, dstToken_, cms::cudatest::Thing{gpuAlgo_.runAlgo(label_, input.get(), ctx)});

edm::LogVerbatim("TestCUDAProducerGPU")
<< label_ << " TestCUDAProducerGPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID();
@@ -69,7 +69,7 @@ void TestCUDAProducerGPUEW::acquire(edm::Event const& iEvent,
cms::cuda::ScopedContextAcquire ctx{in, std::move(waitingTaskHolder), ctxState_};
cms::cudatest::Thing const& input = ctx.get(in);

devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream());
devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx);
// Mimick the need to transfer some of the GPU data back to CPU to
// be used for something within this module, or to be put in the
// event.
@@ -79,7 +79,7 @@ void TestCUDAProducerGPUEWTask::acquire(edm::Event const& iEvent,

cms::cudatest::Thing const& input = ctx.get(in);

devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream());
devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx);
// Mimick the need to transfer some of the GPU data back to CPU to
// be used for something within this module, or to be put in the
// event.
@@ -107,7 +107,7 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID,

ctx.pushNextTask(
[eventID, streamID, this](cms::cuda::ScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); });
gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx.stream());
gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx);
edm::LogVerbatim("TestCUDAProducerGPUEWTask")
<< label_ << " TestCUDAProducerGPUEWTask::addSimpleWork end event " << eventID << " stream " << streamID;
} else {
@@ -46,7 +46,7 @@ void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID,

cms::cuda::ScopedContextProduce ctx{streamID};

cms::cuda::device::unique_ptr<float[]> output = gpuAlgo_.runAlgo(label_, ctx.stream());
cms::cuda::device::unique_ptr<float[]> output = gpuAlgo_.runAlgo(label_, ctx);
ctx.emplace(iEvent, dstToken_, std::move(output));

edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce end event "
19 changes: 10 additions & 9 deletions HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
@@ -64,7 +64,8 @@ namespace {

cms::cuda::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const std::string &label,
const float *d_input,
cudaStream_t stream) const {
cms::cuda::ScopedContextBase &ctx) const {
auto stream = ctx.stream();
// First make the sanity check
if (d_input != nullptr) {
auto h_check = std::make_unique<float[]>(NUM_VALUES);
@@ -86,24 +87,24 @@ cms::cuda::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
h_b[i] = i * i;
}

auto d_a = cms::cuda::make_device_unique<float[]>(NUM_VALUES, stream);
auto d_b = cms::cuda::make_device_unique<float[]>(NUM_VALUES, stream);
auto d_a = ctx.make_device_unique<float[]>(NUM_VALUES);
auto d_b = ctx.make_device_unique<float[]>(NUM_VALUES);

cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));
cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));

int threadsPerBlock{32};
int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;

auto d_c = cms::cuda::make_device_unique<float[]>(NUM_VALUES, stream);
auto d_c = ctx.make_device_unique<float[]>(NUM_VALUES);
auto current_device = cms::cuda::currentDevice();
cms::cuda::LogVerbatim("TestHeterogeneousEDProducerGPU")
<< " " << label << " GPU launching kernels device " << current_device << " CUDA stream " << stream;
vectorAdd<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_a.get(), d_b.get(), d_c.get(), NUM_VALUES);

auto d_ma = cms::cuda::make_device_unique<float[]>(NUM_VALUES * NUM_VALUES, stream);
auto d_mb = cms::cuda::make_device_unique<float[]>(NUM_VALUES * NUM_VALUES, stream);
auto d_mc = cms::cuda::make_device_unique<float[]>(NUM_VALUES * NUM_VALUES, stream);
auto d_ma = ctx.make_device_unique<float[]>(NUM_VALUES * NUM_VALUES);
auto d_mb = ctx.make_device_unique<float[]>(NUM_VALUES * NUM_VALUES);
auto d_mc = ctx.make_device_unique<float[]>(NUM_VALUES * NUM_VALUES);
dim3 threadsPerBlock3{NUM_VALUES, NUM_VALUES};
dim3 blocksPerGrid3{1, 1};
if (NUM_VALUES * NUM_VALUES > 32) {
@@ -124,8 +125,8 @@ cms::cuda::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
return d_a;
}

void TestCUDAProducerGPUKernel::runSimpleAlgo(float *d_data, cudaStream_t stream) const {
void TestCUDAProducerGPUKernel::runSimpleAlgo(float *d_data, cms::cuda::ScopedContextBase &ctx) const {
int threadsPerBlock{32};
int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
vectorAddConstant<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_data, 1.0f, NUM_VALUES);
vectorAddConstant<<<blocksPerGrid, threadsPerBlock, 0, ctx.stream()>>>(d_data, 1.0f, NUM_VALUES);
}
@@ -5,6 +5,7 @@

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDACore/interface/ScopedContextBase.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

/**
@@ -24,14 +25,14 @@
~TestCUDAProducerGPUKernel() = default;

// returns (owning) pointer to device memory
cms::cuda::device::unique_ptr<float[]> runAlgo(const std::string& label, cudaStream_t stream) const {
return runAlgo(label, nullptr, stream);
cms::cuda::device::unique_ptr<float[]> runAlgo(const std::string& label, cms::cuda::ScopedContextBase& ctx) const {
return runAlgo(label, nullptr, ctx);
}
cms::cuda::device::unique_ptr<float[]> runAlgo(const std::string& label,
const float* d_input,
cudaStream_t stream) const;
cms::cuda::ScopedContextBase& ctx) const;

void runSimpleAlgo(float* d_data, cudaStream_t stream) const;
void runSimpleAlgo(float* d_data, cms::cuda::ScopedContextBase& ctx) const;
};

#endif