diff --git a/CUDADataFormats/Common/interface/ProductBase.h b/CUDADataFormats/Common/interface/ProductBase.h
index efe2242903bd0..1d5999ebca171 100644
--- a/CUDADataFormats/Common/interface/ProductBase.h
+++ b/CUDADataFormats/Common/interface/ProductBase.h
@@ -9,9 +9,7 @@
 namespace cms {
   namespace cuda {
-    namespace impl {
-      class ScopedContextBase;
-    }
+    class ScopedContextBase;
 
     /**
      * Base class for all instantiations of CUDA<T> to hold the
     * common information.
@@ -59,7 +57,7 @@ namespace cms {
         : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {}
 
   private:
-    friend class impl::ScopedContextBase;
+    friend class ScopedContextBase;
     friend class ScopedContextProduce;
 
     // The following function is intended to be used only from ScopedContext
diff --git a/HeterogeneousCore/CUDACore/interface/ScopedContext.h b/HeterogeneousCore/CUDACore/interface/ScopedContext.h
index cdc3e2dd2c620..e6a6b1fffb1c9 100644
--- a/HeterogeneousCore/CUDACore/interface/ScopedContext.h
+++ b/HeterogeneousCore/CUDACore/interface/ScopedContext.h
@@ -10,6 +10,7 @@
 #include "FWCore/Utilities/interface/EDPutToken.h"
 #include "FWCore/Utilities/interface/StreamID.h"
 #include "HeterogeneousCore/CUDACore/interface/ContextState.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContextBase.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
@@ -20,39 +21,8 @@ namespace cms {
   }
 
   namespace cuda {
-    namespace impl {
-      // This class is intended to be derived by other ScopedContext*, not for general use
-      class ScopedContextBase {
-      public:
-        int device() const { return currentDevice_; }
-
-        // cudaStream_t is a pointer to a thread-safe object, for which a
-        // mutable access is needed even if the ScopedContext itself
-        // would be const. Therefore it is ok to return a non-const
-        // pointer from a const method here.
-        cudaStream_t stream() const { return stream_.get(); }
-        const SharedStreamPtr& streamPtr() const { return stream_; }
-
-      protected:
-        // The constructors set the current device, but the device
-        // is not set back to the previous value at the destructor. This
-        // should be sufficient (and tiny bit faster) as all CUDA API
-        // functions relying on the current device should be called from
-        // the scope where this context is. The current device doesn't
-        // really matter between modules (or across TBB tasks).
-        explicit ScopedContextBase(edm::StreamID streamID);
-
-        explicit ScopedContextBase(const ProductBase& data);
-
-        explicit ScopedContextBase(int device, SharedStreamPtr stream);
-
-      private:
-        int currentDevice_;
-        SharedStreamPtr stream_;
-      };
-
-      class ScopedContextGetterBase : public ScopedContextBase {
+    class ScopedContextGetterBase : public cms::cuda::ScopedContextBase {
     public:
       template <typename T>
      const T& get(const Product<T>& data) {
@@ -189,7 +159,7 @@ namespace cms {
      * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
      * and enforce that those get done in a proper way in RAII fashion.
      */
-    class ScopedContextTask : public impl::ScopedContextBase {
+    class ScopedContextTask : public ScopedContextBase {
    public:
      /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
      explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
diff --git a/HeterogeneousCore/CUDACore/interface/ScopedContextBase.h b/HeterogeneousCore/CUDACore/interface/ScopedContextBase.h
new file mode 100644
index 0000000000000..c9ede3f5e751a
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/interface/ScopedContextBase.h
@@ -0,0 +1,62 @@
+#ifndef HeterogeneousCore_CUDACore_ScopedContextBase_h
+#define HeterogeneousCore_CUDACore_ScopedContextBase_h
+
+#include "FWCore/Utilities/interface/StreamID.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
+
+namespace cms {
+  namespace cuda {
+    class ProductBase;
+
+    class ScopedContextBase {
+    public:
+      ScopedContextBase(ScopedContextBase const&) = delete;
+      ScopedContextBase& operator=(ScopedContextBase const&) = delete;
+      ScopedContextBase(ScopedContextBase&&) = delete;
+      ScopedContextBase& operator=(ScopedContextBase&&) = delete;
+
+      int device() const { return currentDevice_; }
+
+      // cudaStream_t is a pointer to a thread-safe object, for which a
+      // mutable access is needed even if the ScopedContext itself
+      // would be const. Therefore it is ok to return a non-const
+      // pointer from a const method here.
+      cudaStream_t stream() const { return stream_.get(); }
+      const SharedStreamPtr& streamPtr() const { return stream_; }
+
+      template <typename T>
+      typename cms::cuda::device::impl::make_device_unique_selector<T>::non_array make_device_unique() {
+        return cms::cuda::make_device_unique<T>(stream());
+      }
+
+      template <typename T>
+      typename cms::cuda::device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(size_t n) {
+        return cms::cuda::make_device_unique<T>(n, stream());
+      }
+
+      template <typename T, typename... Args>
+      typename cms::cuda::device::impl::make_device_unique_selector<T>::bounded_array make_device_unique(Args&&...) =
+          delete;
+
+    protected:
+      // The constructors set the current device, but the device
+      // is not set back to the previous value at the destructor. This
+      // should be sufficient (and tiny bit faster) as all CUDA API
+      // functions relying on the current device should be called from
+      // the scope where this context is. The current device doesn't
+      // really matter between modules (or across TBB tasks).
+      explicit ScopedContextBase(edm::StreamID streamID);
+
+      explicit ScopedContextBase(const ProductBase& data);
+
+      explicit ScopedContextBase(int device, SharedStreamPtr stream);
+
+    private:
+      int currentDevice_;
+      SharedStreamPtr stream_;
+    };
+  }  // namespace cuda
+}  // namespace cms
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/src/ScopedContext.cc b/HeterogeneousCore/CUDACore/src/ScopedContext.cc
index 7461ebbee9f0d..021907c75a34d 100644
--- a/HeterogeneousCore/CUDACore/src/ScopedContext.cc
+++ b/HeterogeneousCore/CUDACore/src/ScopedContext.cc
@@ -3,11 +3,8 @@
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
 #include "FWCore/ServiceRegistry/interface/Service.h"
 #include "FWCore/Utilities/interface/Exception.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-#include "chooseDevice.h"
-
 namespace {
   struct CallbackData {
     edm::WaitingTaskWithArenaHolder holder;
@@ -38,27 +35,6 @@ namespace {
 
 namespace cms::cuda {
   namespace impl {
-    ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) {
-      cudaCheck(cudaSetDevice(currentDevice_));
-      stream_ = getStreamCache().get();
-    }
-
-    ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) {
-      cudaCheck(cudaSetDevice(currentDevice_));
-      if (data.mayReuseStream()) {
-        stream_ = data.streamPtr();
-      } else {
-        stream_ = getStreamCache().get();
-      }
-    }
-
-    ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream)
-        : currentDevice_(device), stream_(std::move(stream)) {
-      cudaCheck(cudaSetDevice(currentDevice_));
-    }
-
-    ////////////////////
-
     void ScopedContextGetterBase::synchronizeStreams(int dataDevice,
                                                      cudaStream_t dataStream,
                                                      bool available,
diff --git a/HeterogeneousCore/CUDACore/src/ScopedContextBase.cc b/HeterogeneousCore/CUDACore/src/ScopedContextBase.cc
new file mode 100644
index 0000000000000..a79689e35e06f
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/ScopedContextBase.cc
@@ -0,0 +1,27 @@
+#include "CUDADataFormats/Common/interface/ProductBase.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContextBase.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
+
+#include "chooseDevice.h"
+
+namespace cms::cuda {
+  ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) {
+    cudaCheck(cudaSetDevice(currentDevice_));
+    stream_ = getStreamCache().get();
+  }
+
+  ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) {
+    cudaCheck(cudaSetDevice(currentDevice_));
+    if (data.mayReuseStream()) {
+      stream_ = data.streamPtr();
+    } else {
+      stream_ = getStreamCache().get();
+    }
+  }
+
+  ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream)
+      : currentDevice_(device), stream_(std::move(stream)) {
+    cudaCheck(cudaSetDevice(currentDevice_));
+  }
+}  // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc
index 6f92ac91dd922..90ca0f7409c19 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc
@@ -49,7 +49,7 @@ void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, ed
   cms::cuda::ScopedContextProduce ctx{in};
   cms::cudatest::Thing const& input = ctx.get(in);
 
-  ctx.emplace(iEvent, dstToken_, cms::cudatest::Thing{gpuAlgo_.runAlgo(label_, input.get(), ctx.stream())});
+  ctx.emplace(iEvent, dstToken_, cms::cudatest::Thing{gpuAlgo_.runAlgo(label_, input.get(), ctx)});
 
   edm::LogVerbatim("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce end event "
                                           << iEvent.id().event() << " stream " << iEvent.streamID();
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
index 9b6fe85636026..f78f4251c2062 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
@@ -69,7 +69,7 @@ void TestCUDAProducerGPUEW::acquire(edm::Event const& iEvent,
   cms::cuda::ScopedContextAcquire ctx{in, std::move(waitingTaskHolder), ctxState_};
   cms::cudatest::Thing const& input = ctx.get(in);
 
-  devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream());
+  devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx);
   // Mimick the need to transfer some of the GPU data back to CPU to
   // be used for something within this module, or to be put in the
   // event.
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
index d1e4f94a30d96..701783d6d994c 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
@@ -79,7 +79,7 @@ void TestCUDAProducerGPUEWTask::acquire(edm::Event const& iEvent,
 
   cms::cudatest::Thing const& input = ctx.get(in);
 
-  devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream());
+  devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx);
   // Mimick the need to transfer some of the GPU data back to CPU to
   // be used for something within this module, or to be put in the
   // event.
@@ -107,7 +107,7 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID,
     ctx.pushNextTask(
         [eventID, streamID, this](cms::cuda::ScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); });
 
-    gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx.stream());
+    gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx);
     edm::LogVerbatim("TestCUDAProducerGPUEWTask")
         << label_ << " TestCUDAProducerGPUEWTask::addSimpleWork end event " << eventID << " stream " << streamID;
   } else {
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc
index b9752f6f41630..693c5b7aa2a49 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc
@@ -46,7 +46,7 @@ void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID,
 
   cms::cuda::ScopedContextProduce ctx{streamID};
 
-  cms::cuda::device::unique_ptr<float[]> output = gpuAlgo_.runAlgo(label_, ctx.stream());
+  cms::cuda::device::unique_ptr<float[]> output = gpuAlgo_.runAlgo(label_, ctx);
   ctx.emplace(iEvent, dstToken_, std::move(output));
 
   edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce end event "
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
index 69264a40aca62..00fbe3e9dae4d 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
@@ -64,7 +64,8 @@ namespace {
 
 cms::cuda::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const std::string &label,
                                                                           const float *d_input,
-                                                                          cudaStream_t stream) const {
+                                                                          cms::cuda::ScopedContextBase &ctx) const {
+  auto stream = ctx.stream();
   // First make the sanity check
   if (d_input != nullptr) {
     auto h_check = std::make_unique<float[]>(NUM_VALUES);
@@ -86,8 +87,8 @@ cms::cuda::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
     h_b[i] = i * i;
   }
 
-  auto d_a = cms::cuda::make_device_unique<float[]>(NUM_VALUES, stream);
-  auto d_b = cms::cuda::make_device_unique<float[]>(NUM_VALUES, stream);
+  auto d_a = ctx.make_device_unique<float[]>(NUM_VALUES);
+  auto d_b = ctx.make_device_unique<float[]>(NUM_VALUES);
 
   cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));
   cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));
@@ -95,15 +96,15 @@ cms::cuda::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
   int threadsPerBlock{32};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
 
-  auto d_c = cms::cuda::make_device_unique<float[]>(NUM_VALUES, stream);
+  auto d_c = ctx.make_device_unique<float[]>(NUM_VALUES);
   auto current_device = cms::cuda::currentDevice();
   cms::cuda::LogVerbatim("TestHeterogeneousEDProducerGPU")
       << " " << label << " GPU launching kernels device " << current_device << " CUDA stream " << stream;
   vectorAdd<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_a.get(), d_b.get(), d_c.get(), NUM_VALUES);
 
-  auto d_ma = cms::cuda::make_device_unique<float[]>(NUM_VALUES * NUM_VALUES, stream);
-  auto d_mb = cms::cuda::make_device_unique<float[]>(NUM_VALUES * NUM_VALUES, stream);
-  auto d_mc = cms::cuda::make_device_unique<float[]>(NUM_VALUES * NUM_VALUES, stream);
+  auto d_ma = ctx.make_device_unique<float[]>(NUM_VALUES * NUM_VALUES);
+  auto d_mb = ctx.make_device_unique<float[]>(NUM_VALUES * NUM_VALUES);
+  auto d_mc = ctx.make_device_unique<float[]>(NUM_VALUES * NUM_VALUES);
   dim3 threadsPerBlock3{NUM_VALUES, NUM_VALUES};
   dim3 blocksPerGrid3{1, 1};
   if (NUM_VALUES * NUM_VALUES > 32) {
@@ -124,8 +125,8 @@ cms::cuda::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
   return d_a;
 }
 
-void TestCUDAProducerGPUKernel::runSimpleAlgo(float *d_data, cudaStream_t stream) const {
+void TestCUDAProducerGPUKernel::runSimpleAlgo(float *d_data, cms::cuda::ScopedContextBase &ctx) const {
   int threadsPerBlock{32};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
-  vectorAddConstant<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_data, 1.0f, NUM_VALUES);
+  vectorAddConstant<<<blocksPerGrid, threadsPerBlock, 0, ctx.stream()>>>(d_data, 1.0f, NUM_VALUES);
 }
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h
index 5eeba0009656e..b98be814bce4e 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h
@@ -5,6 +5,7 @@
 
 #include <cuda_runtime.h>
 
+#include "HeterogeneousCore/CUDACore/interface/ScopedContextBase.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 
 /**
@@ -24,14 +25,14 @@ class TestCUDAProducerGPUKernel {
   ~TestCUDAProducerGPUKernel() = default;
 
   // returns (owning) pointer to device memory
-  cms::cuda::device::unique_ptr<float[]> runAlgo(const std::string& label, cudaStream_t stream) const {
-    return runAlgo(label, nullptr, stream);
+  cms::cuda::device::unique_ptr<float[]> runAlgo(const std::string& label, cms::cuda::ScopedContextBase& ctx) const {
+    return runAlgo(label, nullptr, ctx);
   }
   cms::cuda::device::unique_ptr<float[]> runAlgo(const std::string& label,
                                                  const float* d_input,
-                                                 cudaStream_t stream) const;
+                                                 cms::cuda::ScopedContextBase& ctx) const;
 
-  void runSimpleAlgo(float* d_data, cudaStream_t stream) const;
+  void runSimpleAlgo(float* d_data, cms::cuda::ScopedContextBase& ctx) const;
 };
 
 #endif
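
For reference, a minimal usage sketch of the interface introduced above (not part of the patch): algorithm code can take a cms::cuda::ScopedContextBase& and obtain both the CUDA stream and stream-associated device allocations from it, instead of receiving a bare cudaStream_t. The free function copyToDevice below is a hypothetical illustration, not code added by this patch.

#include <cstddef>

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDACore/interface/ScopedContextBase.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

namespace {
  // Hypothetical helper: allocate device memory via the context and copy a
  // host buffer into it. Both the allocation and the copy are associated with
  // the context's CUDA stream, so no extra synchronization is done here; the
  // caller relies on subsequent work being queued on the same stream.
  cms::cuda::device::unique_ptr<float[]> copyToDevice(float const* h_data,
                                                      size_t n,
                                                      cms::cuda::ScopedContextBase& ctx) {
    auto d_data = ctx.make_device_unique<float[]>(n);
    cudaCheck(cudaMemcpyAsync(d_data.get(), h_data, n * sizeof(float), cudaMemcpyHostToDevice, ctx.stream()));
    return d_data;
  }
}  // namespace

Since ScopedContextAcquire, ScopedContextProduce, and ScopedContextTask all derive from ScopedContextBase after this change, any of them can be passed where a ScopedContextBase& is expected, as the updated TestCUDAProducerGPU* plugins do.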