From 06cc8f2eb874c6496ca7b1489a469bfbf4f5ec06 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 23 Aug 2021 06:44:58 -0700 Subject: [PATCH 01/12] Add Context class hierarchy --- src/cudadev/CUDACore/Context.cc | 103 ++++++++++++ src/cudadev/CUDACore/Context.h | 235 ++++++++++++++++++++++++++++ src/cudadev/CUDACore/ContextState.h | 3 + src/cudadev/CUDACore/Product.h | 5 +- src/cudadev/CUDACore/ProductBase.h | 5 +- 5 files changed, 349 insertions(+), 2 deletions(-) create mode 100644 src/cudadev/CUDACore/Context.cc create mode 100644 src/cudadev/CUDACore/Context.h diff --git a/src/cudadev/CUDACore/Context.cc b/src/cudadev/CUDACore/Context.cc new file mode 100644 index 000000000..5d1a46dff --- /dev/null +++ b/src/cudadev/CUDACore/Context.cc @@ -0,0 +1,103 @@ +#include "CUDACore/Context.h" + +#include "CUDACore/StreamCache.h" +#include "CUDACore/cudaCheck.h" + +#include "chooseDevice.h" + +namespace { + struct CallbackData { + edm::WaitingTaskWithArenaHolder holder; + int device; + }; + + void CUDART_CB cudaContextCallback(cudaStream_t streamId, cudaError_t status, void* data) { + std::unique_ptr guard{reinterpret_cast(data)}; + edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; + int device = guard->device; + if (status == cudaSuccess) { + //std::cout << " GPU kernel finished (in callback) device " << device << " CUDA stream " + // << streamId << std::endl; + waitingTaskHolder.doneWaiting(nullptr); + } else { + // wrap the exception in a try-catch block to let GDB "catch throw" break on it + try { + auto error = cudaGetErrorName(status); + auto message = cudaGetErrorString(status); + throw std::runtime_error("Callback of CUDA stream " + + std::to_string(reinterpret_cast(streamId)) + " in device " + + std::to_string(device) + " error " + std::string(error) + ": " + std::string(message)); + } catch (std::exception&) { + waitingTaskHolder.doneWaiting(std::current_exception()); + } + } + } +} // namespace + +namespace cms::cuda { + namespace impl { + Context::Context(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) { + cudaCheck(cudaSetDevice(currentDevice_)); + } + + Context::Context(int device, SharedStreamPtr stream) : currentDevice_(device), stream_(std::move(stream)) { + cudaCheck(cudaSetDevice(currentDevice_)); + } + + void Context::initialize() { stream_ = getStreamCache().get(); } + + void Context::initialize(const ProductBase& data) { + if (data.mayReuseStream()) { + stream_ = data.streamPtr(); + } else { + stream_ = getStreamCache().get(); + } + } + + //////////////////// + + void ContextGetterBase::synchronizeStreams(int dataDevice, + cudaStream_t dataStream, + bool available, + cudaEvent_t dataEvent) { + if (dataDevice != device()) { + // Eventually replace with prefetch to current device (assuming unified memory works) + // If we won't go to unified memory, need to figure out something else... + throw std::runtime_error("Handling data from multiple devices is not yet supported"); + } + + if (dataStream != stream()) { + // Different streams, need to synchronize + if (not available) { + // Event not yet occurred, so need to add synchronization + // here. Sychronization is done by making the CUDA stream to + // wait for an event, so all subsequent work in the stream + // will run only after the event has "occurred" (i.e. data + // product became available). + cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event"); + } + } + } + + //////////////////// + + void ContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) { + cudaCheck(cudaStreamAddCallback(stream, cudaContextCallback, new CallbackData{waitingTaskHolder_, device}, 0)); + } + } // namespace impl + + //////////////////// + + void AcquireContext::commit() { + holderHelper_.enqueueCallback(device(), stream()); + contextState_.set(device(), streamPtr()); + } + + //////////////////// + + void ProduceContext::commit() { cudaCheck(cudaEventRecord(event_.get(), stream())); } + + //////////////////// + + void TaskContext::commit() { holderHelper_.enqueueCallback(device(), stream()); } +} // namespace cms::cuda diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h new file mode 100644 index 000000000..6fe87dcc5 --- /dev/null +++ b/src/cudadev/CUDACore/Context.h @@ -0,0 +1,235 @@ +#ifndef HeterogeneousCore_CUDACore_Context_h +#define HeterogeneousCore_CUDACore_Context_h + +#include "CUDACore/Product.h" +#include "Framework/WaitingTaskWithArenaHolder.h" +#include "Framework/Event.h" +#include "Framework/EDGetToken.h" +#include "Framework/EDPutToken.h" +#include "CUDACore/ContextState.h" +#include "CUDACore/EventCache.h" +#include "CUDACore/SharedEventPtr.h" +#include "CUDACore/SharedStreamPtr.h" + +namespace cms::cuda { + namespace impl { + // This class is intended to be derived by other Context*, not for general use + class Context { + public: + Context(Context const&) = delete; + Context& operator=(Context const&) = delete; + Context(Context&&) = delete; + Context& operator=(Context&&) = delete; + + int device() const { return currentDevice_; } + + // cudaStream_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the Context itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaStream_t stream() { + if (not isInitialized()) { + initialize(); + } + return stream_.get(); + } + const SharedStreamPtr& streamPtr() { + if (not isInitialized()) { + initialize(); + } + return stream_; + } + + protected: + // The constructors set the current device, but the device + // is not set back to the previous value at the destructor. This + // should be sufficient (and tiny bit faster) as all CUDA API + // functions relying on the current device should be called from + // the scope where this context is. The current device doesn't + // really matter between modules (or across TBB tasks). + explicit Context(edm::StreamID streamID); + + explicit Context(int device, SharedStreamPtr stream); + + bool isInitialized() const { return bool(stream_); } + + void initialize(); + void initialize(const ProductBase& data); + + private: + int currentDevice_ = -1; + SharedStreamPtr stream_; + }; + + class ContextGetterBase : public Context { + public: + template + const T& get(const Product& data) { + if (not isInitialized()) { + initialize(data); + } + synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); + return data.data_; + } + + template + const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { + return get(iEvent.get(token)); + } + + protected: + template + ContextGetterBase(Args&&... args) : Context(std::forward(args)...) {} + + private: + void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); + }; + + class ContextHolderHelper { + public: + ContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : waitingTaskHolder_{std::move(waitingTaskHolder)} {} + + template + void pushNextTask(F&& f, ContextState const* state); + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + waitingTaskHolder_ = std::move(waitingTaskHolder); + } + + void enqueueCallback(int device, cudaStream_t stream); + + private: + edm::WaitingTaskWithArenaHolder waitingTaskHolder_; + }; + } // namespace impl + + /** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + * - synchronizing between CUDA streams if necessary + * Users should not, however, construct it explicitly. + */ + class AcquireContext : public impl::ContextGetterBase { + public: + explicit AcquireContext(edm::StreamID streamID, + edm::WaitingTaskWithArenaHolder waitingTaskHolder, + ContextState& state) + : ContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{state} {} + ~AcquireContext() = default; + + template + void pushNextTask(F&& f) { + holderHelper_.pushNextTask(std::forward(f), contextState_); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + + // internal API + void commit(); + + private: + impl::ContextHolderHelper holderHelper_; + ContextState& contextState_; + }; + + /** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): + * - setting the current device + * - synchronizing between CUDA streams if necessary + * Users should not, however, construct it explicitly. + */ + class ProduceContext : public impl::ContextGetterBase { + public: + /// Constructor to create a new CUDA stream (non-ExternalWork module) + explicit ProduceContext(edm::StreamID streamID) : ContextGetterBase(streamID) {} + + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit ProduceContext(ContextState& state) : ContextGetterBase(state.device(), state.releaseStreamPtr()) {} + + ~ProduceContext() = default; + + template + std::unique_ptr> wrap(T data) { + // make_unique doesn't work because of private constructor + return std::unique_ptr>(new Product(device(), streamPtr(), event_, std::move(data))); + } + + template + auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { + return iEvent.emplace(token, device(), streamPtr(), event_, std::forward(args)...); + } + + // internal API + void commit(); + + private: + // This construcor is only meant for testing + explicit ProduceContext(int device, SharedStreamPtr stream, SharedEventPtr event) + : ContextGetterBase(device, std::move(stream)), event_{std::move(event)} {} + + // create the CUDA Event upfront to catch possible errors from its creation + SharedEventPtr event_ = getEventCache().get(); + }; + + /** + * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + */ + class TaskContext : public impl::Context { + public: + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit TaskContext(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : Context(state->device(), state->streamPtr()), // don't move, state is re-used afterwards + holderHelper_{std::move(waitingTaskHolder)}, + contextState_{state} {} + + ~TaskContext() = default; + + template + void pushNextTask(F&& f) { + holderHelper_.pushNextTask(std::forward(f), contextState_); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + + // Internal API + void commit(); + + private: + impl::ContextHolderHelper holderHelper_; + ContextState const* contextState_; + }; + + /** + * The aim of this class is to do necessary per-event "initialization" in analyze() + * - setting the current device + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. + */ + class AnalyzeContext : public impl::ContextGetterBase { + public: + /// Constructor to (possibly) re-use a CUDA stream + explicit AnalyzeContext(edm::StreamID streamID) : ContextGetterBase(streamID) {} + }; + + namespace impl { + template + void ContextHolderHelper::pushNextTask(F&& f, ContextState const* state) { + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ + edm::make_waiting_task_with_holder(tbb::task::allocate_root(), + std::move(waitingTaskHolder_), + [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(TaskContext{state, std::move(h)}); + })}); + } + } // namespace impl +} // namespace cms::cuda + +#endif diff --git a/src/cudadev/CUDACore/ContextState.h b/src/cudadev/CUDACore/ContextState.h index 3c73054ab..b0109e33d 100644 --- a/src/cudadev/CUDACore/ContextState.h +++ b/src/cudadev/CUDACore/ContextState.h @@ -26,6 +26,9 @@ namespace cms { friend class ScopedContextAcquire; friend class ScopedContextProduce; friend class ScopedContextTask; + friend class AcquireContext; + friend class ProduceContext; + friend class TaskContext; void set(int device, SharedStreamPtr stream) { throwIfStream(); diff --git a/src/cudadev/CUDACore/Product.h b/src/cudadev/CUDACore/Product.h index c60e994f0..5d9cab481 100644 --- a/src/cudadev/CUDACore/Product.h +++ b/src/cudadev/CUDACore/Product.h @@ -14,7 +14,8 @@ namespace cms { namespace cuda { namespace impl { class ScopedContextGetterBase; - } + class ContextGetterBase; + } // namespace impl /** * The purpose of this class is to wrap CUDA data to edm::Event in a @@ -43,6 +44,8 @@ namespace cms { private: friend class impl::ScopedContextGetterBase; friend class ScopedContextProduce; + friend class impl::ContextGetterBase; + friend class ProduceContext; friend class edm::Wrapper>; explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data) diff --git a/src/cudadev/CUDACore/ProductBase.h b/src/cudadev/CUDACore/ProductBase.h index cb3fd4db9..00d12b3fb 100644 --- a/src/cudadev/CUDACore/ProductBase.h +++ b/src/cudadev/CUDACore/ProductBase.h @@ -11,7 +11,8 @@ namespace cms { namespace cuda { namespace impl { class ScopedContextBase; - } + class Context; + } // namespace impl /** * Base class for all instantiations of CUDA to hold the @@ -61,6 +62,8 @@ namespace cms { private: friend class impl::ScopedContextBase; friend class ScopedContextProduce; + friend class impl::Context; + friend class ProduceContext; // The following function is intended to be used only from ScopedContext const SharedStreamPtr& streamPtr() const { return stream_; } From 44b14290e229da447659baec257a1ad55583dba2 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 23 Aug 2021 06:59:14 -0700 Subject: [PATCH 02/12] Add EDProducer base class, and runAcquire() and runProduce() functions Add runAcquire(), runProduce() functions --- src/cudadev/CUDACore/Context.h | 21 +++++++++++++++++++++ src/cudadev/CUDACore/EDProducer.cc | 17 +++++++++++++++++ src/cudadev/CUDACore/EDProducer.h | 29 +++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+) create mode 100644 src/cudadev/CUDACore/EDProducer.cc create mode 100644 src/cudadev/CUDACore/EDProducer.h diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h index 6fe87dcc5..b85d29774 100644 --- a/src/cudadev/CUDACore/Context.h +++ b/src/cudadev/CUDACore/Context.h @@ -230,6 +230,27 @@ namespace cms::cuda { })}); } } // namespace impl + + template + void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, ContextState& state, F func) { + AcquireContext context(streamID, std::move(holder), state); + func(context); + context.commit(); + } + + template + void runProduce(edm::StreamID streamID, F func) { + ProduceContext context(streamID); + func(context); + context.commit(); + } + + template + void runProduce(ContextState& state, F func) { + ProduceContext context(state); + func(context); + context.commit(); + } } // namespace cms::cuda #endif diff --git a/src/cudadev/CUDACore/EDProducer.cc b/src/cudadev/CUDACore/EDProducer.cc new file mode 100644 index 000000000..d41d77c31 --- /dev/null +++ b/src/cudadev/CUDACore/EDProducer.cc @@ -0,0 +1,17 @@ +#include "CUDACore/EDProducer.h" + +namespace cms::cuda { + void EDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) { + runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); }); + } + + void SynchronizingEDProducer::acquire(edm::Event const& event, + edm::EventSetup const& eventSetup, + edm::WaitingTaskWithArenaHolder holder) { + runAcquire(event.streamID(), std::move(holder), state_, [&](auto& ctx) { acquire(event, eventSetup, ctx); }); + } + + void SynchronizingEDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) { + runProduce(state_, [&](auto& ctx) { produce(event, eventSetup, ctx); }); + } +} // namespace cms::cuda diff --git a/src/cudadev/CUDACore/EDProducer.h b/src/cudadev/CUDACore/EDProducer.h new file mode 100644 index 000000000..3fc917285 --- /dev/null +++ b/src/cudadev/CUDACore/EDProducer.h @@ -0,0 +1,29 @@ +#ifndef HeterogeneousCore_CUDACore_stream_EDProducer_h +#define HeterogeneousCore_CUDACore_stream_EDProducer_h + +#include "Framework/EDProducer.h" +#include "CUDACore/Context.h" + +namespace cms::cuda { + class EDProducer : public edm::EDProducer { + public: + void produce(edm::Event& event, edm::EventSetup const& eventSetup) override; + virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0; + }; + + class SynchronizingEDProducer : public edm::EDProducerExternalWork { + public: + void acquire(edm::Event const& event, + edm::EventSetup const& eventSetup, + edm::WaitingTaskWithArenaHolder holder) override; + void produce(edm::Event& event, edm::EventSetup const& eventSetup) override; + + virtual void acquire(edm::Event const& event, edm::EventSetup const& eventSetup, AcquireContext& context) = 0; + virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0; + + private: + ContextState state_; + }; +} // namespace cms::cuda + +#endif From 70341c6796050443706a0aa8c5ab5d53f2a3c2c4 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 23 Aug 2021 10:16:19 -0700 Subject: [PATCH 03/12] Migrate code from ScopedContext to Context --- .../plugin-BeamSpotProducer/BeamSpotToCUDA.cc | 11 ++---- .../PixelTrackSoAFromCUDA.cc | 20 ++++------ .../plugin-PixelTriplets/CAHitNtupletCUDA.cc | 13 +++---- .../PixelVertexProducerCUDA.cc | 13 +++---- .../PixelVertexSoAFromCUDA.cc | 20 ++++------ .../SiPixelRawToClusterCUDA.cc | 24 +++++------- .../SiPixelDigisSoAFromCUDA.cc | 19 ++++------ .../SiPixelRecHitCUDA.cc | 14 +++---- .../plugin-Validation/CountValidator.cc | 38 +++++++++---------- .../plugin-Validation/HistoValidator.cc | 20 ++++------ 10 files changed, 78 insertions(+), 114 deletions(-) diff --git a/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc b/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc index 48badcabf..b9633a856 100644 --- a/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc +++ b/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc @@ -2,23 +2,22 @@ #include +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDACore/copyAsync.h" #include "CUDACore/host_noncached_unique_ptr.h" #include "CUDADataFormats/BeamSpotCUDA.h" #include "DataFormats/BeamSpotPOD.h" -#include "Framework/EDProducer.h" #include "Framework/Event.h" #include "Framework/EventSetup.h" #include "Framework/PluginFactory.h" -class BeamSpotToCUDA : public edm::EDProducer { +class BeamSpotToCUDA : public cms::cuda::EDProducer { public: explicit BeamSpotToCUDA(edm::ProductRegistry& reg); ~BeamSpotToCUDA() override = default; - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override; private: const edm::EDPutTokenT> bsPutToken_; @@ -30,11 +29,9 @@ BeamSpotToCUDA::BeamSpotToCUDA(edm::ProductRegistry& reg) : bsPutToken_{reg.produces>()}, bsHost{cms::cuda::make_host_noncached_unique(cudaHostAllocWriteCombined)} {} -void BeamSpotToCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void BeamSpotToCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) { *bsHost = iSetup.get(); - cms::cuda::ScopedContextProduce ctx{iEvent.streamID()}; - BeamSpotCUDA bsDevice(ctx.stream()); cms::cuda::copyAsync(bsDevice.ptr(), bsHost, ctx.stream()); diff --git a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc index 408450ea8..137bcb073 100644 --- a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc +++ b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc @@ -1,27 +1,25 @@ #include +#include "CUDACore/Context.h" +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDACore/HostProduct.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" -#include "CUDACore/ScopedContext.h" // Switch on to enable checks and printout for found tracks #undef PIXEL_DEBUG_PRODUCE -class PixelTrackSoAFromCUDA : public edm::EDProducerExternalWork { +class PixelTrackSoAFromCUDA : public cms::cuda::SynchronizingEDProducer { public: explicit PixelTrackSoAFromCUDA(edm::ProductRegistry& reg); ~PixelTrackSoAFromCUDA() override = default; private: - void acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) override; edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; @@ -35,15 +33,13 @@ PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(edm::ProductRegistry& reg) void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); - cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); + cms::cuda::AcquireContext& ctx) { + auto const& inputData = ctx.get(iEvent, tokenCUDA_); soa_ = inputData.toHostAsync(ctx.stream()); } -void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { +void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) { #ifdef PIXEL_DEBUG_PRODUCE auto const& tsoa = *soa_; auto maxTracks = tsoa.stride(); diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc index 94085d784..aca6bd319 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc @@ -1,24 +1,23 @@ #include #include "CUDACore/Product.h" +#include "CUDACore/EDProducer.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" #include "Framework/RunningAverage.h" -#include "CUDACore/ScopedContext.h" #include "CAHitNtupletGeneratorOnGPU.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" -class CAHitNtupletCUDA : public edm::EDProducer { +class CAHitNtupletCUDA : public cms::cuda::EDProducer { public: explicit CAHitNtupletCUDA(edm::ProductRegistry& reg); ~CAHitNtupletCUDA() override = default; private: - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override; edm::EDGetTokenT> tokenHitGPU_; edm::EDPutTokenT> tokenTrackGPU_; @@ -31,12 +30,10 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(edm::ProductRegistry& reg) tokenTrackGPU_{reg.produces>()}, gpuAlgo_(reg) {} -void CAHitNtupletCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { +void CAHitNtupletCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es, cms::cuda::ProduceContext& ctx) { auto bf = 0.0114256972711507; // 1/fieldInGeV - auto const& phits = iEvent.get(tokenHitGPU_); - cms::cuda::ScopedContextProduce ctx{phits}; - auto const& hits = ctx.get(phits); + auto const& hits = ctx.get(iEvent, tokenHitGPU_); ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); } diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc index 723f7eb4c..50827a71a 100644 --- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc +++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc @@ -6,7 +6,7 @@ #include "Framework/PluginFactory.h" #include "Framework/EDProducer.h" #include "Framework/RunningAverage.h" -#include "CUDACore/ScopedContext.h" +#include "CUDACore/Context.h" #include "gpuVertexFinder.h" @@ -58,14 +58,13 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(edm::ProductRegistry& reg) } void PixelVertexProducerCUDA::produceOnGPU(edm::Event& iEvent, const edm::EventSetup& iSetup) { - auto const& ptracks = iEvent.get(tokenGPUTrack_); + cms::cuda::runProduce(iEvent.streamID(), [&](cms::cuda::ProduceContext& ctx) { + auto const* tracks = ctx.get(iEvent, tokenGPUTrack_).get(); - cms::cuda::ScopedContextProduce ctx{ptracks}; - auto const* tracks = ctx.get(ptracks).get(); + assert(tracks); - assert(tracks); - - ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_)); + ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_)); + }); } void PixelVertexProducerCUDA::produceOnCPU(edm::Event& iEvent, const edm::EventSetup& iSetup) { diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc index d709f0c5e..79ad0ca91 100644 --- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc +++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc @@ -1,25 +1,23 @@ #include +#include "CUDACore/Context.h" +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDACore/HostProduct.h" #include "CUDADataFormats/ZVertexHeterogeneous.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" #include "Framework/RunningAverage.h" -#include "CUDACore/ScopedContext.h" -class PixelVertexSoAFromCUDA : public edm::EDProducerExternalWork { +class PixelVertexSoAFromCUDA : public cms::cuda::SynchronizingEDProducer { public: explicit PixelVertexSoAFromCUDA(edm::ProductRegistry& reg); ~PixelVertexSoAFromCUDA() override = default; private: - void acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) override; edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; @@ -33,15 +31,13 @@ PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(edm::ProductRegistry& reg) void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - auto const& inputDataWrapped = iEvent.get(tokenCUDA_); - cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); + cms::cuda::AcquireContext& ctx) { + auto const& inputData = ctx.get(iEvent, tokenCUDA_); m_soa = inputData.toHostAsync(ctx.stream()); } -void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { +void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) { // No copies.... iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); } diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc index a5229b295..7a2875ad9 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc +++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc @@ -4,8 +4,9 @@ #include // CMSSW includes +#include "CUDACore/Context.h" +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" @@ -16,7 +17,6 @@ #include "DataFormats/FEDRawData.h" #include "DataFormats/FEDRawDataCollection.h" #include "DataFormats/SiPixelErrorCompact.h" -#include "Framework/EDProducer.h" #include "Framework/Event.h" #include "Framework/EventSetup.h" #include "Framework/PluginFactory.h" @@ -26,18 +26,14 @@ #include "SiPixelClusterThresholds.h" #include "SiPixelRawToClusterGPUKernel.h" -class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork { +class SiPixelRawToClusterCUDA : public cms::cuda::SynchronizingEDProducer { public: explicit SiPixelRawToClusterCUDA(edm::ProductRegistry& reg); ~SiPixelRawToClusterCUDA() override = default; private: - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; - - cms::cuda::ContextState ctxState_; + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override; edm::EDGetTokenT rawGetToken_; edm::EDPutTokenT> digiPutToken_; @@ -72,9 +68,7 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(edm::ProductRegistry& reg) void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; - + cms::cuda::AcquireContext& ctx) { auto const& hgpuMap = iSetup.get(); if (hgpuMap.hasQuality() != useQuality_) { throw std::runtime_error( @@ -170,9 +164,9 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, ctx.stream()); } -void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - cms::cuda::ScopedContextProduce ctx{ctxState_}; - +void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, + const edm::EventSetup& iSetup, + cms::cuda::ProduceContext& ctx) { auto tmp = gpuAlgo_.getResults(); ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second)); diff --git a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc index 448f4b797..8b90c5326 100644 --- a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc +++ b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc @@ -1,23 +1,21 @@ +#include "CUDACore/Context.h" +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" #include "DataFormats/SiPixelDigisSoA.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" -#include "Framework/EDProducer.h" #include "Framework/PluginFactory.h" -#include "CUDACore/ScopedContext.h" #include "CUDACore/host_unique_ptr.h" -class SiPixelDigisSoAFromCUDA : public edm::EDProducerExternalWork { +class SiPixelDigisSoAFromCUDA : public cms::cuda::SynchronizingEDProducer { public: explicit SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg); ~SiPixelDigisSoAFromCUDA() override = default; private: - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) override; edm::EDGetTokenT> digiGetToken_; edm::EDPutTokenT digiPutToken_; @@ -36,10 +34,7 @@ SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg) void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - // Do the transfer in a CUDA stream parallel to the computation CUDA stream - cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; - + cms::cuda::AcquireContext& ctx) { const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); nDigis_ = gpuDigis.nDigis(); @@ -49,7 +44,7 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, clus_ = gpuDigis.clusToHostAsync(ctx.stream()); } -void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) { // The following line copies the data from the pinned host memory to // regular host memory. In principle that feels unnecessary (why not // just use the pinned host memory?). There are a few arguments for diff --git a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc index 413982fc3..133fa8eaf 100644 --- a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc +++ b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc @@ -1,6 +1,7 @@ #include #include "CUDADataFormats/BeamSpotCUDA.h" +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" @@ -8,19 +9,17 @@ #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" -#include "CUDACore/ScopedContext.h" #include "CondFormats/PixelCPEFast.h" #include "PixelRecHitGPUKernel.h" -class SiPixelRecHitCUDA : public edm::EDProducer { +class SiPixelRecHitCUDA : public cms::cuda::EDProducer { public: explicit SiPixelRecHitCUDA(edm::ProductRegistry& reg); ~SiPixelRecHitCUDA() override = default; private: - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override; // The mess with inputs will be cleaned up when migrating to the new framework const edm::EDGetTokenT> tBeamSpot; @@ -36,13 +35,10 @@ SiPixelRecHitCUDA::SiPixelRecHitCUDA(edm::ProductRegistry& reg) tokenDigi_(reg.consumes>()), tokenHit_(reg.produces>()) {} -void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { +void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es, cms::cuda::ProduceContext& ctx) { PixelCPEFast const& fcpe = es.get(); - auto const& pclusters = iEvent.get(token_); - cms::cuda::ScopedContextProduce ctx{pclusters}; - - auto const& clusters = ctx.get(pclusters); + auto const& clusters = ctx.get(iEvent, token_); auto const& digis = ctx.get(iEvent, tokenDigi_); auto const& bs = ctx.get(iEvent, tBeamSpot); diff --git a/src/cudadev/plugin-Validation/CountValidator.cc b/src/cudadev/plugin-Validation/CountValidator.cc index 23352f5ba..1993e20eb 100644 --- a/src/cudadev/plugin-Validation/CountValidator.cc +++ b/src/cudadev/plugin-Validation/CountValidator.cc @@ -1,5 +1,5 @@ +#include "CUDACore/Context.h" #include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" @@ -63,24 +63,24 @@ void CountValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) ss << "Event " << iEvent.eventID() << " "; { - auto const& pdigis = iEvent.get(digiToken_); - cms::cuda::ScopedContextProduce ctx{pdigis}; - auto const& count = iEvent.get(digiClusterCountToken_); - auto const& digis = ctx.get(iEvent, digiToken_); - auto const& clusters = ctx.get(iEvent, clusterToken_); - - if (digis.nModules() != count.nModules()) { - ss << "\n N(modules) is " << digis.nModules() << " expected " << count.nModules(); - ok = false; - } - if (digis.nDigis() != count.nDigis()) { - ss << "\n N(digis) is " << digis.nDigis() << " expected " << count.nDigis(); - ok = false; - } - if (clusters.nClusters() != count.nClusters()) { - ss << "\n N(clusters) is " << clusters.nClusters() << " expected " << count.nClusters(); - ok = false; - } + cms::cuda::runProduce(iEvent.streamID(), [&](cms::cuda::ProduceContext& ctx) { + auto const& count = iEvent.get(digiClusterCountToken_); + auto const& digis = ctx.get(iEvent, digiToken_); + auto const& clusters = ctx.get(iEvent, clusterToken_); + + if (digis.nModules() != count.nModules()) { + ss << "\n N(modules) is " << digis.nModules() << " expected " << count.nModules(); + ok = false; + } + if (digis.nDigis() != count.nDigis()) { + ss << "\n N(digis) is " << digis.nDigis() << " expected " << count.nDigis(); + ok = false; + } + if (clusters.nClusters() != count.nClusters()) { + ss << "\n N(clusters) is " << clusters.nClusters() << " expected " << count.nClusters(); + ok = false; + } + }); } { diff --git a/src/cudadev/plugin-Validation/HistoValidator.cc b/src/cudadev/plugin-Validation/HistoValidator.cc index 8a888666b..c23bb42af 100644 --- a/src/cudadev/plugin-Validation/HistoValidator.cc +++ b/src/cudadev/plugin-Validation/HistoValidator.cc @@ -1,5 +1,6 @@ +#include "CUDACore/Context.h" +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" @@ -8,22 +9,19 @@ #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" #include "SimpleAtomicHisto.h" #include #include -class HistoValidator : public edm::EDProducerExternalWork { +class HistoValidator : public cms::cuda::SynchronizingEDProducer { public: explicit HistoValidator(edm::ProductRegistry& reg); private: - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) override; void endJob() override; edm::EDGetTokenT> digiToken_; @@ -88,11 +86,7 @@ HistoValidator::HistoValidator(edm::ProductRegistry& reg) trackToken_(reg.consumes()), vertexToken_(reg.consumes()) {} -void HistoValidator::acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - auto const& pdigis = iEvent.get(digiToken_); - cms::cuda::ScopedContextAcquire ctx{pdigis, std::move(waitingTaskHolder)}; +void HistoValidator::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) { auto const& digis = ctx.get(iEvent, digiToken_); auto const& clusters = ctx.get(iEvent, clusterToken_); auto const& hits = ctx.get(iEvent, hitToken_); @@ -113,7 +107,7 @@ void HistoValidator::acquire(const edm::Event& iEvent, h_size = hits.sizeToHostAsync(ctx.stream()); } -void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) { histos["digi_n"].fill(nDigis); for (uint32_t i = 0; i < nDigis; ++i) { histos["digi_adc"].fill(h_adc[i]); From df786db1d4f7405af6a575dec88c79c3a86bbebc Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 31 Aug 2021 14:37:24 -0700 Subject: [PATCH 04/12] Avoid using ContextState --- src/cudadev/CUDACore/Context.cc | 17 ++++----- src/cudadev/CUDACore/Context.h | 60 ++++++++++++------------------ src/cudadev/CUDACore/EDProducer.cc | 4 +- src/cudadev/CUDACore/EDProducer.h | 3 -- src/cudadev/Framework/Event.h | 27 +++++++++++++- 5 files changed, 58 insertions(+), 53 deletions(-) diff --git a/src/cudadev/CUDACore/Context.cc b/src/cudadev/CUDACore/Context.cc index 5d1a46dff..80452494d 100644 --- a/src/cudadev/CUDACore/Context.cc +++ b/src/cudadev/CUDACore/Context.cc @@ -36,9 +36,9 @@ namespace { namespace cms::cuda { namespace impl { - Context::Context(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) { - cudaCheck(cudaSetDevice(currentDevice_)); - } + Context::Context(edm::StreamID streamID) : Context(chooseDevice(streamID)) {} + + Context::Context(int device) : currentDevice_(device) { cudaCheck(cudaSetDevice(currentDevice_)); } Context::Context(int device, SharedStreamPtr stream) : currentDevice_(device), stream_(std::move(stream)) { cudaCheck(cudaSetDevice(currentDevice_)); @@ -81,17 +81,14 @@ namespace cms::cuda { //////////////////// - void ContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) { - cudaCheck(cudaStreamAddCallback(stream, cudaContextCallback, new CallbackData{waitingTaskHolder_, device}, 0)); + void ContextHolderHelper::enqueueCallback(cudaStream_t stream) { + cudaCheck(cudaStreamAddCallback(stream, cudaContextCallback, new CallbackData{waitingTaskHolder_, device_}, 0)); } } // namespace impl //////////////////// - void AcquireContext::commit() { - holderHelper_.enqueueCallback(device(), stream()); - contextState_.set(device(), streamPtr()); - } + void AcquireContext::commit() { holderHelper_.enqueueCallback(stream()); } //////////////////// @@ -99,5 +96,5 @@ namespace cms::cuda { //////////////////// - void TaskContext::commit() { holderHelper_.enqueueCallback(device(), stream()); } + void TaskContext::commit() { holderHelper_.enqueueCallback(stream()); } } // namespace cms::cuda diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h index b85d29774..73139d13c 100644 --- a/src/cudadev/CUDACore/Context.h +++ b/src/cudadev/CUDACore/Context.h @@ -6,7 +6,6 @@ #include "Framework/Event.h" #include "Framework/EDGetToken.h" #include "Framework/EDPutToken.h" -#include "CUDACore/ContextState.h" #include "CUDACore/EventCache.h" #include "CUDACore/SharedEventPtr.h" #include "CUDACore/SharedStreamPtr.h" @@ -49,6 +48,9 @@ namespace cms::cuda { // really matter between modules (or across TBB tasks). explicit Context(edm::StreamID streamID); + explicit Context(int device); + + // meant only for testing explicit Context(int device, SharedStreamPtr stream); bool isInitialized() const { return bool(stream_); } @@ -87,20 +89,21 @@ namespace cms::cuda { class ContextHolderHelper { public: - ContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : waitingTaskHolder_{std::move(waitingTaskHolder)} {} + ContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device) + : waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {} template - void pushNextTask(F&& f, ContextState const* state); + void pushNextTask(F&& f); void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { waitingTaskHolder_ = std::move(waitingTaskHolder); } - void enqueueCallback(int device, cudaStream_t stream); + void enqueueCallback(cudaStream_t stream); private: edm::WaitingTaskWithArenaHolder waitingTaskHolder_; + int device_; }; } // namespace impl @@ -113,15 +116,13 @@ namespace cms::cuda { */ class AcquireContext : public impl::ContextGetterBase { public: - explicit AcquireContext(edm::StreamID streamID, - edm::WaitingTaskWithArenaHolder waitingTaskHolder, - ContextState& state) - : ContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{state} {} + explicit AcquireContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder), device()} {} ~AcquireContext() = default; template void pushNextTask(F&& f) { - holderHelper_.pushNextTask(std::forward(f), contextState_); + holderHelper_.pushNextTask(std::forward(f)); } void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { @@ -133,7 +134,6 @@ namespace cms::cuda { private: impl::ContextHolderHelper holderHelper_; - ContextState& contextState_; }; /** @@ -144,12 +144,8 @@ namespace cms::cuda { */ class ProduceContext : public impl::ContextGetterBase { public: - /// Constructor to create a new CUDA stream (non-ExternalWork module) explicit ProduceContext(edm::StreamID streamID) : ContextGetterBase(streamID) {} - /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit ProduceContext(ContextState& state) : ContextGetterBase(state.device(), state.releaseStreamPtr()) {} - ~ProduceContext() = default; template @@ -183,16 +179,14 @@ namespace cms::cuda { class TaskContext : public impl::Context { public: /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit TaskContext(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : Context(state->device(), state->streamPtr()), // don't move, state is re-used afterwards - holderHelper_{std::move(waitingTaskHolder)}, - contextState_{state} {} + explicit TaskContext(int device, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : Context(device), holderHelper_{std::move(waitingTaskHolder), device} {} ~TaskContext() = default; template void pushNextTask(F&& f) { - holderHelper_.pushNextTask(std::forward(f), contextState_); + holderHelper_.pushNextTask(std::forward(f)); } void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { @@ -204,7 +198,6 @@ namespace cms::cuda { private: impl::ContextHolderHelper holderHelper_; - ContextState const* contextState_; }; /** @@ -221,19 +214,19 @@ namespace cms::cuda { namespace impl { template - void ContextHolderHelper::pushNextTask(F&& f, ContextState const* state) { - replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ - edm::make_waiting_task_with_holder(tbb::task::allocate_root(), - std::move(waitingTaskHolder_), - [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { - func(TaskContext{state, std::move(h)}); - })}); + void ContextHolderHelper::pushNextTask(F&& f) { + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{edm::make_waiting_task_with_holder( + tbb::task::allocate_root(), + std::move(waitingTaskHolder_), + [device = device_, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(TaskContext{device, std::move(h)}); + })}); } } // namespace impl template - void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, ContextState& state, F func) { - AcquireContext context(streamID, std::move(holder), state); + void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, F func) { + AcquireContext context(streamID, std::move(holder)); func(context); context.commit(); } @@ -244,13 +237,6 @@ namespace cms::cuda { func(context); context.commit(); } - - template - void runProduce(ContextState& state, F func) { - ProduceContext context(state); - func(context); - context.commit(); - } } // namespace cms::cuda #endif diff --git a/src/cudadev/CUDACore/EDProducer.cc b/src/cudadev/CUDACore/EDProducer.cc index d41d77c31..2e08ea2d2 100644 --- a/src/cudadev/CUDACore/EDProducer.cc +++ b/src/cudadev/CUDACore/EDProducer.cc @@ -8,10 +8,10 @@ namespace cms::cuda { void SynchronizingEDProducer::acquire(edm::Event const& event, edm::EventSetup const& eventSetup, edm::WaitingTaskWithArenaHolder holder) { - runAcquire(event.streamID(), std::move(holder), state_, [&](auto& ctx) { acquire(event, eventSetup, ctx); }); + runAcquire(event.streamID(), std::move(holder), [&](auto& ctx) { acquire(event, eventSetup, ctx); }); } void SynchronizingEDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) { - runProduce(state_, [&](auto& ctx) { produce(event, eventSetup, ctx); }); + runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); }); } } // namespace cms::cuda diff --git a/src/cudadev/CUDACore/EDProducer.h b/src/cudadev/CUDACore/EDProducer.h index 3fc917285..6fe6962bd 100644 --- a/src/cudadev/CUDACore/EDProducer.h +++ b/src/cudadev/CUDACore/EDProducer.h @@ -20,9 +20,6 @@ namespace cms::cuda { virtual void acquire(edm::Event const& event, edm::EventSetup const& eventSetup, AcquireContext& context) = 0; virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0; - - private: - ContextState state_; }; } // namespace cms::cuda diff --git a/src/cudadev/Framework/Event.h b/src/cudadev/Framework/Event.h index 9f952c492..68a04e729 100644 --- a/src/cudadev/Framework/Event.h +++ b/src/cudadev/Framework/Event.h @@ -9,7 +9,32 @@ // type erasure namespace edm { - using StreamID = int; + class Event; + + class StreamID { + public: + ~StreamID() = default; + StreamID() = delete; + StreamID(const StreamID&) = default; + StreamID& operator=(const StreamID&) = default; + + bool operator==(const StreamID& iID) const { return iID.value_ == value_; } + + operator unsigned int() const { return value_; } + + /** \return value ranging from 0 to one less than max number of streams. + */ + unsigned int value() const { return value_; } + + static StreamID invalidStreamID() { return StreamID(0xFFFFFFFFU); } + + private: + ///Only a Event is allowed to create one of these + friend class Event; + explicit StreamID(unsigned int iValue) : value_(iValue) {} + + unsigned int value_; + }; class WrapperBase { public: From 8b3187ad32d15045382674ead24dfd54776ddba8 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 2 Sep 2021 06:24:17 -0700 Subject: [PATCH 05/12] Remove ScopedContext as obsolete --- src/cudadev/CUDACore/Product.h | 5 +- src/cudadev/CUDACore/ProductBase.h | 9 +- src/cudadev/CUDACore/ScopedContext.cc | 116 ------------- src/cudadev/CUDACore/ScopedContext.h | 241 -------------------------- 4 files changed, 4 insertions(+), 367 deletions(-) delete mode 100644 src/cudadev/CUDACore/ScopedContext.cc delete mode 100644 src/cudadev/CUDACore/ScopedContext.h diff --git a/src/cudadev/CUDACore/Product.h b/src/cudadev/CUDACore/Product.h index 5d9cab481..ae4123d79 100644 --- a/src/cudadev/CUDACore/Product.h +++ b/src/cudadev/CUDACore/Product.h @@ -13,7 +13,6 @@ namespace edm { namespace cms { namespace cuda { namespace impl { - class ScopedContextGetterBase; class ContextGetterBase; } // namespace impl @@ -21,7 +20,7 @@ namespace cms { * The purpose of this class is to wrap CUDA data to edm::Event in a * way which forces correct use of various utilities. * - * The non-default construction has to be done with cms::cuda::ScopedContext + * The non-default construction has to be done with cms::cuda::Context * (in order to properly register the CUDA event). * * The default constructor is needed only for the ROOT dictionary generation. @@ -42,8 +41,6 @@ namespace cms { Product& operator=(Product&&) = default; private: - friend class impl::ScopedContextGetterBase; - friend class ScopedContextProduce; friend class impl::ContextGetterBase; friend class ProduceContext; friend class edm::Wrapper>; diff --git a/src/cudadev/CUDACore/ProductBase.h b/src/cudadev/CUDACore/ProductBase.h index 00d12b3fb..47a7d7a93 100644 --- a/src/cudadev/CUDACore/ProductBase.h +++ b/src/cudadev/CUDACore/ProductBase.h @@ -10,7 +10,6 @@ namespace cms { namespace cuda { namespace impl { - class ScopedContextBase; class Context; } // namespace impl @@ -44,13 +43,13 @@ namespace cms { int device() const { return device_; } // cudaStream_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the cms::cuda::ScopedContext itself + // mutable access is needed even if the cms::cuda::Context itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. cudaStream_t stream() const { return stream_.get(); } // cudaEvent_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the cms::cuda::ScopedContext itself + // mutable access is needed even if the cms::cuda::Context itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. cudaEvent_t event() const { return event_.get(); } @@ -60,12 +59,10 @@ namespace cms { : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} private: - friend class impl::ScopedContextBase; - friend class ScopedContextProduce; friend class impl::Context; friend class ProduceContext; - // The following function is intended to be used only from ScopedContext + // The following function is intended to be used only from Context const SharedStreamPtr& streamPtr() const { return stream_; } bool mayReuseStream() const { diff --git a/src/cudadev/CUDACore/ScopedContext.cc b/src/cudadev/CUDACore/ScopedContext.cc deleted file mode 100644 index 14bff04eb..000000000 --- a/src/cudadev/CUDACore/ScopedContext.cc +++ /dev/null @@ -1,116 +0,0 @@ -#include "CUDACore/ScopedContext.h" - -#include "CUDACore/StreamCache.h" -#include "CUDACore/cudaCheck.h" - -#include "chooseDevice.h" - -namespace { - struct CallbackData { - edm::WaitingTaskWithArenaHolder holder; - int device; - }; - - void CUDART_CB cudaScopedContextCallback(cudaStream_t streamId, cudaError_t status, void* data) { - std::unique_ptr guard{reinterpret_cast(data)}; - edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; - int device = guard->device; - if (status == cudaSuccess) { - //std::cout << " GPU kernel finished (in callback) device " << device << " CUDA stream " - // << streamId << std::endl; - waitingTaskHolder.doneWaiting(nullptr); - } else { - // wrap the exception in a try-catch block to let GDB "catch throw" break on it - try { - auto error = cudaGetErrorName(status); - auto message = cudaGetErrorString(status); - throw std::runtime_error("Callback of CUDA stream " + - std::to_string(reinterpret_cast(streamId)) + " in device " + - std::to_string(device) + " error " + std::string(error) + ": " + std::string(message)); - } catch (std::exception&) { - waitingTaskHolder.doneWaiting(std::current_exception()); - } - } - } -} // namespace - -namespace cms::cuda { - namespace impl { - ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) { - cudaCheck(cudaSetDevice(currentDevice_)); - stream_ = getStreamCache().get(); - } - - ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) { - cudaCheck(cudaSetDevice(currentDevice_)); - if (data.mayReuseStream()) { - stream_ = data.streamPtr(); - } else { - stream_ = getStreamCache().get(); - } - } - - ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream) - : currentDevice_(device), stream_(std::move(stream)) { - cudaCheck(cudaSetDevice(currentDevice_)); - } - - //////////////////// - - void ScopedContextGetterBase::synchronizeStreams(int dataDevice, - cudaStream_t dataStream, - bool available, - cudaEvent_t dataEvent) { - if (dataDevice != device()) { - // Eventually replace with prefetch to current device (assuming unified memory works) - // If we won't go to unified memory, need to figure out something else... - throw std::runtime_error("Handling data from multiple devices is not yet supported"); - } - - if (dataStream != stream()) { - // Different streams, need to synchronize - if (not available) { - // Event not yet occurred, so need to add synchronization - // here. Sychronization is done by making the CUDA stream to - // wait for an event, so all subsequent work in the stream - // will run only after the event has "occurred" (i.e. data - // product became available). - cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event"); - } - } - } - - void ScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) { - cudaCheck( - cudaStreamAddCallback(stream, cudaScopedContextCallback, new CallbackData{waitingTaskHolder_, device}, 0)); - } - } // namespace impl - - //////////////////// - - ScopedContextAcquire::~ScopedContextAcquire() { - holderHelper_.enqueueCallback(device(), stream()); - if (contextState_) { - contextState_->set(device(), streamPtr()); - } - } - - void ScopedContextAcquire::throwNoState() { - throw std::runtime_error( - "Calling ScopedContextAcquire::insertNextTask() requires ScopedContextAcquire to be constructed with " - "ContextState, but that was not the case"); - } - - //////////////////// - - ScopedContextProduce::~ScopedContextProduce() { - // Intentionally not checking the return value to avoid throwing - // exceptions. If this call would fail, we should get failures - // elsewhere as well. - cudaEventRecord(event_.get(), stream()); - } - - //////////////////// - - ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); } -} // namespace cms::cuda diff --git a/src/cudadev/CUDACore/ScopedContext.h b/src/cudadev/CUDACore/ScopedContext.h deleted file mode 100644 index 4f6669883..000000000 --- a/src/cudadev/CUDACore/ScopedContext.h +++ /dev/null @@ -1,241 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_ScopedContext_h -#define HeterogeneousCore_CUDACore_ScopedContext_h - -#include - -#include "CUDACore/Product.h" -#include "Framework/WaitingTaskWithArenaHolder.h" -#include "Framework/Event.h" -#include "Framework/EDGetToken.h" -#include "Framework/EDPutToken.h" -#include "CUDACore/ContextState.h" -#include "CUDACore/EventCache.h" -#include "CUDACore/SharedEventPtr.h" -#include "CUDACore/SharedStreamPtr.h" - -namespace cms { - namespace cudatest { - class TestScopedContext; - } - - namespace cuda { - - namespace impl { - // This class is intended to be derived by other ScopedContext*, not for general use - class ScopedContextBase { - public: - int device() const { return currentDevice_; } - - // cudaStream_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the ScopedContext itself - // would be const. Therefore it is ok to return a non-const - // pointer from a const method here. - cudaStream_t stream() const { return stream_.get(); } - const SharedStreamPtr& streamPtr() const { return stream_; } - - protected: - // The constructors set the current device, but the device - // is not set back to the previous value at the destructor. This - // should be sufficient (and tiny bit faster) as all CUDA API - // functions relying on the current device should be called from - // the scope where this context is. The current device doesn't - // really matter between modules (or across TBB tasks). - explicit ScopedContextBase(edm::StreamID streamID); - - explicit ScopedContextBase(const ProductBase& data); - - explicit ScopedContextBase(int device, SharedStreamPtr stream); - - private: - int currentDevice_; - SharedStreamPtr stream_; - }; - - class ScopedContextGetterBase : public ScopedContextBase { - public: - template - const T& get(const Product& data) { - synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); - return data.data_; - } - - template - const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { - return get(iEvent.get(token)); - } - - protected: - template - ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward(args)...) {} - - void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); - }; - - class ScopedContextHolderHelper { - public: - ScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : waitingTaskHolder_{std::move(waitingTaskHolder)} {} - - template - void pushNextTask(F&& f, ContextState const* state); - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - waitingTaskHolder_ = std::move(waitingTaskHolder); - } - - void enqueueCallback(int device, cudaStream_t stream); - - private: - edm::WaitingTaskWithArenaHolder waitingTaskHolder_; - }; - } // namespace impl - - /** - * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): - * - setting the current device - * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. - */ - class ScopedContextAcquire : public impl::ScopedContextGetterBase { - public: - /// Constructor to create a new CUDA stream (no need for context beyond acquire()) - explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {} - - /// Constructor to create a new CUDA stream, and the context is needed after acquire() - explicit ScopedContextAcquire(edm::StreamID streamID, - edm::WaitingTaskWithArenaHolder waitingTaskHolder, - ContextState& state) - : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} - - /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire()) - explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {} - - /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire() - explicit ScopedContextAcquire(const ProductBase& data, - edm::WaitingTaskWithArenaHolder waitingTaskHolder, - ContextState& state) - : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} - - ~ScopedContextAcquire(); - - template - void pushNextTask(F&& f) { - if (contextState_ == nullptr) - throwNoState(); - holderHelper_.pushNextTask(std::forward(f), contextState_); - } - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); - } - - private: - void throwNoState(); - - impl::ScopedContextHolderHelper holderHelper_; - ContextState* contextState_ = nullptr; - }; - - /** - * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): - * - setting the current device - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. - */ - class ScopedContextProduce : public impl::ScopedContextGetterBase { - public: - /// Constructor to create a new CUDA stream (non-ExternalWork module) - explicit ScopedContextProduce(edm::StreamID streamID) : ScopedContextGetterBase(streamID) {} - - /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module) - explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {} - - /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit ScopedContextProduce(ContextState& state) - : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} - - /// Record the CUDA event, all asynchronous work must have been queued before the destructor - ~ScopedContextProduce(); - - template - std::unique_ptr> wrap(T data) { - // make_unique doesn't work because of private constructor - return std::unique_ptr>(new Product(device(), streamPtr(), event_, std::move(data))); - } - - template - auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { - return iEvent.emplace(token, device(), streamPtr(), event_, std::forward(args)...); - } - - private: - friend class cudatest::TestScopedContext; - - // This construcor is only meant for testing - explicit ScopedContextProduce(int device, SharedStreamPtr stream, SharedEventPtr event) - : ScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {} - - // create the CUDA Event upfront to catch possible errors from its creation - SharedEventPtr event_ = getEventCache().get(); - }; - - /** - * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire(): - * - setting the current device - * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary - * and enforce that those get done in a proper way in RAII fashion. - */ - class ScopedContextTask : public impl::ScopedContextBase { - public: - /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ScopedContextBase(state->device(), state->streamPtr()), // don't move, state is re-used afterwards - holderHelper_{std::move(waitingTaskHolder)}, - contextState_{state} {} - - ~ScopedContextTask(); - - template - void pushNextTask(F&& f) { - holderHelper_.pushNextTask(std::forward(f), contextState_); - } - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); - } - - private: - impl::ScopedContextHolderHelper holderHelper_; - ContextState const* contextState_; - }; - - /** - * The aim of this class is to do necessary per-event "initialization" in analyze() - * - setting the current device - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. - */ - class ScopedContextAnalyze : public impl::ScopedContextGetterBase { - public: - /// Constructor to (possibly) re-use a CUDA stream - explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {} - }; - - namespace impl { - template - void ScopedContextHolderHelper::pushNextTask(F&& f, ContextState const* state) { - replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ - edm::make_waiting_task_with_holder(tbb::task::allocate_root(), - std::move(waitingTaskHolder_), - [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { - func(ScopedContextTask{state, std::move(h)}); - })}); - } - } // namespace impl - } // namespace cuda -} // namespace cms - -#endif From 4cd8dab1134029e269ee0111e11f377210e6a171 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 2 Sep 2021 08:39:44 -0700 Subject: [PATCH 06/12] Share the CUDA stream between Event products from the same EDProducer in better-defined way --- src/cudadev/CUDACore/Context.cc | 11 ++-- src/cudadev/CUDACore/Context.h | 17 +++++-- src/cudadev/CUDACore/Product.h | 7 ++- src/cudadev/CUDACore/ProductBase.h | 81 +++++++++++++++++------------- 4 files changed, 71 insertions(+), 45 deletions(-) diff --git a/src/cudadev/CUDACore/Context.cc b/src/cudadev/CUDACore/Context.cc index 80452494d..b34041e53 100644 --- a/src/cudadev/CUDACore/Context.cc +++ b/src/cudadev/CUDACore/Context.cc @@ -40,18 +40,21 @@ namespace cms::cuda { Context::Context(int device) : currentDevice_(device) { cudaCheck(cudaSetDevice(currentDevice_)); } - Context::Context(int device, SharedStreamPtr stream) : currentDevice_(device), stream_(std::move(stream)) { + Context::Context(int device, SharedStreamPtr stream) + : currentDevice_(device), stream_(std::make_shared(std::move(stream))) { cudaCheck(cudaSetDevice(currentDevice_)); } - void Context::initialize() { stream_ = getStreamCache().get(); } + void Context::initialize() { stream_ = std::make_shared(getStreamCache().get()); } void Context::initialize(const ProductBase& data) { + SharedStreamPtr stream; if (data.mayReuseStream()) { - stream_ = data.streamPtr(); + stream = data.streamPtr(); } else { - stream_ = getStreamCache().get(); + stream = getStreamCache().get(); } + stream_ = std::make_shared(std::move(stream)); } //////////////////// diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h index 73139d13c..d2b7f6343 100644 --- a/src/cudadev/CUDACore/Context.h +++ b/src/cudadev/CUDACore/Context.h @@ -30,13 +30,13 @@ namespace cms::cuda { if (not isInitialized()) { initialize(); } - return stream_.get(); + return stream_->streamPtr().get(); } const SharedStreamPtr& streamPtr() { if (not isInitialized()) { initialize(); } - return stream_; + return stream_->streamPtr(); } protected: @@ -58,9 +58,16 @@ namespace cms::cuda { void initialize(); void initialize(const ProductBase& data); + const std::shared_ptr& streamSharingHelper() { + if (not isInitialized()) { + initialize(); + } + return stream_; + } + private: int currentDevice_ = -1; - SharedStreamPtr stream_; + std::shared_ptr stream_; }; class ContextGetterBase : public Context { @@ -151,12 +158,12 @@ namespace cms::cuda { template std::unique_ptr> wrap(T data) { // make_unique doesn't work because of private constructor - return std::unique_ptr>(new Product(device(), streamPtr(), event_, std::move(data))); + return std::unique_ptr>(new Product(device(), streamSharingHelper(), event_, std::move(data))); } template auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { - return iEvent.emplace(token, device(), streamPtr(), event_, std::forward(args)...); + return iEvent.emplace(token, device(), streamSharingHelper(), event_, std::forward(args)...); } // internal API diff --git a/src/cudadev/CUDACore/Product.h b/src/cudadev/CUDACore/Product.h index ae4123d79..759079003 100644 --- a/src/cudadev/CUDACore/Product.h +++ b/src/cudadev/CUDACore/Product.h @@ -45,11 +45,14 @@ namespace cms { friend class ProduceContext; friend class edm::Wrapper>; - explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data) + explicit Product(int device, std::shared_ptr stream, SharedEventPtr event, T data) : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {} template - explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args) + explicit Product(int device, + std::shared_ptr stream, + SharedEventPtr event, + Args&&... args) : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward(args)...) {} T data_; //! diff --git a/src/cudadev/CUDACore/ProductBase.h b/src/cudadev/CUDACore/ProductBase.h index 47a7d7a93..e2a367341 100644 --- a/src/cudadev/CUDACore/ProductBase.h +++ b/src/cudadev/CUDACore/ProductBase.h @@ -11,6 +11,40 @@ namespace cms { namespace cuda { namespace impl { class Context; + + /** + * The CUDA stream is shared between all the Event products of + * the EDProducer. If the stream gets re-used, only one consumer + * of all the products should be allowed to use the stream. An + * objects of this class is shared between such Event products + * and takes care of letting only those consumers get the stream. + */ + class StreamSharingHelper { + public: + explicit StreamSharingHelper(SharedStreamPtr stream) : stream_(std::move(stream)) {} + StreamSharingHelper(const StreamSharingHelper&) = delete; + StreamSharingHelper& operator=(const StreamSharingHelper&) = delete; + StreamSharingHelper(StreamSharingHelper&&) = delete; + StreamSharingHelper& operator=(StreamSharingHelper) = delete; + + const SharedStreamPtr& streamPtr() const { return stream_; } + + bool mayReuseStream() const { + bool expected = true; + bool changed = mayReuseStream_.compare_exchange_strong(expected, false); + // If the current thread is the one flipping the flag, it may + // reuse the stream. + return changed; + } + + private: + SharedStreamPtr stream_; + + // This flag tells whether the CUDA stream may be reused by a + // consumer or not. The goal is to have a "chain" of modules to + // queue their work to the same stream. + mutable std::atomic mayReuseStream_ = true; + }; } // namespace impl /** @@ -24,20 +58,9 @@ namespace cms { ProductBase(const ProductBase&) = delete; ProductBase& operator=(const ProductBase&) = delete; - ProductBase(ProductBase&& other) - : stream_{std::move(other.stream_)}, - event_{std::move(other.event_)}, - mayReuseStream_{other.mayReuseStream_.load()}, - device_{other.device_} {} - ProductBase& operator=(ProductBase&& other) { - stream_ = std::move(other.stream_); - event_ = std::move(other.event_); - mayReuseStream_ = other.mayReuseStream_.load(); - device_ = other.device_; - return *this; - } - - bool isValid() const { return stream_.get() != nullptr; } + ProductBase(ProductBase&& other) = default; + ProductBase& operator=(ProductBase&& other) = default; + bool isAvailable() const; int device() const { return device_; } @@ -46,7 +69,7 @@ namespace cms { // mutable access is needed even if the cms::cuda::Context itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. - cudaStream_t stream() const { return stream_.get(); } + cudaStream_t stream() const { return stream_->streamPtr().get(); } // cudaEvent_t is a pointer to a thread-safe object, for which a // mutable access is needed even if the cms::cuda::Context itself @@ -55,7 +78,7 @@ namespace cms { cudaEvent_t event() const { return event_.get(); } protected: - explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event) + explicit ProductBase(int device, std::shared_ptr stream, SharedEventPtr event) : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} private: @@ -63,27 +86,17 @@ namespace cms { friend class ProduceContext; // The following function is intended to be used only from Context - const SharedStreamPtr& streamPtr() const { return stream_; } - - bool mayReuseStream() const { - bool expected = true; - bool changed = mayReuseStream_.compare_exchange_strong(expected, false); - // If the current thread is the one flipping the flag, it may - // reuse the stream. - return changed; - } - - // The cudaStream_t is really shared among edm::Event products, so - // using shared_ptr also here - SharedStreamPtr stream_; //! + const SharedStreamPtr& streamPtr() const { return stream_->streamPtr(); } + + bool mayReuseStream() const { return stream_->mayReuseStream(); } + + // Helper shared between all cms::cuda::Product event + // products of an EDProducer + std::shared_ptr stream_; //! + // shared_ptr because of caching in cms::cuda::EventCache SharedEventPtr event_; //! - // This flag tells whether the CUDA stream may be reused by a - // consumer or not. The goal is to have a "chain" of modules to - // queue their work to the same stream. - mutable std::atomic mayReuseStream_ = true; //! - // The CUDA device associated with this product int device_ = -1; //! }; From 55effecbf227aa45b46637ad551efe8b1b842375 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 8 Sep 2021 18:23:39 -0700 Subject: [PATCH 07/12] Split Context.h into one class per file --- src/cudadev/CUDACore/AcquireContext.cc | 5 + src/cudadev/CUDACore/AcquireContext.h | 45 ++++ src/cudadev/CUDACore/AnalyzeContext.h | 20 ++ src/cudadev/CUDACore/Context.cc | 103 -------- src/cudadev/CUDACore/Context.h | 248 +----------------- src/cudadev/CUDACore/EDGetterContextBase.cc | 27 ++ src/cudadev/CUDACore/EDGetterContextBase.h | 38 +++ src/cudadev/CUDACore/FwkContextBase.cc | 28 ++ src/cudadev/CUDACore/FwkContextBase.h | 66 +++++ src/cudadev/CUDACore/FwkContextHolderHelper.h | 25 ++ src/cudadev/CUDACore/ProduceContext.cc | 6 + src/cudadev/CUDACore/ProduceContext.h | 52 ++++ src/cudadev/CUDACore/Product.h | 4 +- src/cudadev/CUDACore/ProductBase.h | 8 +- src/cudadev/CUDACore/TaskContext.cc | 41 +++ src/cudadev/CUDACore/TaskContext.h | 71 +++++ 16 files changed, 433 insertions(+), 354 deletions(-) create mode 100644 src/cudadev/CUDACore/AcquireContext.cc create mode 100644 src/cudadev/CUDACore/AcquireContext.h create mode 100644 src/cudadev/CUDACore/AnalyzeContext.h delete mode 100644 src/cudadev/CUDACore/Context.cc create mode 100644 src/cudadev/CUDACore/EDGetterContextBase.cc create mode 100644 src/cudadev/CUDACore/EDGetterContextBase.h create mode 100644 src/cudadev/CUDACore/FwkContextBase.cc create mode 100644 src/cudadev/CUDACore/FwkContextBase.h create mode 100644 src/cudadev/CUDACore/FwkContextHolderHelper.h create mode 100644 src/cudadev/CUDACore/ProduceContext.cc create mode 100644 src/cudadev/CUDACore/ProduceContext.h create mode 100644 src/cudadev/CUDACore/TaskContext.cc create mode 100644 src/cudadev/CUDACore/TaskContext.h diff --git a/src/cudadev/CUDACore/AcquireContext.cc b/src/cudadev/CUDACore/AcquireContext.cc new file mode 100644 index 000000000..dd6e8d1de --- /dev/null +++ b/src/cudadev/CUDACore/AcquireContext.cc @@ -0,0 +1,5 @@ +#include "CUDACore/AcquireContext.h" + +namespace cms::cuda { + void AcquireContext::commit() { holderHelper_.enqueueCallback(stream()); } +} diff --git a/src/cudadev/CUDACore/AcquireContext.h b/src/cudadev/CUDACore/AcquireContext.h new file mode 100644 index 000000000..409320abe --- /dev/null +++ b/src/cudadev/CUDACore/AcquireContext.h @@ -0,0 +1,45 @@ +#ifndef HeterogeneousCore_CUDACore_AcquireContext_h +#define HeterogeneousCore_CUDACore_AcquireContext_h + +#include "CUDACore/EDGetterContextBase.h" +#include "CUDACore/TaskContext.h" + +namespace cms::cuda { + /** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + * - synchronizing between CUDA streams if necessary + * Users should not, however, construct it explicitly. + */ + class AcquireContext : public impl::EDGetterContextBase { + public: + explicit AcquireContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : EDGetterContextBase(streamID), holderHelper_{std::move(waitingTaskHolder), device()} {} + ~AcquireContext() = default; + + template + void pushNextTask(F&& f) { + holderHelper_.pushNextTask(std::forward(f)); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + + // internal API + void commit(); + + private: + impl::FwkContextHolderHelper holderHelper_; + }; + + template + void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, F func) { + AcquireContext context(streamID, std::move(holder)); + func(context); + context.commit(); + } +} + +#endif diff --git a/src/cudadev/CUDACore/AnalyzeContext.h b/src/cudadev/CUDACore/AnalyzeContext.h new file mode 100644 index 000000000..cbe9a66dd --- /dev/null +++ b/src/cudadev/CUDACore/AnalyzeContext.h @@ -0,0 +1,20 @@ +#ifndef HeterogeneousCore_CUDACore_AnalyzeContext_h +#define HeterogeneousCore_CUDACore_AnalyzeContext_h + +#include "CUDACore/EDGetterContextBase.h" + +namespace cms::cuda { + /** + * The aim of this class is to do necessary per-event "initialization" in analyze() + * - setting the current device + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. + */ + class AnalyzeContext : public impl::EDGetterContextBase { + public: + /// Constructor to (possibly) re-use a CUDA stream + explicit AnalyzeContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {} + }; +} + +#endif diff --git a/src/cudadev/CUDACore/Context.cc b/src/cudadev/CUDACore/Context.cc deleted file mode 100644 index b34041e53..000000000 --- a/src/cudadev/CUDACore/Context.cc +++ /dev/null @@ -1,103 +0,0 @@ -#include "CUDACore/Context.h" - -#include "CUDACore/StreamCache.h" -#include "CUDACore/cudaCheck.h" - -#include "chooseDevice.h" - -namespace { - struct CallbackData { - edm::WaitingTaskWithArenaHolder holder; - int device; - }; - - void CUDART_CB cudaContextCallback(cudaStream_t streamId, cudaError_t status, void* data) { - std::unique_ptr guard{reinterpret_cast(data)}; - edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; - int device = guard->device; - if (status == cudaSuccess) { - //std::cout << " GPU kernel finished (in callback) device " << device << " CUDA stream " - // << streamId << std::endl; - waitingTaskHolder.doneWaiting(nullptr); - } else { - // wrap the exception in a try-catch block to let GDB "catch throw" break on it - try { - auto error = cudaGetErrorName(status); - auto message = cudaGetErrorString(status); - throw std::runtime_error("Callback of CUDA stream " + - std::to_string(reinterpret_cast(streamId)) + " in device " + - std::to_string(device) + " error " + std::string(error) + ": " + std::string(message)); - } catch (std::exception&) { - waitingTaskHolder.doneWaiting(std::current_exception()); - } - } - } -} // namespace - -namespace cms::cuda { - namespace impl { - Context::Context(edm::StreamID streamID) : Context(chooseDevice(streamID)) {} - - Context::Context(int device) : currentDevice_(device) { cudaCheck(cudaSetDevice(currentDevice_)); } - - Context::Context(int device, SharedStreamPtr stream) - : currentDevice_(device), stream_(std::make_shared(std::move(stream))) { - cudaCheck(cudaSetDevice(currentDevice_)); - } - - void Context::initialize() { stream_ = std::make_shared(getStreamCache().get()); } - - void Context::initialize(const ProductBase& data) { - SharedStreamPtr stream; - if (data.mayReuseStream()) { - stream = data.streamPtr(); - } else { - stream = getStreamCache().get(); - } - stream_ = std::make_shared(std::move(stream)); - } - - //////////////////// - - void ContextGetterBase::synchronizeStreams(int dataDevice, - cudaStream_t dataStream, - bool available, - cudaEvent_t dataEvent) { - if (dataDevice != device()) { - // Eventually replace with prefetch to current device (assuming unified memory works) - // If we won't go to unified memory, need to figure out something else... - throw std::runtime_error("Handling data from multiple devices is not yet supported"); - } - - if (dataStream != stream()) { - // Different streams, need to synchronize - if (not available) { - // Event not yet occurred, so need to add synchronization - // here. Sychronization is done by making the CUDA stream to - // wait for an event, so all subsequent work in the stream - // will run only after the event has "occurred" (i.e. data - // product became available). - cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event"); - } - } - } - - //////////////////// - - void ContextHolderHelper::enqueueCallback(cudaStream_t stream) { - cudaCheck(cudaStreamAddCallback(stream, cudaContextCallback, new CallbackData{waitingTaskHolder_, device_}, 0)); - } - } // namespace impl - - //////////////////// - - void AcquireContext::commit() { holderHelper_.enqueueCallback(stream()); } - - //////////////////// - - void ProduceContext::commit() { cudaCheck(cudaEventRecord(event_.get(), stream())); } - - //////////////////// - - void TaskContext::commit() { holderHelper_.enqueueCallback(stream()); } -} // namespace cms::cuda diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h index d2b7f6343..c55fc59fb 100644 --- a/src/cudadev/CUDACore/Context.h +++ b/src/cudadev/CUDACore/Context.h @@ -1,249 +1,7 @@ #ifndef HeterogeneousCore_CUDACore_Context_h #define HeterogeneousCore_CUDACore_Context_h -#include "CUDACore/Product.h" -#include "Framework/WaitingTaskWithArenaHolder.h" -#include "Framework/Event.h" -#include "Framework/EDGetToken.h" -#include "Framework/EDPutToken.h" -#include "CUDACore/EventCache.h" -#include "CUDACore/SharedEventPtr.h" -#include "CUDACore/SharedStreamPtr.h" - -namespace cms::cuda { - namespace impl { - // This class is intended to be derived by other Context*, not for general use - class Context { - public: - Context(Context const&) = delete; - Context& operator=(Context const&) = delete; - Context(Context&&) = delete; - Context& operator=(Context&&) = delete; - - int device() const { return currentDevice_; } - - // cudaStream_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the Context itself - // would be const. Therefore it is ok to return a non-const - // pointer from a const method here. - cudaStream_t stream() { - if (not isInitialized()) { - initialize(); - } - return stream_->streamPtr().get(); - } - const SharedStreamPtr& streamPtr() { - if (not isInitialized()) { - initialize(); - } - return stream_->streamPtr(); - } - - protected: - // The constructors set the current device, but the device - // is not set back to the previous value at the destructor. This - // should be sufficient (and tiny bit faster) as all CUDA API - // functions relying on the current device should be called from - // the scope where this context is. The current device doesn't - // really matter between modules (or across TBB tasks). - explicit Context(edm::StreamID streamID); - - explicit Context(int device); - - // meant only for testing - explicit Context(int device, SharedStreamPtr stream); - - bool isInitialized() const { return bool(stream_); } - - void initialize(); - void initialize(const ProductBase& data); - - const std::shared_ptr& streamSharingHelper() { - if (not isInitialized()) { - initialize(); - } - return stream_; - } - - private: - int currentDevice_ = -1; - std::shared_ptr stream_; - }; - - class ContextGetterBase : public Context { - public: - template - const T& get(const Product& data) { - if (not isInitialized()) { - initialize(data); - } - synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); - return data.data_; - } - - template - const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { - return get(iEvent.get(token)); - } - - protected: - template - ContextGetterBase(Args&&... args) : Context(std::forward(args)...) {} - - private: - void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); - }; - - class ContextHolderHelper { - public: - ContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device) - : waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {} - - template - void pushNextTask(F&& f); - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - waitingTaskHolder_ = std::move(waitingTaskHolder); - } - - void enqueueCallback(cudaStream_t stream); - - private: - edm::WaitingTaskWithArenaHolder waitingTaskHolder_; - int device_; - }; - } // namespace impl - - /** - * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): - * - setting the current device - * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary - * - synchronizing between CUDA streams if necessary - * Users should not, however, construct it explicitly. - */ - class AcquireContext : public impl::ContextGetterBase { - public: - explicit AcquireContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder), device()} {} - ~AcquireContext() = default; - - template - void pushNextTask(F&& f) { - holderHelper_.pushNextTask(std::forward(f)); - } - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); - } - - // internal API - void commit(); - - private: - impl::ContextHolderHelper holderHelper_; - }; - - /** - * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): - * - setting the current device - * - synchronizing between CUDA streams if necessary - * Users should not, however, construct it explicitly. - */ - class ProduceContext : public impl::ContextGetterBase { - public: - explicit ProduceContext(edm::StreamID streamID) : ContextGetterBase(streamID) {} - - ~ProduceContext() = default; - - template - std::unique_ptr> wrap(T data) { - // make_unique doesn't work because of private constructor - return std::unique_ptr>(new Product(device(), streamSharingHelper(), event_, std::move(data))); - } - - template - auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { - return iEvent.emplace(token, device(), streamSharingHelper(), event_, std::forward(args)...); - } - - // internal API - void commit(); - - private: - // This construcor is only meant for testing - explicit ProduceContext(int device, SharedStreamPtr stream, SharedEventPtr event) - : ContextGetterBase(device, std::move(stream)), event_{std::move(event)} {} - - // create the CUDA Event upfront to catch possible errors from its creation - SharedEventPtr event_ = getEventCache().get(); - }; - - /** - * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire(): - * - setting the current device - * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary - */ - class TaskContext : public impl::Context { - public: - /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit TaskContext(int device, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : Context(device), holderHelper_{std::move(waitingTaskHolder), device} {} - - ~TaskContext() = default; - - template - void pushNextTask(F&& f) { - holderHelper_.pushNextTask(std::forward(f)); - } - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); - } - - // Internal API - void commit(); - - private: - impl::ContextHolderHelper holderHelper_; - }; - - /** - * The aim of this class is to do necessary per-event "initialization" in analyze() - * - setting the current device - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. - */ - class AnalyzeContext : public impl::ContextGetterBase { - public: - /// Constructor to (possibly) re-use a CUDA stream - explicit AnalyzeContext(edm::StreamID streamID) : ContextGetterBase(streamID) {} - }; - - namespace impl { - template - void ContextHolderHelper::pushNextTask(F&& f) { - replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{edm::make_waiting_task_with_holder( - tbb::task::allocate_root(), - std::move(waitingTaskHolder_), - [device = device_, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { - func(TaskContext{device, std::move(h)}); - })}); - } - } // namespace impl - - template - void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, F func) { - AcquireContext context(streamID, std::move(holder)); - func(context); - context.commit(); - } - - template - void runProduce(edm::StreamID streamID, F func) { - ProduceContext context(streamID); - func(context); - context.commit(); - } -} // namespace cms::cuda - +#include "CUDACore/AcquireContext.h" +#include "CUDACore/ProduceContext.h" +#include "CUDACore/AnalyzeContext.h" #endif diff --git a/src/cudadev/CUDACore/EDGetterContextBase.cc b/src/cudadev/CUDACore/EDGetterContextBase.cc new file mode 100644 index 000000000..cfeccb178 --- /dev/null +++ b/src/cudadev/CUDACore/EDGetterContextBase.cc @@ -0,0 +1,27 @@ +#include "CUDACore/EDGetterContextBase.h" +#include "CUDACore/cudaCheck.h" + +namespace cms::cuda::impl { + void EDGetterContextBase::synchronizeStreams(int dataDevice, + cudaStream_t dataStream, + bool available, + cudaEvent_t dataEvent) { + if (dataDevice != device()) { + // Eventually replace with prefetch to current device (assuming unified memory works) + // If we won't go to unified memory, need to figure out something else... + throw std::runtime_error("Handling data from multiple devices is not yet supported"); + } + + if (dataStream != stream()) { + // Different streams, need to synchronize + if (not available) { + // Event not yet occurred, so need to add synchronization + // here. Sychronization is done by making the CUDA stream to + // wait for an event, so all subsequent work in the stream + // will run only after the event has "occurred" (i.e. data + // product became available). + cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event"); + } + } + } +} diff --git a/src/cudadev/CUDACore/EDGetterContextBase.h b/src/cudadev/CUDACore/EDGetterContextBase.h new file mode 100644 index 000000000..d17814d69 --- /dev/null +++ b/src/cudadev/CUDACore/EDGetterContextBase.h @@ -0,0 +1,38 @@ +#ifndef HeterogeneousCore_CUDACore_EDGetterContextBase_h +#define HeterogeneousCore_CUDACore_EDGetterContextBase_h + +#include "CUDACore/FwkContextBase.h" +#include "CUDACore/Product.h" +#include "Framework/EDGetToken.h" + +namespace cms::cuda::impl { + /** + * This class is a base class for Context classes that should be + * able to read Event Data products + */ + class EDGetterContextBase : public FwkContextBase { + public: + template + const T& get(const Product& data) { + if (not isInitialized()) { + initialize(data); + } + synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); + return data.data_; + } + + template + const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { + return get(iEvent.get(token)); + } + + protected: + template + EDGetterContextBase(Args&&... args) : FwkContextBase(std::forward(args)...) {} + + private: + void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); + }; +} + +#endif diff --git a/src/cudadev/CUDACore/FwkContextBase.cc b/src/cudadev/CUDACore/FwkContextBase.cc new file mode 100644 index 000000000..47e2d8199 --- /dev/null +++ b/src/cudadev/CUDACore/FwkContextBase.cc @@ -0,0 +1,28 @@ +#include "CUDACore/FwkContextBase.h" +#include "CUDACore/StreamCache.h" +#include "CUDACore/cudaCheck.h" + +#include "chooseDevice.h" + +namespace cms::cuda::impl { + FwkContextBase::FwkContextBase(edm::StreamID streamID) : FwkContextBase(chooseDevice(streamID)) {} + + FwkContextBase::FwkContextBase(int device) : currentDevice_(device) { cudaCheck(cudaSetDevice(currentDevice_)); } + + FwkContextBase::FwkContextBase(int device, SharedStreamPtr stream) + : currentDevice_(device), stream_(std::make_shared(std::move(stream))) { + cudaCheck(cudaSetDevice(currentDevice_)); + } + + void FwkContextBase::initialize() { stream_ = std::make_shared(getStreamCache().get()); } + + void FwkContextBase::initialize(const ProductBase& data) { + SharedStreamPtr stream; + if (data.mayReuseStream()) { + stream = data.streamPtr(); + } else { + stream = getStreamCache().get(); + } + stream_ = std::make_shared(std::move(stream)); + } +} diff --git a/src/cudadev/CUDACore/FwkContextBase.h b/src/cudadev/CUDACore/FwkContextBase.h new file mode 100644 index 000000000..1b329cb03 --- /dev/null +++ b/src/cudadev/CUDACore/FwkContextBase.h @@ -0,0 +1,66 @@ +#ifndef HeterogeneousCore_CUDACore_FwkContextBase_h +#define HeterogeneousCore_CUDACore_FwkContextBase_h + +#include "CUDACore/ProductBase.h" +#include "CUDACore/SharedStreamPtr.h" +#include "Framework/Event.h" + +namespace cms::cuda::impl { + /** + * This class is a base class for other Context classes for interacting with the framework + */ + class FwkContextBase { + public: + FwkContextBase(FwkContextBase const&) = delete; + FwkContextBase& operator=(FwkContextBase const&) = delete; + FwkContextBase(FwkContextBase&&) = delete; + FwkContextBase& operator=(FwkContextBase&&) = delete; + + int device() const { return currentDevice_; } + + cudaStream_t stream() { + if (not isInitialized()) { + initialize(); + } + return stream_->streamPtr().get(); + } + const SharedStreamPtr& streamPtr() { + if (not isInitialized()) { + initialize(); + } + return stream_->streamPtr(); + } + + protected: + // The constructors set the current device, but the device + // is not set back to the previous value at the destructor. This + // should be sufficient (and tiny bit faster) as all CUDA API + // functions relying on the current device should be called from + // the scope where this context is. The current device doesn't + // really matter between modules (or across TBB tasks). + explicit FwkContextBase(edm::StreamID streamID); + + explicit FwkContextBase(int device); + + // meant only for testing + explicit FwkContextBase(int device, SharedStreamPtr stream); + + bool isInitialized() const { return bool(stream_); } + + void initialize(); + void initialize(const ProductBase& data); + + const std::shared_ptr& streamSharingHelper() { + if (not isInitialized()) { + initialize(); + } + return stream_; + } + + private: + int currentDevice_ = -1; + std::shared_ptr stream_; + }; +} + +#endif diff --git a/src/cudadev/CUDACore/FwkContextHolderHelper.h b/src/cudadev/CUDACore/FwkContextHolderHelper.h new file mode 100644 index 000000000..18e465142 --- /dev/null +++ b/src/cudadev/CUDACore/FwkContextHolderHelper.h @@ -0,0 +1,25 @@ +#ifndef HeterogeneousCore_CUDACore_FwkContextHolderHelper_h +#define HeterogeneousCore_CUDACore_FwkContextHolderHelper_h + +namespace cms::cuda::impl { + class FwkContextHolderHelper { + public: + FwkContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device) + : waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {} + + template + void pushNextTask(F&& f); + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + waitingTaskHolder_ = std::move(waitingTaskHolder); + } + + void enqueueCallback(cudaStream_t stream); + + private: + edm::WaitingTaskWithArenaHolder waitingTaskHolder_; + int device_; + }; +} + +#endif diff --git a/src/cudadev/CUDACore/ProduceContext.cc b/src/cudadev/CUDACore/ProduceContext.cc new file mode 100644 index 000000000..0af59c3a7 --- /dev/null +++ b/src/cudadev/CUDACore/ProduceContext.cc @@ -0,0 +1,6 @@ +#include "CUDACore/ProduceContext.h" +#include "CUDACore/cudaCheck.h" + +namespace cms::cuda { + void ProduceContext::commit() { cudaCheck(cudaEventRecord(event_.get(), stream())); } +} diff --git a/src/cudadev/CUDACore/ProduceContext.h b/src/cudadev/CUDACore/ProduceContext.h new file mode 100644 index 000000000..980e53377 --- /dev/null +++ b/src/cudadev/CUDACore/ProduceContext.h @@ -0,0 +1,52 @@ +#ifndef HeterogeneousCore_CUDACore_ProduceContext_h +#define HeterogeneousCore_CUDACore_ProduceContext_h + +#include "CUDACore/EDGetterContextBase.h" +#include "CUDACore/EventCache.h" +#include "Framework/EDPutToken.h" + +namespace cms::cuda { + /** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): + * - setting the current device + * - synchronizing between CUDA streams if necessary + * Users should not, however, construct it explicitly. + */ + class ProduceContext : public impl::EDGetterContextBase { + public: + explicit ProduceContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {} + + ~ProduceContext() = default; + + template + std::unique_ptr> wrap(T data) { + // make_unique doesn't work because of private constructor + return std::unique_ptr>(new Product(device(), streamSharingHelper(), event_, std::move(data))); + } + + template + auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { + return iEvent.emplace(token, device(), streamSharingHelper(), event_, std::forward(args)...); + } + + // internal API + void commit(); + + private: + // This construcor is only meant for testing + explicit ProduceContext(int device, SharedStreamPtr stream, SharedEventPtr event) + : EDGetterContextBase(device, std::move(stream)), event_{std::move(event)} {} + + // create the CUDA Event upfront to catch possible errors from its creation + SharedEventPtr event_ = getEventCache().get(); + }; + + template + void runProduce(edm::StreamID streamID, F&& func) { + ProduceContext context(streamID); + func(context); + context.commit(); + } +} + +#endif diff --git a/src/cudadev/CUDACore/Product.h b/src/cudadev/CUDACore/Product.h index 759079003..5d9aaf36e 100644 --- a/src/cudadev/CUDACore/Product.h +++ b/src/cudadev/CUDACore/Product.h @@ -13,7 +13,7 @@ namespace edm { namespace cms { namespace cuda { namespace impl { - class ContextGetterBase; + class EDGetterContextBase; } // namespace impl /** @@ -41,7 +41,7 @@ namespace cms { Product& operator=(Product&&) = default; private: - friend class impl::ContextGetterBase; + friend class impl::EDGetterContextBase; friend class ProduceContext; friend class edm::Wrapper>; diff --git a/src/cudadev/CUDACore/ProductBase.h b/src/cudadev/CUDACore/ProductBase.h index e2a367341..e1f04ec12 100644 --- a/src/cudadev/CUDACore/ProductBase.h +++ b/src/cudadev/CUDACore/ProductBase.h @@ -10,7 +10,7 @@ namespace cms { namespace cuda { namespace impl { - class Context; + class FwkContextBase; /** * The CUDA stream is shared between all the Event products of @@ -66,13 +66,13 @@ namespace cms { int device() const { return device_; } // cudaStream_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the cms::cuda::Context itself + // mutable access is needed even if the ProductBase itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. cudaStream_t stream() const { return stream_->streamPtr().get(); } // cudaEvent_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the cms::cuda::Context itself + // mutable access is needed even if the ProductBase itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. cudaEvent_t event() const { return event_.get(); } @@ -82,7 +82,7 @@ namespace cms { : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} private: - friend class impl::Context; + friend class impl::FwkContextBase; friend class ProduceContext; // The following function is intended to be used only from Context diff --git a/src/cudadev/CUDACore/TaskContext.cc b/src/cudadev/CUDACore/TaskContext.cc new file mode 100644 index 000000000..a2c5b0194 --- /dev/null +++ b/src/cudadev/CUDACore/TaskContext.cc @@ -0,0 +1,41 @@ +#include "CUDACore/TaskContext.h" +#include "CUDACore/cudaCheck.h" + +namespace { + struct CallbackData { + edm::WaitingTaskWithArenaHolder holder; + int device; + }; + + void CUDART_CB cudaContextCallback(cudaStream_t streamId, cudaError_t status, void* data) { + std::unique_ptr guard{reinterpret_cast(data)}; + edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; + int device = guard->device; + if (status == cudaSuccess) { + //std::cout << " GPU kernel finished (in callback) device " << device << " CUDA stream " + // << streamId << std::endl; + waitingTaskHolder.doneWaiting(nullptr); + } else { + // wrap the exception in a try-catch block to let GDB "catch throw" break on it + try { + auto error = cudaGetErrorName(status); + auto message = cudaGetErrorString(status); + throw std::runtime_error("Callback of CUDA stream " + + std::to_string(reinterpret_cast(streamId)) + " in device " + + std::to_string(device) + " error " + std::string(error) + ": " + std::string(message)); + } catch (std::exception&) { + waitingTaskHolder.doneWaiting(std::current_exception()); + } + } + } +} // namespace + +namespace cms::cuda { + namespace impl { + void FwkContextHolderHelper::enqueueCallback(cudaStream_t stream) { + cudaCheck(cudaStreamAddCallback(stream, cudaContextCallback, new CallbackData{waitingTaskHolder_, device_}, 0)); + } + } + + void TaskContext::commit() { holderHelper_.enqueueCallback(stream()); } +} diff --git a/src/cudadev/CUDACore/TaskContext.h b/src/cudadev/CUDACore/TaskContext.h new file mode 100644 index 000000000..121c9165d --- /dev/null +++ b/src/cudadev/CUDACore/TaskContext.h @@ -0,0 +1,71 @@ +#ifndef HeterogeneousCore_CUDACore_TaskContext_h +#define HeterogeneousCore_CUDACore_TaskContext_h + +#include "CUDACore/FwkContextBase.h" +#include "Framework/WaitingTaskWithArenaHolder.h" + +namespace cms::cuda { + namespace impl { + class FwkContextHolderHelper { + public: + FwkContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device) + : waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {} + + template + void pushNextTask(F&& f); + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + waitingTaskHolder_ = std::move(waitingTaskHolder); + } + + void enqueueCallback(cudaStream_t stream); + + private: + edm::WaitingTaskWithArenaHolder waitingTaskHolder_; + int device_; + }; + } + + /** + * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + */ + class TaskContext : public impl::FwkContextBase { + public: + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit TaskContext(int device, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : FwkContextBase(device), holderHelper_{std::move(waitingTaskHolder), device} {} + + ~TaskContext() = default; + + template + void pushNextTask(F&& f) { + holderHelper_.pushNextTask(std::forward(f)); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + + // Internal API + void commit(); + + private: + impl::FwkContextHolderHelper holderHelper_; + }; + + namespace impl { + template + void FwkContextHolderHelper::pushNextTask(F&& f) { + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{edm::make_waiting_task_with_holder( + tbb::task::allocate_root(), + std::move(waitingTaskHolder_), + [device = device_, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(TaskContext{device, std::move(h)}); + })}); + } + } // namespace impl +} + +#endif From e5b8bddea19f8ca28ec2dc859ebe78bc9d2c8ced Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 8 Sep 2021 18:32:51 -0700 Subject: [PATCH 08/12] Migrate code away from Context.h --- src/cudadev/CUDACore/EDProducer.h | 3 ++- src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc | 1 - .../plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc | 2 +- .../plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc | 1 - .../plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc | 1 - src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc | 1 - src/cudadev/plugin-Validation/CountValidator.cc | 2 +- src/cudadev/plugin-Validation/HistoValidator.cc | 1 - 8 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/cudadev/CUDACore/EDProducer.h b/src/cudadev/CUDACore/EDProducer.h index 6fe6962bd..92529c502 100644 --- a/src/cudadev/CUDACore/EDProducer.h +++ b/src/cudadev/CUDACore/EDProducer.h @@ -2,7 +2,8 @@ #define HeterogeneousCore_CUDACore_stream_EDProducer_h #include "Framework/EDProducer.h" -#include "CUDACore/Context.h" +#include "CUDACore/AcquireContext.h" +#include "CUDACore/ProduceContext.h" namespace cms::cuda { class EDProducer : public edm::EDProducer { diff --git a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc index 137bcb073..c9e0cc5d9 100644 --- a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc +++ b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc @@ -1,6 +1,5 @@ #include -#include "CUDACore/Context.h" #include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDACore/HostProduct.h" diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc index 50827a71a..d4ca2196b 100644 --- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc +++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc @@ -1,12 +1,12 @@ #include #include "CUDACore/Product.h" +#include "CUDACore/ProduceContext.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" #include "Framework/EDProducer.h" #include "Framework/RunningAverage.h" -#include "CUDACore/Context.h" #include "gpuVertexFinder.h" diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc index 79ad0ca91..4f38c0ad8 100644 --- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc +++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc @@ -1,6 +1,5 @@ #include -#include "CUDACore/Context.h" #include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDACore/HostProduct.h" diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc index 7a2875ad9..4f06bf540 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc +++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc @@ -4,7 +4,6 @@ #include // CMSSW includes -#include "CUDACore/Context.h" #include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" diff --git a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc index 8b90c5326..8d1f8b3c4 100644 --- a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc +++ b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc @@ -1,4 +1,3 @@ -#include "CUDACore/Context.h" #include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" diff --git a/src/cudadev/plugin-Validation/CountValidator.cc b/src/cudadev/plugin-Validation/CountValidator.cc index 1993e20eb..bd9f40989 100644 --- a/src/cudadev/plugin-Validation/CountValidator.cc +++ b/src/cudadev/plugin-Validation/CountValidator.cc @@ -1,4 +1,4 @@ -#include "CUDACore/Context.h" +#include "CUDACore/ProduceContext.h" #include "CUDACore/Product.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" diff --git a/src/cudadev/plugin-Validation/HistoValidator.cc b/src/cudadev/plugin-Validation/HistoValidator.cc index c23bb42af..f8af1773d 100644 --- a/src/cudadev/plugin-Validation/HistoValidator.cc +++ b/src/cudadev/plugin-Validation/HistoValidator.cc @@ -1,4 +1,3 @@ -#include "CUDACore/Context.h" #include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" From eaf55b24d80adc9917a5653f1f5ef62a23ce1971 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 8 Sep 2021 18:33:11 -0700 Subject: [PATCH 09/12] Remove Context.h --- src/cudadev/CUDACore/Context.h | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 src/cudadev/CUDACore/Context.h diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h deleted file mode 100644 index c55fc59fb..000000000 --- a/src/cudadev/CUDACore/Context.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_Context_h -#define HeterogeneousCore_CUDACore_Context_h - -#include "CUDACore/AcquireContext.h" -#include "CUDACore/ProduceContext.h" -#include "CUDACore/AnalyzeContext.h" -#endif From 744c94f783121adbd80cdaeb762d774fcbe0039b Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 3 Sep 2021 12:18:45 -0700 Subject: [PATCH 10/12] Add lightweight Context class with memory allocation oriented counterparts --- src/cudadev/CUDACore/Context.h | 46 +++++++++++++++++++++++++++ src/cudadev/CUDACore/FwkContextBase.h | 5 +++ 2 files changed, 51 insertions(+) create mode 100644 src/cudadev/CUDACore/Context.h diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h new file mode 100644 index 000000000..17e814bab --- /dev/null +++ b/src/cudadev/CUDACore/Context.h @@ -0,0 +1,46 @@ +#ifndef HeterogeneousCore_CUDAUtilities_Context_h +#define HeterogeneousCore_CUDAUtilities_Context_h + +#include "CUDACore/allocate_device.h" +#include "CUDACore/allocate_host.h" + +namespace cms::cuda { + class HostAllocatorContext { + public: + explicit HostAllocatorContext(cudaStream_t stream) : stream_(stream) {} + + void *allocate_host(size_t nbytes) const { return cms::cuda::allocate_host(nbytes, stream_); } + + void free_host(void *ptr) const { cms::cuda::free_host(ptr); } + + private: + cudaStream_t stream_; + }; + + class DeviceAllocatorContext { + public: + explicit DeviceAllocatorContext(cudaStream_t stream) : stream_(stream) {} + + void *allocate_device(size_t nbytes) const { return cms::cuda::allocate_device(nbytes, stream_); } + + void free_device(void *ptr) const { cms::cuda::free_device(ptr, stream_); } + + private: + cudaStream_t stream_; + }; + + class Context { + public: + explicit Context(cudaStream_t stream) : stream_(stream) {} + + cudaStream_t stream() const { return stream_; } + + operator HostAllocatorContext() const { return HostAllocatorContext(stream()); } + operator DeviceAllocatorContext() const { return DeviceAllocatorContext(stream()); } + + private: + cudaStream_t stream_; + }; +} // namespace cms::cuda + +#endif diff --git a/src/cudadev/CUDACore/FwkContextBase.h b/src/cudadev/CUDACore/FwkContextBase.h index 1b329cb03..7fb82478a 100644 --- a/src/cudadev/CUDACore/FwkContextBase.h +++ b/src/cudadev/CUDACore/FwkContextBase.h @@ -1,6 +1,7 @@ #ifndef HeterogeneousCore_CUDACore_FwkContextBase_h #define HeterogeneousCore_CUDACore_FwkContextBase_h +#include "CUDACore/Context.h" #include "CUDACore/ProductBase.h" #include "CUDACore/SharedStreamPtr.h" #include "Framework/Event.h" @@ -31,6 +32,10 @@ namespace cms::cuda::impl { return stream_->streamPtr(); } + operator HostAllocatorContext() { return HostAllocatorContext(stream()); } + operator DeviceAllocatorContext() { return DeviceAllocatorContext(stream()); } + operator Context() { return Context(stream()); } + protected: // The constructors set the current device, but the device // is not set back to the previous value at the destructor. This From 0429ad5dfb6c4b8941b8753debb9030ba99d34ad Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 7 Sep 2021 14:06:43 -0700 Subject: [PATCH 11/12] Add TestContext for tests --- src/cudadev/CUDACore/TestContext.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/cudadev/CUDACore/TestContext.h diff --git a/src/cudadev/CUDACore/TestContext.h b/src/cudadev/CUDACore/TestContext.h new file mode 100644 index 000000000..ce0c8c35a --- /dev/null +++ b/src/cudadev/CUDACore/TestContext.h @@ -0,0 +1,22 @@ +#ifndef HeterogeneousCore_CUDAUtilities_TestContext_h +#define HeterogeneousCore_CUDAUtilities_TestContext_h + +#include "CUDACore/Context.h" +#include "CUDACore/currentDevice.h" + +namespace cms::cudatest { + class TestContext { + public: + TestContext() : TestContext(cudaStreamDefault) {} + explicit TestContext(cudaStream_t stream) : stream_{stream} {} + + operator cms::cuda::HostAllocatorContext() const { return cms::cuda::HostAllocatorContext(stream_); } + operator cms::cuda::DeviceAllocatorContext() const { return cms::cuda::DeviceAllocatorContext(stream_); } + operator cms::cuda::Context() const { return cms::cuda::Context(stream_); } + + private: + cudaStream_t stream_; + }; +} // namespace cms::cudatest + +#endif From c91e0e309db65d7aca1094021a68011ecdfc4d8b Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 7 Sep 2021 14:07:36 -0700 Subject: [PATCH 12/12] Migrate make_(device|host)_unique and all calling code to pass cms::cuda::Context objects instead of cudaStream_t --- src/cudadev/CUDACore/device_unique_ptr.h | 41 ++--- src/cudadev/CUDACore/host_unique_ptr.h | 40 +++-- src/cudadev/CUDADataFormats/BeamSpotCUDA.h | 4 +- .../CUDADataFormats/HeterogeneousSoA.h | 66 ++++---- .../CUDADataFormats/SiPixelClustersCUDA.cc | 16 +- .../CUDADataFormats/SiPixelClustersCUDA.h | 2 +- .../CUDADataFormats/SiPixelDigiErrorsCUDA.cc | 24 +-- .../CUDADataFormats/SiPixelDigiErrorsCUDA.h | 6 +- .../CUDADataFormats/SiPixelDigisCUDA.cc | 46 +++--- .../CUDADataFormats/SiPixelDigisCUDA.h | 10 +- .../TrackingRecHit2DHeterogeneous.cc | 38 +++-- .../TrackingRecHit2DHeterogeneous.h | 31 ++-- .../plugin-BeamSpotProducer/BeamSpotToCUDA.cc | 2 +- .../PixelTrackSoAFromCUDA.cc | 2 +- .../BrokenLineFitOnGPU.cu | 82 +++++----- .../plugin-PixelTriplets/CAHitNtupletCUDA.cc | 2 +- .../CAHitNtupletGeneratorKernels.cc | 2 +- .../CAHitNtupletGeneratorKernels.cu | 44 ++--- .../CAHitNtupletGeneratorKernels.h | 4 +- .../CAHitNtupletGeneratorKernelsAlloc.h | 22 +-- .../CAHitNtupletGeneratorOnGPU.cc | 23 +-- .../CAHitNtupletGeneratorOnGPU.h | 4 +- .../plugin-PixelTriplets/HelixFitOnGPU.h | 10 +- .../plugin-PixelTriplets/RiemannFitOnGPU.cu | 154 +++++++++--------- .../PixelVertexProducerCUDA.cc | 2 +- .../PixelVertexSoAFromCUDA.cc | 2 +- .../gpuVertexFinder.cc | 33 ++-- .../gpuVertexFinder.h | 2 +- .../SiPixelRawToClusterCUDA.cc | 2 +- .../SiPixelRawToClusterGPUKernel.cu | 84 +++++----- .../SiPixelRawToClusterGPUKernel.h | 2 +- .../SiPixelDigisSoAFromCUDA.cc | 8 +- .../PixelRecHitGPUKernel.cu | 18 +- .../PixelRecHitGPUKernel.h | 2 +- .../SiPixelRecHitCUDA.cc | 5 +- .../plugin-Validation/HistoValidator.cc | 12 +- src/cudadev/test/HistoContainer_t.cu | 8 +- src/cudadev/test/OneHistoContainer_t.cu | 4 +- src/cudadev/test/OneToManyAssoc_t.h | 7 +- src/cudadev/test/TrackingRecHit2DCUDA_t.cu | 4 +- src/cudadev/test/VertexFinder_t.h | 6 +- src/cudadev/test/gpuClustering_t.h | 18 +- src/cudadev/test/radixSort_t.cu | 10 +- 43 files changed, 477 insertions(+), 427 deletions(-) diff --git a/src/cudadev/CUDACore/device_unique_ptr.h b/src/cudadev/CUDACore/device_unique_ptr.h index ab5d6bc25..4736bdaad 100644 --- a/src/cudadev/CUDACore/device_unique_ptr.h +++ b/src/cudadev/CUDACore/device_unique_ptr.h @@ -3,11 +3,9 @@ #include #include +#include -#include - -#include "CUDACore/allocate_device.h" -#include "CUDACore/currentDevice.h" +#include "CUDACore/Context.h" namespace cms { namespace cuda { @@ -17,14 +15,16 @@ namespace cms { class DeviceDeleter { public: DeviceDeleter() = default; // for edm::Wrapper - DeviceDeleter(cudaStream_t stream) : stream_{stream} {} + DeviceDeleter(DeviceAllocatorContext const &ctx) : ctx_(ctx) {} void operator()(void *ptr) { - free_device(ptr, stream_); + if (ctx_) { + ctx_->free_device(ptr); + } } private: - cudaStream_t stream_ = cudaStreamDefault; + std::optional ctx_; }; } // namespace impl @@ -48,23 +48,24 @@ namespace cms { } // namespace device template - typename device::impl::make_device_unique_selector::non_array make_device_unique(cudaStream_t stream) { + typename device::impl::make_device_unique_selector::non_array make_device_unique( + DeviceAllocatorContext const &ctx) { static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the device memory is not supported"); - void *mem = allocate_device(sizeof(T), stream); + void *mem = ctx.allocate_device(sizeof(T)); return typename device::impl::make_device_unique_selector::non_array{reinterpret_cast(mem), - device::impl::DeviceDeleter{stream}}; + device::impl::DeviceDeleter{ctx}}; } template - typename device::impl::make_device_unique_selector::unbounded_array make_device_unique(size_t n, - cudaStream_t stream) { + typename device::impl::make_device_unique_selector::unbounded_array make_device_unique( + size_t n, DeviceAllocatorContext const &ctx) { using element_type = typename std::remove_extent::type; static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the device memory is not supported"); - void *mem = allocate_device(n * sizeof(element_type), stream); + void *mem = ctx.allocate_device(n * sizeof(element_type)); return typename device::impl::make_device_unique_selector::unbounded_array{ - reinterpret_cast(mem), device::impl::DeviceDeleter{stream}}; + reinterpret_cast(mem), device::impl::DeviceDeleter{ctx}}; } template @@ -73,19 +74,19 @@ namespace cms { // No check for the trivial constructor, make it clear in the interface template typename device::impl::make_device_unique_selector::non_array make_device_unique_uninitialized( - cudaStream_t stream) { - void *mem = allocate_device(sizeof(T), stream); + DeviceAllocatorContext const &ctx) { + void *mem = ctx.allocate_device(sizeof(T)); return typename device::impl::make_device_unique_selector::non_array{reinterpret_cast(mem), - device::impl::DeviceDeleter{stream}}; + device::impl::DeviceDeleter{ctx}}; } template typename device::impl::make_device_unique_selector::unbounded_array make_device_unique_uninitialized( - size_t n, cudaStream_t stream) { + size_t n, DeviceAllocatorContext const &ctx) { using element_type = typename std::remove_extent::type; - void *mem = allocate_device(n * sizeof(element_type), stream); + void *mem = ctx.allocate_device(n * sizeof(element_type)); return typename device::impl::make_device_unique_selector::unbounded_array{ - reinterpret_cast(mem), device::impl::DeviceDeleter{stream}}; + reinterpret_cast(mem), device::impl::DeviceDeleter{ctx}}; } template diff --git a/src/cudadev/CUDACore/host_unique_ptr.h b/src/cudadev/CUDACore/host_unique_ptr.h index f34798da3..2e8240476 100644 --- a/src/cudadev/CUDACore/host_unique_ptr.h +++ b/src/cudadev/CUDACore/host_unique_ptr.h @@ -3,8 +3,9 @@ #include #include +#include -#include "CUDACore/allocate_host.h" +#include "CUDACore/Context.h" namespace cms { namespace cuda { @@ -13,7 +14,12 @@ namespace cms { // Additional layer of types to distinguish from host::unique_ptr class HostDeleter { public: - void operator()(void *ptr) { cms::cuda::free_host(ptr); } + HostDeleter() = default; // for edm::Wrapper + HostDeleter(HostAllocatorContext const &ctx) : ctx_(ctx) {} + void operator()(void *ptr) { ctx_->free_host(ptr); } + + private: + std::optional ctx_; }; } // namespace impl @@ -38,20 +44,23 @@ namespace cms { // Allocate pinned host memory template - typename host::impl::make_host_unique_selector::non_array make_host_unique(cudaStream_t stream) { + typename host::impl::make_host_unique_selector::non_array make_host_unique(HostAllocatorContext const &ctx) { static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the pinned host memory is not supported"); - void *mem = allocate_host(sizeof(T), stream); - return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem)}; + void *mem = ctx.allocate_host(sizeof(T)); + return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem), + host::impl::HostDeleter{ctx}}; } template - typename host::impl::make_host_unique_selector::unbounded_array make_host_unique(size_t n, cudaStream_t stream) { + typename host::impl::make_host_unique_selector::unbounded_array make_host_unique( + size_t n, HostAllocatorContext const &ctx) { using element_type = typename std::remove_extent::type; static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the pinned host memory is not supported"); - void *mem = allocate_host(n * sizeof(element_type), stream); - return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem)}; + void *mem = ctx.allocate_host(n * sizeof(element_type)); + return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem), + host::impl::HostDeleter{ctx}}; } template @@ -59,17 +68,20 @@ namespace cms { // No check for the trivial constructor, make it clear in the interface template - typename host::impl::make_host_unique_selector::non_array make_host_unique_uninitialized(cudaStream_t stream) { - void *mem = allocate_host(sizeof(T), stream); - return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem)}; + typename host::impl::make_host_unique_selector::non_array make_host_unique_uninitialized( + HostAllocatorContext const &ctx) { + void *mem = ctx.allocate_host(sizeof(T)); + return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem), + host::impl::HostDeleter{ctx}}; } template typename host::impl::make_host_unique_selector::unbounded_array make_host_unique_uninitialized( - size_t n, cudaStream_t stream) { + size_t n, HostAllocatorContext const &ctx) { using element_type = typename std::remove_extent::type; - void *mem = allocate_host(n * sizeof(element_type), stream); - return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem)}; + void *mem = ctx.allocate_host(n * sizeof(element_type)); + return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem), + host::impl::HostDeleter{ctx}}; } template diff --git a/src/cudadev/CUDADataFormats/BeamSpotCUDA.h b/src/cudadev/CUDADataFormats/BeamSpotCUDA.h index a090ef347..0e9311ea1 100644 --- a/src/cudadev/CUDADataFormats/BeamSpotCUDA.h +++ b/src/cudadev/CUDADataFormats/BeamSpotCUDA.h @@ -12,7 +12,9 @@ class BeamSpotCUDA { BeamSpotCUDA() = default; // constructor that allocates cached device memory on the given CUDA stream - BeamSpotCUDA(cudaStream_t stream) { data_d_ = cms::cuda::make_device_unique(stream); } + BeamSpotCUDA(cms::cuda::DeviceAllocatorContext const& ctx) { + data_d_ = cms::cuda::make_device_unique(ctx); + } // movable, non-copiable BeamSpotCUDA(BeamSpotCUDA const&) = delete; diff --git a/src/cudadev/CUDADataFormats/HeterogeneousSoA.h b/src/cudadev/CUDADataFormats/HeterogeneousSoA.h index cfaad449c..96ade9feb 100644 --- a/src/cudadev/CUDADataFormats/HeterogeneousSoA.h +++ b/src/cudadev/CUDADataFormats/HeterogeneousSoA.h @@ -36,10 +36,10 @@ class HeterogeneousSoA { auto *operator->() { return get(); } // in reality valid only for GPU version... - cms::cuda::host::unique_ptr toHostAsync(cudaStream_t stream) const { + cms::cuda::host::unique_ptr toHostAsync(cms::cuda::Context const &ctx) const { assert(dm_ptr); - auto ret = cms::cuda::make_host_unique(stream); - cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream)); + auto ret = cms::cuda::make_host_unique(ctx); + cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, ctx.stream())); return ret; } @@ -58,28 +58,28 @@ namespace cms { using unique_ptr = cms::cuda::device::unique_ptr; template - static auto make_unique(cudaStream_t stream) { - return cms::cuda::make_device_unique(stream); + static auto make_unique(cms::cuda::DeviceAllocatorContext const &ctx) { + return cms::cuda::make_device_unique(ctx); } template - static auto make_unique(size_t size, cudaStream_t stream) { - return cms::cuda::make_device_unique(size, stream); + static auto make_unique(size_t size, cms::cuda::DeviceAllocatorContext const &ctx) { + return cms::cuda::make_device_unique(size, ctx); } template - static auto make_host_unique(cudaStream_t stream) { - return cms::cuda::make_host_unique(stream); + static auto make_host_unique(cms::cuda::HostAllocatorContext const &ctx) { + return cms::cuda::make_host_unique(ctx); } template - static auto make_device_unique(cudaStream_t stream) { - return cms::cuda::make_device_unique(stream); + static auto make_device_unique(cms::cuda::DeviceAllocatorContext const &ctx) { + return cms::cuda::make_device_unique(ctx); } template - static auto make_device_unique(size_t size, cudaStream_t stream) { - return cms::cuda::make_device_unique(size, stream); + static auto make_device_unique(size_t size, cms::cuda::DeviceAllocatorContext const &ctx) { + return cms::cuda::make_device_unique(size, ctx); } }; @@ -88,23 +88,23 @@ namespace cms { using unique_ptr = cms::cuda::host::unique_ptr; template - static auto make_unique(cudaStream_t stream) { - return cms::cuda::make_host_unique(stream); + static auto make_unique(cms::cuda::HostAllocatorContext const &ctx) { + return cms::cuda::make_host_unique(ctx); } template - static auto make_host_unique(cudaStream_t stream) { - return cms::cuda::make_host_unique(stream); + static auto make_host_unique(cms::cuda::HostAllocatorContext const &ctx) { + return cms::cuda::make_host_unique(ctx); } template - static auto make_device_unique(cudaStream_t stream) { - return cms::cuda::make_device_unique(stream); + static auto make_device_unique(cms::cuda::DeviceAllocatorContext const &ctx) { + return cms::cuda::make_device_unique(ctx); } template - static auto make_device_unique(size_t size, cudaStream_t stream) { - return cms::cuda::make_device_unique(size, stream); + static auto make_device_unique(size_t size, cms::cuda::DeviceAllocatorContext const &ctx) { + return cms::cuda::make_device_unique(size, ctx); } }; @@ -113,27 +113,27 @@ namespace cms { using unique_ptr = std::unique_ptr; template - static auto make_unique(cudaStream_t) { + static auto make_unique(cms::cuda::DeviceAllocatorContext const &) { return std::make_unique(); } template - static auto make_unique(size_t size, cudaStream_t) { + static auto make_unique(size_t size, cms::cuda::DeviceAllocatorContext const &) { return std::make_unique(size); } template - static auto make_host_unique(cudaStream_t) { + static auto make_host_unique(cms::cuda::HostAllocatorContext const &) { return std::make_unique(); } template - static auto make_device_unique(cudaStream_t) { + static auto make_device_unique(cms::cuda::DeviceAllocatorContext const &) { return std::make_unique(); } template - static auto make_device_unique(size_t size, cudaStream_t) { + static auto make_device_unique(size_t size, cms::cuda::DeviceAllocatorContext const &) { return std::make_unique(size); } }; @@ -154,28 +154,28 @@ class HeterogeneousSoAImpl { HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default; explicit HeterogeneousSoAImpl(unique_ptr &&p) : m_ptr(std::move(p)) {} - explicit HeterogeneousSoAImpl(cudaStream_t stream); + explicit HeterogeneousSoAImpl(cms::cuda::DeviceAllocatorContext const &ctx); T const *get() const { return m_ptr.get(); } T *get() { return m_ptr.get(); } - cms::cuda::host::unique_ptr toHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr toHostAsync(cms::cuda::Context const &ctx) const; private: unique_ptr m_ptr; //! }; template -HeterogeneousSoAImpl::HeterogeneousSoAImpl(cudaStream_t stream) { - m_ptr = Traits::template make_unique(stream); +HeterogeneousSoAImpl::HeterogeneousSoAImpl(cms::cuda::DeviceAllocatorContext const &ctx) { + m_ptr = Traits::template make_unique(ctx); } // in reality valid only for GPU version... template -cms::cuda::host::unique_ptr HeterogeneousSoAImpl::toHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(stream); - cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream)); +cms::cuda::host::unique_ptr HeterogeneousSoAImpl::toHostAsync(cms::cuda::Context const &ctx) const { + auto ret = cms::cuda::make_host_unique(ctx); + cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, ctx.stream())); return ret; } diff --git a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc index a9feabb92..3d17c8371 100644 --- a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc +++ b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc @@ -3,17 +3,17 @@ #include "CUDACore/host_unique_ptr.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" -SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) - : moduleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)), - clusInModule_d(cms::cuda::make_device_unique(maxModules, stream)), - moduleId_d(cms::cuda::make_device_unique(maxModules, stream)), - clusModuleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)) { - auto view = cms::cuda::make_host_unique(stream); +SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cms::cuda::Context const& ctx) + : moduleStart_d(cms::cuda::make_device_unique(maxModules + 1, ctx)), + clusInModule_d(cms::cuda::make_device_unique(maxModules, ctx)), + moduleId_d(cms::cuda::make_device_unique(maxModules, ctx)), + clusModuleStart_d(cms::cuda::make_device_unique(maxModules + 1, ctx)) { + auto view = cms::cuda::make_host_unique(ctx); view->moduleStart_ = moduleStart_d.get(); view->clusInModule_ = clusInModule_d.get(); view->moduleId_ = moduleId_d.get(); view->clusModuleStart_ = clusModuleStart_d.get(); - view_d = cms::cuda::make_device_unique(stream); - cms::cuda::copyAsync(view_d, view, stream); + view_d = cms::cuda::make_device_unique(ctx); + cms::cuda::copyAsync(view_d, view, ctx.stream()); } diff --git a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h index e93b742cf..ac06f9fb0 100644 --- a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h +++ b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h @@ -10,7 +10,7 @@ class SiPixelClustersCUDA { public: SiPixelClustersCUDA() = default; - explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream); + explicit SiPixelClustersCUDA(size_t maxModules, cms::cuda::Context const &ctx); ~SiPixelClustersCUDA() = default; SiPixelClustersCUDA(const SiPixelClustersCUDA &) = delete; diff --git a/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.cc index d79942608..c2fe7bf8c 100644 --- a/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.cc +++ b/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.cc @@ -6,33 +6,35 @@ #include "CUDACore/memsetAsync.h" #include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" -SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream) - : data_d(cms::cuda::make_device_unique(maxFedWords, stream)), - error_d(cms::cuda::make_device_unique(stream)), - error_h(cms::cuda::make_host_unique(stream)), +SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, + SiPixelFormatterErrors errors, + cms::cuda::Context const& ctx) + : data_d(cms::cuda::make_device_unique(maxFedWords, ctx)), + error_d(cms::cuda::make_device_unique(ctx)), + error_h(cms::cuda::make_host_unique(ctx)), formatterErrors_h(std::move(errors)) { - cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, stream); + cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, ctx.stream()); cms::cuda::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); assert(error_h->empty()); assert(error_h->capacity() == static_cast(maxFedWords)); - cms::cuda::copyAsync(error_d, error_h, stream); + cms::cuda::copyAsync(error_d, error_h, ctx.stream()); } -void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cudaStream_t stream) { - cms::cuda::copyAsync(error_h, error_d, stream); +void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cms::cuda::Context const& ctx) { + cms::cuda::copyAsync(error_h, error_d, ctx.stream()); } -SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cudaStream_t stream) const { +SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cms::cuda::Context const& ctx) const { // On one hand size() could be sufficient. On the other hand, if // someone copies the SimpleVector<>, (s)he might expect the data // buffer to actually have space for capacity() elements. - auto data = cms::cuda::make_host_unique(error_h->capacity(), stream); + auto data = cms::cuda::make_host_unique(error_h->capacity(), ctx); // but transfer only the required amount if (not error_h->empty()) { - cms::cuda::copyAsync(data, data_d, error_h->size(), stream); + cms::cuda::copyAsync(data, data_d, error_h->size(), ctx.stream()); } auto err = *error_h; err.set_data(data.get()); diff --git a/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.h b/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.h index 442a66f92..4c277d001 100644 --- a/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.h +++ b/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.h @@ -14,7 +14,7 @@ class SiPixelDigiErrorsCUDA { using SiPixelErrorCompactVector = cms::cuda::SimpleVector; SiPixelDigiErrorsCUDA() = default; - explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream); + explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cms::cuda::Context const& ctx); ~SiPixelDigiErrorsCUDA() = default; SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete; @@ -28,9 +28,9 @@ class SiPixelDigiErrorsCUDA { SiPixelErrorCompactVector const* error() const { return error_d.get(); } using HostDataError = std::pair>; - HostDataError dataErrorToHostAsync(cudaStream_t stream) const; + HostDataError dataErrorToHostAsync(cms::cuda::Context const& ctx) const; - void copyErrorToHostAsync(cudaStream_t stream); + void copyErrorToHostAsync(cms::cuda::Context const& ctx); private: cms::cuda::device::unique_ptr data_d; diff --git a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc index fd87fee56..9be592dc1 100644 --- a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc +++ b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc @@ -4,45 +4,45 @@ #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" -SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) - : xx_d(cms::cuda::make_device_unique(maxFedWords, stream)), - yy_d(cms::cuda::make_device_unique(maxFedWords, stream)), - adc_d(cms::cuda::make_device_unique(maxFedWords, stream)), - moduleInd_d(cms::cuda::make_device_unique(maxFedWords, stream)), - clus_d(cms::cuda::make_device_unique(maxFedWords, stream)), - view_d(cms::cuda::make_device_unique(stream)), - pdigi_d(cms::cuda::make_device_unique(maxFedWords, stream)), - rawIdArr_d(cms::cuda::make_device_unique(maxFedWords, stream)) { - auto view = cms::cuda::make_host_unique(stream); +SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cms::cuda::Context const& ctx) + : xx_d(cms::cuda::make_device_unique(maxFedWords, ctx)), + yy_d(cms::cuda::make_device_unique(maxFedWords, ctx)), + adc_d(cms::cuda::make_device_unique(maxFedWords, ctx)), + moduleInd_d(cms::cuda::make_device_unique(maxFedWords, ctx)), + clus_d(cms::cuda::make_device_unique(maxFedWords, ctx)), + view_d(cms::cuda::make_device_unique(ctx)), + pdigi_d(cms::cuda::make_device_unique(maxFedWords, ctx)), + rawIdArr_d(cms::cuda::make_device_unique(maxFedWords, ctx)) { + auto view = cms::cuda::make_host_unique(ctx); view->xx_ = xx_d.get(); view->yy_ = yy_d.get(); view->adc_ = adc_d.get(); view->moduleInd_ = moduleInd_d.get(); view->clus_ = clus_d.get(); - cms::cuda::copyAsync(view_d, view, stream); + cms::cuda::copyAsync(view_d, view, ctx.stream()); } -cms::cuda::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(nDigis(), stream); - cms::cuda::copyAsync(ret, adc_d, nDigis(), stream); +cms::cuda::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cms::cuda::Context const& ctx) const { + auto ret = cms::cuda::make_host_unique(nDigis(), ctx); + cms::cuda::copyAsync(ret, adc_d, nDigis(), ctx.stream()); return ret; } -cms::cuda::host::unique_ptr SiPixelDigisCUDA::clusToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(nDigis(), stream); - cms::cuda::copyAsync(ret, clus_d, nDigis(), stream); +cms::cuda::host::unique_ptr SiPixelDigisCUDA::clusToHostAsync(cms::cuda::Context const& ctx) const { + auto ret = cms::cuda::make_host_unique(nDigis(), ctx); + cms::cuda::copyAsync(ret, clus_d, nDigis(), ctx.stream()); return ret; } -cms::cuda::host::unique_ptr SiPixelDigisCUDA::pdigiToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(nDigis(), stream); - cms::cuda::copyAsync(ret, pdigi_d, nDigis(), stream); +cms::cuda::host::unique_ptr SiPixelDigisCUDA::pdigiToHostAsync(cms::cuda::Context const& ctx) const { + auto ret = cms::cuda::make_host_unique(nDigis(), ctx); + cms::cuda::copyAsync(ret, pdigi_d, nDigis(), ctx.stream()); return ret; } -cms::cuda::host::unique_ptr SiPixelDigisCUDA::rawIdArrToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(nDigis(), stream); - cms::cuda::copyAsync(ret, rawIdArr_d, nDigis(), stream); +cms::cuda::host::unique_ptr SiPixelDigisCUDA::rawIdArrToHostAsync(cms::cuda::Context const& ctx) const { + auto ret = cms::cuda::make_host_unique(nDigis(), ctx); + cms::cuda::copyAsync(ret, rawIdArr_d, nDigis(), ctx.stream()); return ret; } diff --git a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h index 03ae6639a..e0f95ca34 100644 --- a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h +++ b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h @@ -10,7 +10,7 @@ class SiPixelDigisCUDA { public: SiPixelDigisCUDA() = default; - explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream); + explicit SiPixelDigisCUDA(size_t maxFedWords, cms::cuda::Context const &ctx); ~SiPixelDigisCUDA() = default; SiPixelDigisCUDA(const SiPixelDigisCUDA &) = delete; @@ -42,10 +42,10 @@ class SiPixelDigisCUDA { uint32_t const *pdigi() const { return pdigi_d.get(); } uint32_t const *rawIdArr() const { return rawIdArr_d.get(); } - cms::cuda::host::unique_ptr adcToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr clusToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr pdigiToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr rawIdArrToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr adcToHostAsync(cms::cuda::Context const &ctx) const; + cms::cuda::host::unique_ptr clusToHostAsync(cms::cuda::Context const &ctx) const; + cms::cuda::host::unique_ptr pdigiToHostAsync(cms::cuda::Context const &ctx) const; + cms::cuda::host::unique_ptr rawIdArrToHostAsync(cms::cuda::Context const &ctx) const; class DeviceConstView { public: diff --git a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc index 5c1aacaf4..611bbb101 100644 --- a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc +++ b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc @@ -5,40 +5,44 @@ #include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" template <> -cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::localCoordToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); - cms::cuda::copyAsync(ret, m_store32, 4 * nHits(), stream); +cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::localCoordToHostAsync(cms::cuda::Context const& ctx) const { + auto ret = cms::cuda::make_host_unique(4 * nHits(), ctx); + cms::cuda::copyAsync(ret, m_store32, 4 * nHits(), ctx.stream()); return ret; } template <> -cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(gpuClustering::maxNumModules + 1, stream); - cudaCheck(cudaMemcpyAsync( - ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (gpuClustering::maxNumModules + 1), cudaMemcpyDefault, stream)); +cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::hitsModuleStartToHostAsync( + cms::cuda::Context const& ctx) const { + auto ret = cms::cuda::make_host_unique(gpuClustering::maxNumModules + 1, ctx); + cudaCheck(cudaMemcpyAsync(ret.get(), + m_hitsModuleStart, + sizeof(uint32_t) * (gpuClustering::maxNumModules + 1), + cudaMemcpyDefault, + ctx.stream())); return ret; } template <> -cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::globalCoordToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); +cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::globalCoordToHostAsync(cms::cuda::Context const& ctx) const { + auto ret = cms::cuda::make_host_unique(4 * nHits(), ctx); cudaCheck(cudaMemcpyAsync( - ret.get(), m_store32.get() + 4 * nHits(), 4 * nHits() * sizeof(float), cudaMemcpyDefault, stream)); + ret.get(), m_store32.get() + 4 * nHits(), 4 * nHits() * sizeof(float), cudaMemcpyDefault, ctx.stream())); return ret; } template <> -cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::chargeToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(nHits(), stream); - cudaCheck( - cudaMemcpyAsync(ret.get(), m_store32.get() + 8 * nHits(), nHits() * sizeof(int32_t), cudaMemcpyDefault, stream)); +cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::chargeToHostAsync(cms::cuda::Context const& ctx) const { + auto ret = cms::cuda::make_host_unique(nHits(), ctx); + cudaCheck(cudaMemcpyAsync( + ret.get(), m_store32.get() + 8 * nHits(), nHits() * sizeof(int32_t), cudaMemcpyDefault, ctx.stream())); return ret; } template <> -cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::sizeToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(2 * nHits(), stream); +cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::sizeToHostAsync(cms::cuda::Context const& ctx) const { + auto ret = cms::cuda::make_host_unique(2 * nHits(), ctx); cudaCheck(cudaMemcpyAsync( - ret.get(), m_store16.get() + 2 * nHits(), 2 * nHits() * sizeof(int16_t), cudaMemcpyDefault, stream)); + ret.get(), m_store16.get() + 2 * nHits(), 2 * nHits() * sizeof(int16_t), cudaMemcpyDefault, ctx.stream())); return ret; } diff --git a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h index 7a19299a9..e3e706b08 100644 --- a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h +++ b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h @@ -17,7 +17,7 @@ class TrackingRecHit2DHeterogeneous { explicit TrackingRecHit2DHeterogeneous(uint32_t nHits, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, - cudaStream_t stream); + cms::cuda::Context const& ctx); ~TrackingRecHit2DHeterogeneous() = default; @@ -38,13 +38,13 @@ class TrackingRecHit2DHeterogeneous { auto iphi() { return m_iphi; } // only the local coord and detector index - cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr localCoordToHostAsync(cms::cuda::Context const& ctx) const; + cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cms::cuda::Context const& ctx) const; // for validation - cms::cuda::host::unique_ptr globalCoordToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr chargeToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr sizeToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr globalCoordToHostAsync(cms::cuda::Context const& ctx) const; + cms::cuda::host::unique_ptr chargeToHostAsync(cms::cuda::Context const& ctx) const; + cms::cuda::host::unique_ptr sizeToHostAsync(cms::cuda::Context const& ctx) const; private: static constexpr uint32_t n16 = 4; // number of elements in m_store16 @@ -77,13 +77,13 @@ template TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nHits, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, - cudaStream_t stream) + cms::cuda::Context const& ctx) : m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) { - auto view = Traits::template make_host_unique(stream); + auto view = Traits::template make_host_unique(ctx); view->m_nHits = nHits; - m_view = Traits::template make_device_unique(stream); - m_AverageGeometryStore = Traits::template make_device_unique(stream); + m_view = Traits::template make_device_unique(ctx); + m_AverageGeometryStore = Traits::template make_device_unique(ctx); view->m_averageGeometry = m_AverageGeometryStore.get(); view->m_cpeParams = cpeParams; view->m_hitsModuleStart = hitsModuleStart; @@ -91,7 +91,7 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH // if empy do not bother if (0 == nHits) { if constexpr (std::is_same::value) { - cms::cuda::copyAsync(m_view, view, stream); + cms::cuda::copyAsync(m_view, view, ctx.stream()); } else { m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version } @@ -103,10 +103,9 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH // if ordering is relevant they may have to be stored phi-ordered by layer or so // this will break 1to1 correspondence with cluster and module locality // so unless proven VERY inefficient we keep it ordered as generated - m_store16 = Traits::template make_device_unique(nHits * n16, stream); - m_store32 = - Traits::template make_device_unique(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream); - m_PhiBinnerStore = Traits::template make_device_unique(stream); + m_store16 = Traits::template make_device_unique(nHits * n16, ctx); + m_store32 = Traits::template make_device_unique(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, ctx); + m_PhiBinnerStore = Traits::template make_device_unique(ctx); static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type)); @@ -140,7 +139,7 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH // transfer view if constexpr (std::is_same::value) { - cms::cuda::copyAsync(m_view, view, stream); + cms::cuda::copyAsync(m_view, view, ctx.stream()); } else { m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version } diff --git a/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc b/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc index b9633a856..819ecb4d7 100644 --- a/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc +++ b/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc @@ -32,7 +32,7 @@ BeamSpotToCUDA::BeamSpotToCUDA(edm::ProductRegistry& reg) void BeamSpotToCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) { *bsHost = iSetup.get(); - BeamSpotCUDA bsDevice(ctx.stream()); + BeamSpotCUDA bsDevice(ctx); cms::cuda::copyAsync(bsDevice.ptr(), bsHost, ctx.stream()); ctx.emplace(iEvent, bsPutToken_, std::move(bsDevice)); diff --git a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc index c9e0cc5d9..e3b525f28 100644 --- a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc +++ b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc @@ -35,7 +35,7 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, cms::cuda::AcquireContext& ctx) { auto const& inputData = ctx.get(iEvent, tokenCUDA_); - soa_ = inputData.toHostAsync(ctx.stream()); + soa_ = inputData.toHostAsync(ctx); } void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) { diff --git a/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cu b/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cu index deadb1a3c..0dcbb9b72 100644 --- a/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cu +++ b/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cu @@ -1,10 +1,10 @@ #include "BrokenLineFitOnGPU.h" #include "CUDACore/device_unique_ptr.h" -void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, - cudaStream_t stream) { + cms::cuda::Context const& ctx) { assert(tuples_); auto blockSize = 64; @@ -12,72 +12,72 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, // Fit internals auto hitsGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), stream); + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), ctx); auto hits_geGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), stream); + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), ctx); auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream); + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), ctx); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernel_BLFastFit<3><<>>( + kernel_BLFastFit<3><<>>( tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); cudaCheck(cudaGetLastError()); - kernel_BLFit<3><<>>(tupleMultiplicity_, - bField_, - outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); + kernel_BLFit<3><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); cudaCheck(cudaGetLastError()); // fit quads - kernel_BLFastFit<4><<>>( + kernel_BLFastFit<4><<>>( tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); cudaCheck(cudaGetLastError()); - kernel_BLFit<4><<>>(tupleMultiplicity_, - bField_, - outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); + kernel_BLFit<4><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // fit penta (only first 4) - kernel_BLFastFit<4><<>>( + kernel_BLFastFit<4><<>>( tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernel_BLFit<4><<>>(tupleMultiplicity_, - bField_, - outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + kernel_BLFit<4><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); cudaCheck(cudaGetLastError()); } else { // fit penta (all 5) - kernel_BLFastFit<5><<>>( + kernel_BLFastFit<5><<>>( tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernel_BLFit<5><<>>(tupleMultiplicity_, - bField_, - outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + kernel_BLFit<5><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); cudaCheck(cudaGetLastError()); } diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc index aca6bd319..c1d7be1a3 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc @@ -35,7 +35,7 @@ void CAHitNtupletCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es, cm auto const& hits = ctx.get(iEvent, tokenHitGPU_); - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx)); } DEFINE_FWK_MODULE(CAHitNtupletCUDA); diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc index f2805d018..9e6f59a53 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc @@ -11,7 +11,7 @@ void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSo } template <> -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cms::cuda::Context const &ctx) { auto nhits = hh.nHits(); #ifdef NTUPLE_DEBUG diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu index edc1eb49b..620ce551e 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu @@ -137,7 +137,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * } template <> -void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cms::cuda::Context const &ctx) { int32_t nhits = hh.nHits(); #ifdef NTUPLE_DEBUG @@ -150,13 +150,13 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1, nhits), stream); + device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1, nhits), ctx); assert(device_isOuterHitOfCell_.get()); cellStorage_ = cms::cuda::make_device_unique( caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks), - stream); + ctx); device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors)); @@ -165,16 +165,16 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr int threadsPerBlock = 128; // at least one block! int blocks = (std::max(1, nhits) + threadsPerBlock - 1) / threadsPerBlock; - gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), - nhits, - device_theCellNeighbors_.get(), - device_theCellNeighborsContainer_, - device_theCellTracks_.get(), - device_theCellTracksContainer_); + gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), + nhits, + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); cudaCheck(cudaGetLastError()); } - device_theCells_ = cms::cuda::make_device_unique(params_.maxNumberOfDoublets_, stream); + device_theCells_ = cms::cuda::make_device_unique(params_.maxNumberOfDoublets_, ctx); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -201,18 +201,18 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; dim3 blks(1, blocks, 1); dim3 thrs(stride, threadsPerBlock, 1); - gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - device_theCellTracks_.get(), - hh.view(), - device_isOuterHitOfCell_.get(), - nActualPairs, - params_.idealConditions_, - params_.doClusterCut_, - params_.doZ0Cut_, - params_.doPtCut_, - params_.maxNumberOfDoublets_); + gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + params_.idealConditions_, + params_.doClusterCut_, + params_.doZ0Cut_, + params_.doPtCut_, + params_.maxNumberOfDoublets_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h index dd87597a4..c57f97456 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h @@ -180,8 +180,8 @@ class CAHitNtupletGeneratorKernels { void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); - void allocateOnGPU(int32_t nHits, cudaStream_t stream); + void buildDoublets(HitsOnCPU const& hh, cms::cuda::Context const& ctx); + void allocateOnGPU(int32_t nHits, cms::cuda::Context const& ctx); void cleanup(cudaStream_t cudaStream); static void printCounters(Counters const* counters); diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h index 929677a44..1f8c3f931 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h @@ -4,16 +4,16 @@ template <> #ifdef __CUDACC__ -void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(int32_t nHits, cms::cuda::Context const& ctx) { #else -void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cms::cuda::Context const& ctx) { #endif ////////////////////////////////////////////////////////// // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// - device_theCellNeighbors_ = Traits::template make_unique(stream); - device_theCellTracks_ = Traits::template make_unique(stream); + device_theCellNeighbors_ = Traits::template make_unique(ctx); + device_theCellTracks_ = Traits::template make_unique(ctx); #ifdef GPU_DEBUG std::cout << "Allocation for tuple building. N hits " << nHits << std::endl; @@ -21,15 +21,15 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t nHits++; // storage requires one more counter; assert(nHits > 0); - device_hitToTuple_ = Traits::template make_unique(stream); - device_hitToTupleStorage_ = Traits::template make_unique(nHits, stream); + device_hitToTuple_ = Traits::template make_unique(ctx); + device_hitToTupleStorage_ = Traits::template make_unique(nHits, ctx); hitToTupleView_.assoc = device_hitToTuple_.get(); hitToTupleView_.offStorage = device_hitToTupleStorage_.get(); hitToTupleView_.offSize = nHits; - device_tupleMultiplicity_ = Traits::template make_unique(stream); + device_tupleMultiplicity_ = Traits::template make_unique(ctx); - device_storage_ = Traits::template make_unique(3, stream); + device_storage_ = Traits::template make_unique(3, ctx); device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get(); device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; @@ -37,12 +37,12 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t // FIXME: consider collapsing these 3 in one adhoc kernel if constexpr (std::is_same::value) { - cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); + cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), ctx.stream())); } else { *device_nCells_ = 0; } - cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); - cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm + cms::cuda::launchZero(device_tupleMultiplicity_.get(), ctx.stream()); + cms::cuda::launchZero(hitToTupleView_, ctx.stream()); // we may wish to keep it in the edm #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc index 714748cc1..7bddc5322 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc @@ -112,28 +112,28 @@ CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU() { PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, float bfield, - cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); + cms::cuda::Context const& ctx) const { + PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(ctx)); auto* soa = tracks.get(); assert(soa); CAHitNtupletGeneratorKernelsGPU kernels(m_params); kernels.setCounters(m_counters); - kernels.allocateOnGPU(hits_d.nHits(), stream); + kernels.allocateOnGPU(hits_d.nHits(), ctx); - kernels.buildDoublets(hits_d, stream); - kernels.launchKernels(hits_d, soa, stream); - kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" + kernels.buildDoublets(hits_d, ctx); + kernels.launchKernels(hits_d, soa, ctx.stream()); + kernels.fillHitDetIndices(hits_d.view(), soa, ctx.stream()); // in principle needed only if Hits not "available" HelixFitOnGPU fitter(bfield, m_params.fit5as4_); fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, ctx); } else { - fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, ctx); } - kernels.classifyTuples(hits_d, soa, stream); + kernels.classifyTuples(hits_d, soa, ctx.stream()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -152,9 +152,10 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC CAHitNtupletGeneratorKernelsCPU kernels(m_params); kernels.setCounters(m_counters); - kernels.allocateOnGPU(hits_d.nHits(), nullptr); + cms::cuda::Context dummy{nullptr}; + kernels.allocateOnGPU(hits_d.nHits(), dummy); - kernels.buildDoublets(hits_d, nullptr); + kernels.buildDoublets(hits_d, dummy); kernels.launchKernels(hits_d, soa, nullptr); kernels.fillHitDetIndices(hits_d.view(), soa, nullptr); // in principle needed only if Hits not "available" diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h index f42bb301b..c6b10c574 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h @@ -37,7 +37,9 @@ class CAHitNtupletGeneratorOnGPU { ~CAHitNtupletGeneratorOnGPU(); - PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; + PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, + float bfield, + cms::cuda::Context const& ctx) const; PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; diff --git a/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h b/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h index fee0f8dae..d74599bf1 100644 --- a/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h +++ b/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h @@ -44,8 +44,14 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocateOnGPU(); } void setBField(double bField) { bField_ = bField; } - void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchRiemannKernels(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cms::cuda::Context const &ctx); + void launchBrokenLineKernels(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cms::cuda::Context const &ctx); void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); diff --git a/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cu b/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cu index 8e6061611..ba5514a52 100644 --- a/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cu +++ b/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cu @@ -4,7 +4,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, - cudaStream_t stream) { + cms::cuda::Context const &ctx) { assert(tuples_); auto blockSize = 64; @@ -12,50 +12,35 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, // Fit internals auto hitsGPU = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), stream); + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), ctx); auto hits_geGPU = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), stream); + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), ctx); auto fast_fit_resultsGPU = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream); + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), ctx); auto circle_fit_resultsGPU_holder = - cms::cuda::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit), stream); + cms::cuda::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit), ctx); riemannFit::CircleFit *circle_fit_resultsGPU_ = (riemannFit::CircleFit *)(circle_fit_resultsGPU_holder.get()); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets - kernel_FastFit<3><<>>( + kernel_FastFit<3><<>>( tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernel_CircleFit<3><<>>(tupleMultiplicity_, - 3, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernel_LineFit<3><<>>(tupleMultiplicity_, - 3, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - // quads - kernel_FastFit<4><<>>( - tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); + kernel_CircleFit<3><<>>(tupleMultiplicity_, + 3, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernel_CircleFit<4><<>>(tupleMultiplicity_, - 4, + kernel_LineFit<3><<>>(tupleMultiplicity_, + 3, bField_, + outputSoa_, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), @@ -63,68 +48,83 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, offset); cudaCheck(cudaGetLastError()); - kernel_LineFit<4><<>>(tupleMultiplicity_, - 4, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + // quads + kernel_FastFit<4><<>>( + tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); + cudaCheck(cudaGetLastError()); + + kernel_CircleFit<4><<>>(tupleMultiplicity_, + 4, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + kernel_LineFit<4><<>>(tupleMultiplicity_, + 4, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // penta - kernel_FastFit<4><<>>( + kernel_FastFit<4><<>>( tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernel_CircleFit<4><<>>(tupleMultiplicity_, - 5, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<4><<>>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernel_LineFit<4><<>>(tupleMultiplicity_, - 5, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<4><<>>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } else { // penta all 5 - kernel_FastFit<5><<>>( + kernel_FastFit<5><<>>( tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernel_CircleFit<5><<>>(tupleMultiplicity_, - 5, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<5><<>>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernel_LineFit<5><<>>(tupleMultiplicity_, - 5, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<5><<>>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } } diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc index d4ca2196b..c9334747c 100644 --- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc +++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc @@ -63,7 +63,7 @@ void PixelVertexProducerCUDA::produceOnGPU(edm::Event& iEvent, const edm::EventS assert(tracks); - ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_)); + ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx, tracks, ptMin_)); }); } diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc index 4f38c0ad8..454e6b32a 100644 --- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc +++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc @@ -33,7 +33,7 @@ void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, cms::cuda::AcquireContext& ctx) { auto const& inputData = ctx.get(iEvent, tokenCUDA_); - m_soa = inputData.toHostAsync(ctx.stream()); + m_soa = inputData.toHostAsync(ctx); } void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) { diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cc b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cc index 608d4efc0..c05e31dde 100644 --- a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cc +++ b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cc @@ -93,11 +93,11 @@ namespace gpuVertexFinder { #endif #ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { + ZVertexHeterogeneous Producer::makeAsync(cms::cuda::Context const& ctx, TkSoA const* tksoa, float ptMin) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); + ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(ctx)); #else ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { #ifdef PIXVERTEX_DEBUG_PRODUCE @@ -110,16 +110,16 @@ namespace gpuVertexFinder { assert(soa); #ifdef __CUDACC__ - auto ws_d = cms::cuda::make_device_unique(stream); + auto ws_d = cms::cuda::make_device_unique(ctx); #else auto ws_d = std::make_unique(); #endif #ifdef __CUDACC__ - init<<<1, 1, 0, stream>>>(soa, ws_d.get()); + init<<<1, 1, 0, ctx.stream()>>>(soa, ws_d.get()); auto blockSize = 128; auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin); + loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin); cudaCheck(cudaGetLastError()); #else init(soa, ws_d.get()); @@ -135,32 +135,33 @@ namespace gpuVertexFinder { if (oneKernel_) { // implemented only for density clustesrs #ifndef THREE_KERNELS - vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); #else - vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderKernel1<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); cudaCheck(cudaGetLastError()); // one block per vertex... - splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + vertexFinderKernel2<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get()); #endif } else { // five kernels if (useDensity_) { - clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, ctx.stream()>>>( + soa, ws_d.get(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksIterative<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); } cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get(), maxChi2ForFirstFit); cudaCheck(cudaGetLastError()); // one block per vertex... - splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get(), maxChi2ForFinalFit); cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + sortByPt2Kernel<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get()); } cudaCheck(cudaGetLastError()); #else // __CUDACC__ diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.h b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.h index b9b8b35d7..8e569654e 100644 --- a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.h +++ b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.h @@ -63,7 +63,7 @@ namespace gpuVertexFinder { ~Producer() = default; - ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; + ZVertexHeterogeneous makeAsync(cms::cuda::Context const& ctx, TkSoA const* tksoa, float ptMin) const; ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const; private: diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc index 4f06bf540..17674d46e 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc +++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc @@ -160,7 +160,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, useQuality_, includeErrors_, false, // debug - ctx.stream()); + ctx); } void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu index aaa72c5e0..dc8591063 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu +++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu @@ -509,20 +509,20 @@ namespace pixelgpudetails { bool useQualityInfo, bool includeErrors, bool debug, - cudaStream_t stream) { + cms::cuda::Context const &ctx) { nDigis = wordCounter; #ifdef GPU_DEBUG std::cout << "decoding " << wordCounter << " digis. Max is " << pixelgpudetails::MAX_FED_WORDS << std::endl; #endif - digis_d = SiPixelDigisCUDA(pixelgpudetails::MAX_FED_WORDS, stream); + digis_d = SiPixelDigisCUDA(pixelgpudetails::MAX_FED_WORDS, ctx); if (includeErrors) { - digiErrors_d = SiPixelDigiErrorsCUDA(pixelgpudetails::MAX_FED_WORDS, std::move(errors), stream); + digiErrors_d = SiPixelDigiErrorsCUDA(pixelgpudetails::MAX_FED_WORDS, std::move(errors), ctx); } - clusters_d = SiPixelClustersCUDA(gpuClustering::maxNumModules, stream); + clusters_d = SiPixelClustersCUDA(gpuClustering::maxNumModules, ctx); - nModules_Clusters_h = cms::cuda::make_host_unique(2, stream); + nModules_Clusters_h = cms::cuda::make_host_unique(2, ctx); if (wordCounter) // protect in case of empty event.... { @@ -531,16 +531,16 @@ namespace pixelgpudetails { assert(0 == wordCounter % 2); // wordCounter is the total no of words in each event to be trasfered on device - auto word_d = cms::cuda::make_device_unique(wordCounter, stream); - auto fedId_d = cms::cuda::make_device_unique(wordCounter, stream); + auto word_d = cms::cuda::make_device_unique(wordCounter, ctx); + auto fedId_d = cms::cuda::make_device_unique(wordCounter, ctx); - cudaCheck( - cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter * sizeof(uint32_t), cudaMemcpyDefault, stream)); cudaCheck(cudaMemcpyAsync( - fedId_d.get(), wordFed.fedId(), wordCounter * sizeof(uint8_t) / 2, cudaMemcpyDefault, stream)); + word_d.get(), wordFed.word(), wordCounter * sizeof(uint32_t), cudaMemcpyDefault, ctx.stream())); + cudaCheck(cudaMemcpyAsync( + fedId_d.get(), wordFed.fedId(), wordCounter * sizeof(uint8_t) / 2, cudaMemcpyDefault, ctx.stream())); // Launch rawToDigi kernel - RawToDigi_kernel<<>>( + RawToDigi_kernel<<>>( cablingMap, modToUnp, wordCounter, @@ -563,7 +563,7 @@ namespace pixelgpudetails { #endif if (includeErrors) { - digiErrors_d.copyErrorToHostAsync(stream); + digiErrors_d.copyErrorToHostAsync(ctx); } } // End of Raw2Digi and passing data for clustering @@ -575,16 +575,16 @@ namespace pixelgpudetails { int blocks = (std::max(int(wordCounter), int(gpuClustering::maxNumModules)) + threadsPerBlock - 1) / threadsPerBlock; - gpuCalibPixel::calibDigis<<>>(isRun2, - digis_d.moduleInd(), - digis_d.xx(), - digis_d.yy(), - digis_d.adc(), - gains, - wordCounter, - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.clusModuleStart()); + gpuCalibPixel::calibDigis<<>>(isRun2, + digis_d.moduleInd(), + digis_d.xx(), + digis_d.yy(), + digis_d.adc(), + gains, + wordCounter, + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.clusModuleStart()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -596,27 +596,27 @@ namespace pixelgpudetails { << " threads\n"; #endif - countModules<<>>( + countModules<<>>( digis_d.moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); // read the number of modules into a data member, used by getProduct()) cudaCheck(cudaMemcpyAsync( - &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream)); + &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, ctx.stream())); threadsPerBlock = 256; blocks = maxNumModules; #ifdef GPU_DEBUG std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - findClus<<>>(digis_d.moduleInd(), - digis_d.xx(), - digis_d.yy(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.clus(), - wordCounter); + findClus<<>>(digis_d.moduleInd(), + digis_d.xx(), + digis_d.yy(), + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.moduleId(), + digis_d.clus(), + wordCounter); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -624,14 +624,14 @@ namespace pixelgpudetails { #endif // apply charge cut - clusterChargeCut<<>>(clusterThresholds, - digis_d.moduleInd(), - digis_d.adc(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.clus(), - wordCounter); + clusterChargeCut<<>>(clusterThresholds, + digis_d.moduleInd(), + digis_d.adc(), + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.moduleId(), + digis_d.clus(), + wordCounter); cudaCheck(cudaGetLastError()); // count the module start indices already here (instead of @@ -640,14 +640,14 @@ namespace pixelgpudetails { // synchronization/ExternalWork // MUST be ONE block - fillHitsModuleStart<<<1, 1024, 0, stream>>>(clusters_d.clusInModule(), clusters_d.clusModuleStart()); + fillHitsModuleStart<<<1, 1024, 0, ctx.stream()>>>(clusters_d.clusInModule(), clusters_d.clusModuleStart()); // last element holds the number of all clusters cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]), clusters_d.clusModuleStart() + gpuClustering::maxNumModules, sizeof(uint32_t), cudaMemcpyDefault, - stream)); + ctx.stream())); #ifdef GPU_DEBUG cudaDeviceSynchronize(); diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h index 04e8b99b9..ec70894c4 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h +++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h @@ -151,7 +151,7 @@ namespace pixelgpudetails { bool useQualityInfo, bool includeErrors, bool debug, - cudaStream_t stream); + const cms::cuda::Context& ctx); std::pair getResults() { digis_d.setNModulesDigis(nModules_Clusters_h[0], nDigis); diff --git a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc index 8d1f8b3c4..bd07cf430 100644 --- a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc +++ b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc @@ -37,10 +37,10 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); nDigis_ = gpuDigis.nDigis(); - pdigi_ = gpuDigis.pdigiToHostAsync(ctx.stream()); - rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx.stream()); - adc_ = gpuDigis.adcToHostAsync(ctx.stream()); - clus_ = gpuDigis.clusToHostAsync(ctx.stream()); + pdigi_ = gpuDigis.pdigiToHostAsync(ctx); + rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx); + adc_ = gpuDigis.adcToHostAsync(ctx); + clus_ = gpuDigis.clusToHostAsync(ctx); } void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) { diff --git a/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu index ba62da1b5..520a395ef 100644 --- a/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu +++ b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu @@ -37,9 +37,9 @@ namespace pixelgpudetails { SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const* cpeParams, - cudaStream_t stream) const { + cms::cuda::Context const& ctx) const { auto nHits = clusters_d.nClusters(); - TrackingRecHit2DCUDA hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), stream); + TrackingRecHit2DCUDA hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), ctx); int threadsPerBlock = 128; int blocks = digis_d.nModules(); // active modules (with digis) @@ -49,7 +49,7 @@ namespace pixelgpudetails { #endif // protect from empty events if (blocks) { - gpuPixelRecHits::getHits<<>>( + gpuPixelRecHits::getHits<<>>( cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -59,11 +59,17 @@ namespace pixelgpudetails { // assuming full warp of threads is better than a smaller number... if (nHits) { - setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); + setHitsLayerStart<<<1, 32, 0, ctx.stream()>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); cudaCheck(cudaGetLastError()); - cms::cuda::fillManyFromVector( - hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, hits_d.phiBinnerStorage(), stream); + cms::cuda::fillManyFromVector(hits_d.phiBinner(), + 10, + hits_d.iphi(), + hits_d.hitsLayerStart(), + nHits, + 256, + hits_d.phiBinnerStorage(), + ctx.stream()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG diff --git a/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.h b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.h index 7b0a38a15..f4735489d 100644 --- a/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.h +++ b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.h @@ -26,7 +26,7 @@ namespace pixelgpudetails { SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const* cpeParams, - cudaStream_t stream) const; + cms::cuda::Context const& ctx) const; }; } // namespace pixelgpudetails diff --git a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc index 133fa8eaf..c363a6a67 100644 --- a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc +++ b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc @@ -42,9 +42,8 @@ void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es, c auto const& digis = ctx.get(iEvent, tokenDigi_); auto const& bs = ctx.get(iEvent, tBeamSpot); - ctx.emplace(iEvent, - tokenHit_, - gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.getGPUProductAsync(ctx.stream()), ctx.stream())); + ctx.emplace( + iEvent, tokenHit_, gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.getGPUProductAsync(ctx.stream()), ctx)); } DEFINE_FWK_MODULE(SiPixelRecHitCUDA); diff --git a/src/cudadev/plugin-Validation/HistoValidator.cc b/src/cudadev/plugin-Validation/HistoValidator.cc index f8af1773d..e45e27f47 100644 --- a/src/cudadev/plugin-Validation/HistoValidator.cc +++ b/src/cudadev/plugin-Validation/HistoValidator.cc @@ -92,18 +92,18 @@ void HistoValidator::acquire(const edm::Event& iEvent, const edm::EventSetup& iS nDigis = digis.nDigis(); nModules = digis.nModules(); - h_adc = digis.adcToHostAsync(ctx.stream()); + h_adc = digis.adcToHostAsync(ctx); nClusters = clusters.nClusters(); - h_clusInModule = cms::cuda::make_host_unique(nModules, ctx.stream()); + h_clusInModule = cms::cuda::make_host_unique(nModules, ctx); cudaCheck(cudaMemcpyAsync( h_clusInModule.get(), clusters.clusInModule(), sizeof(uint32_t) * nModules, cudaMemcpyDefault, ctx.stream())); nHits = hits.nHits(); - h_localCoord = hits.localCoordToHostAsync(ctx.stream()); - h_globalCoord = hits.globalCoordToHostAsync(ctx.stream()); - h_charge = hits.chargeToHostAsync(ctx.stream()); - h_size = hits.sizeToHostAsync(ctx.stream()); + h_localCoord = hits.localCoordToHostAsync(ctx); + h_globalCoord = hits.globalCoordToHostAsync(ctx); + h_charge = hits.chargeToHostAsync(ctx); + h_size = hits.sizeToHostAsync(ctx); } void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) { diff --git a/src/cudadev/test/HistoContainer_t.cu b/src/cudadev/test/HistoContainer_t.cu index 4f5a4ee54..4263a7e82 100644 --- a/src/cudadev/test/HistoContainer_t.cu +++ b/src/cudadev/test/HistoContainer_t.cu @@ -8,6 +8,7 @@ #include "CUDACore/cudaCheck.h" #include "CUDACore/device_unique_ptr.h" #include "CUDACore/requireDevices.h" +#include "CUDACore/TestContext.h" using namespace cms::cuda; @@ -15,10 +16,11 @@ template void go() { std::mt19937 eng; std::uniform_int_distribution rgen(std::numeric_limits::min(), std::numeric_limits::max()); + cms::cudatest::TestContext ctx; constexpr int N = 12000; T v[N]; - auto v_d = make_device_unique(N, nullptr); + auto v_d = make_device_unique(N, ctx); cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); @@ -34,9 +36,9 @@ void go() { assert(Hist::totbins() == Hist::ctNOnes()); Hist h; - auto h_d = make_device_unique(1, nullptr); + auto h_d = make_device_unique(1, ctx); - auto off_d = make_device_unique(nParts + 1, nullptr); + auto off_d = make_device_unique(nParts + 1, ctx); for (int it = 0; it < 5; ++it) { offsets[0] = 0; diff --git a/src/cudadev/test/OneHistoContainer_t.cu b/src/cudadev/test/OneHistoContainer_t.cu index 89fed087a..e37b21b9c 100644 --- a/src/cudadev/test/OneHistoContainer_t.cu +++ b/src/cudadev/test/OneHistoContainer_t.cu @@ -9,6 +9,7 @@ #include "CUDACore/device_unique_ptr.h" #include "CUDACore/launch.h" #include "CUDACore/requireDevices.h" +#include "CUDACore/TestContext.h" using namespace cms::cuda; @@ -104,11 +105,12 @@ void go() { } std::uniform_int_distribution rgen(rmin, rmax); + cms::cudatest::TestContext ctx; constexpr int N = 12000; T v[N]; - auto v_d = make_device_unique(N, nullptr); + auto v_d = make_device_unique(N, ctx); assert(v_d.get()); using Hist = HistoContainer; diff --git a/src/cudadev/test/OneToManyAssoc_t.h b/src/cudadev/test/OneToManyAssoc_t.h index 50bf6fb38..a80b71a1c 100644 --- a/src/cudadev/test/OneToManyAssoc_t.h +++ b/src/cudadev/test/OneToManyAssoc_t.h @@ -11,6 +11,7 @@ #include "CUDACore/cudaCheck.h" #include "CUDACore/requireDevices.h" #include "CUDACore/currentDevice.h" +#include "CUDACore/TestContext.h" #endif #include "CUDACore/OneToManyAssoc.h" @@ -141,7 +142,8 @@ __global__ void verifyFinal(Assoc const* __restrict__ la, int N) { template auto make_unique(std::size_t size) { #ifdef __CUDACC__ - return cms::cuda::make_device_unique(size, 0); + cms::cudatest::TestContext ctx; + return cms::cuda::make_device_unique(size, ctx); #else return std::make_unique(size); #endif @@ -219,7 +221,8 @@ int main() { auto a_d = make_unique(1); auto sa_d = make_unique(1); #ifdef __CUDACC__ - auto v_d = cms::cuda::make_device_unique[]>(N, nullptr); + cms::cudatest::TestContext ctx; + auto v_d = cms::cuda::make_device_unique[]>(N, ctx); assert(v_d.get()); cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice)); #else diff --git a/src/cudadev/test/TrackingRecHit2DCUDA_t.cu b/src/cudadev/test/TrackingRecHit2DCUDA_t.cu index 5f3a26391..86abafbe6 100644 --- a/src/cudadev/test/TrackingRecHit2DCUDA_t.cu +++ b/src/cudadev/test/TrackingRecHit2DCUDA_t.cu @@ -1,6 +1,7 @@ #include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" #include "CUDACore/copyAsync.h" #include "CUDACore/cudaCheck.h" +#include "CUDACore/TestContext.h" namespace testTrackingRecHit2D { @@ -41,11 +42,12 @@ namespace testTrackingRecHit2D { int main() { cudaStream_t stream; cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + cms::cudatest::TestContext ctx{stream}; // inner scope to deallocate memory before destroying the stream { auto nHits = 200; - TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, stream); + TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, ctx); testTrackingRecHit2D::runKernels(tkhit.view()); } diff --git a/src/cudadev/test/VertexFinder_t.h b/src/cudadev/test/VertexFinder_t.h index aed660c0d..cb9ba942e 100644 --- a/src/cudadev/test/VertexFinder_t.h +++ b/src/cudadev/test/VertexFinder_t.h @@ -7,6 +7,7 @@ #include "CUDACore/cudaCheck.h" #include "CUDACore/requireDevices.h" #include "CUDACore/launch.h" +#include "CUDACore/TestContext.h" #ifdef USE_DBSCAN #include "plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h" #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN @@ -114,8 +115,9 @@ int main() { #ifdef __CUDACC__ cms::cudatest::requireDevices(); - auto onGPU_d = cms::cuda::make_device_unique(1, nullptr); - auto ws_d = cms::cuda::make_device_unique(1, nullptr); + cms::cudatest::TestContext ctx; + auto onGPU_d = cms::cuda::make_device_unique(1, ctx); + auto ws_d = cms::cuda::make_device_unique(1, ctx); #else auto onGPU_d = std::make_unique(); auto ws_d = std::make_unique(); diff --git a/src/cudadev/test/gpuClustering_t.h b/src/cudadev/test/gpuClustering_t.h index 4f273e57a..03d090ce7 100644 --- a/src/cudadev/test/gpuClustering_t.h +++ b/src/cudadev/test/gpuClustering_t.h @@ -14,6 +14,7 @@ #include "CUDACore/device_unique_ptr.h" #include "CUDACore/launch.h" #include "CUDACore/requireDevices.h" +#include "CUDACore/TestContext.h" #endif // __CUDACC__ // dirty, but works @@ -39,14 +40,15 @@ int main(void) { auto h_clus = std::make_unique(numElements); #ifdef __CUDACC__ - auto d_id = cms::cuda::make_device_unique(numElements, nullptr); - auto d_x = cms::cuda::make_device_unique(numElements, nullptr); - auto d_y = cms::cuda::make_device_unique(numElements, nullptr); - auto d_adc = cms::cuda::make_device_unique(numElements, nullptr); - auto d_clus = cms::cuda::make_device_unique(numElements, nullptr); - auto d_moduleStart = cms::cuda::make_device_unique(maxNumModules + 1, nullptr); - auto d_clusInModule = cms::cuda::make_device_unique(maxNumModules, nullptr); - auto d_moduleId = cms::cuda::make_device_unique(maxNumModules, nullptr); + cms::cudatest::TestContext ctx; + auto d_id = cms::cuda::make_device_unique(numElements, ctx); + auto d_x = cms::cuda::make_device_unique(numElements, ctx); + auto d_y = cms::cuda::make_device_unique(numElements, ctx); + auto d_adc = cms::cuda::make_device_unique(numElements, ctx); + auto d_clus = cms::cuda::make_device_unique(numElements, ctx); + auto d_moduleStart = cms::cuda::make_device_unique(maxNumModules + 1, ctx); + auto d_clusInModule = cms::cuda::make_device_unique(maxNumModules, ctx); + auto d_moduleId = cms::cuda::make_device_unique(maxNumModules, ctx); #else // __CUDACC__ auto h_moduleStart = std::make_unique(maxNumModules + 1); auto h_clusInModule = std::make_unique(maxNumModules); diff --git a/src/cudadev/test/radixSort_t.cu b/src/cudadev/test/radixSort_t.cu index d2c469510..c94273fce 100644 --- a/src/cudadev/test/radixSort_t.cu +++ b/src/cudadev/test/radixSort_t.cu @@ -15,6 +15,7 @@ #include "CUDACore/launch.h" #include "CUDACore/radixSort.h" #include "CUDACore/requireDevices.h" +#include "CUDACore/TestContext.h" using namespace cms::cuda; @@ -137,10 +138,11 @@ void go(bool useShared) { std::random_shuffle(v, v + N); - auto v_d = cms::cuda::make_device_unique(N, nullptr); - auto ind_d = cms::cuda::make_device_unique(N, nullptr); - auto ws_d = cms::cuda::make_device_unique(N, nullptr); - auto off_d = cms::cuda::make_device_unique(blocks + 1, nullptr); + cms::cudatest::TestContext ctx; + auto v_d = cms::cuda::make_device_unique(N, ctx); + auto ind_d = cms::cuda::make_device_unique(N, ctx); + auto ws_d = cms::cuda::make_device_unique(N, ctx); + auto off_d = cms::cuda::make_device_unique(blocks + 1, ctx); cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice));