Skip to content

Commit

Permalink
Always record and query the CUDA event, to minimize need for error ch…
Browse files Browse the repository at this point in the history
…ecking in CUDAScopedContextProduce destructor
  • Loading branch information
makortel committed Dec 17, 2019
1 parent 3709b84 commit b14cff6
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 56 deletions.
8 changes: 4 additions & 4 deletions CUDADataFormats/Common/interface/CUDAProduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,12 @@ class CUDAProduct : public CUDAProductBase {
friend class CUDAScopedContextProduce;
friend class edm::Wrapper<CUDAProduct<T>>;

explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, T data)
: CUDAProductBase(device, std::move(stream)), data_(std::move(data)) {}
explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, T data)
: CUDAProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {}

template <typename... Args>
explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, Args&&... args)
: CUDAProductBase(device, std::move(stream)), data_(std::forward<Args>(args)...) {}
explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, Args&&... args)
: CUDAProductBase(device, std::move(stream), std::move(event)), data_(std::forward<Args>(args)...) {}

T data_; //!
};
Expand Down
9 changes: 4 additions & 5 deletions CUDADataFormats/Common/interface/CUDAProductBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,17 @@ class CUDAProductBase {
// mutable access is needed even if the CUDAScopedContext itself
// would be const. Therefore it is ok to return a non-const
// pointer from a const method here.
cudaEvent_t event() const { return event_ ? event_.get() : nullptr; }
cudaEvent_t event() const { return event_.get(); }

protected:
explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream)
: stream_{std::move(stream)}, device_{device} {}
explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event)
: stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {}

private:
friend class impl::CUDAScopedContextBase;
friend class CUDAScopedContextProduce;

// The following functions are intended to be used only from CUDAScopedContext
void setEvent(cudautils::SharedEventPtr event) { event_ = std::move(event); }
// The following function is intended to be used only from CUDAScopedContext
const cudautils::SharedStreamPtr& streamPtr() const { return stream_; }

bool mayReuseStream() const {
Expand Down
22 changes: 11 additions & 11 deletions CUDADataFormats/Common/src/CUDAProductBase.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h"

bool CUDAProductBase::isAvailable() const {
// In absence of event, the product was available already at the end
// of produce() of the producer.
if (not event_) {
return true;
// if default-constructed, the product is not available
if(not event_) {
return false;
}
return cudautils::eventIsOccurred(event_.get());
}
Expand All @@ -15,13 +14,14 @@ CUDAProductBase::~CUDAProductBase() {
// complete before destructing the product. This is to make sure
// that the EDM stream does not move to the next event before all
// asynchronous processing of the current is complete.
if (event_) {
// TODO: a callback notifying a WaitingTaskHolder (or similar)
// would avoid blocking the CPU, but would also require more work.
//
// Intentionally not checking the return value to avoid throwing
// exceptions. If this call would fail, we should get failures
// elsewhere as well.

// TODO: a callback notifying a WaitingTaskHolder (or similar)
// would avoid blocking the CPU, but would also require more work.
//
// Intentionally not checking the return value to avoid throwing
// exceptions. If this call would fail, we should get failures
// elsewhere as well.
if(event_) {
cudaEventSynchronize(event_.get());
}
}
21 changes: 6 additions & 15 deletions HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "FWCore/Utilities/interface/EDPutToken.h"
#include "FWCore/Utilities/interface/StreamID.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"

Expand Down Expand Up @@ -154,27 +155,18 @@ class CUDAScopedContextProduce : public impl::CUDAScopedContextGetterBase {
explicit CUDAScopedContextProduce(CUDAContextState& state)
: CUDAScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {}

/// Record the CUDA event, all asynchronous work must have been queued before the destructor
~CUDAScopedContextProduce();

template <typename T>
std::unique_ptr<CUDAProduct<T>> wrap(T data) {
// make_unique doesn't work because of private constructor
//
// CUDAProduct<T> constructor records CUDA event to the CUDA
// stream. The event will become "occurred" after all work queued
// to the stream before this point has been finished.
std::unique_ptr<CUDAProduct<T>> ret(new CUDAProduct<T>(device(), streamPtr(), std::move(data)));
createEventIfStreamBusy();
ret->setEvent(event_);
return ret;
return std::unique_ptr<CUDAProduct<T>>(new CUDAProduct<T>(device(), streamPtr(), event_, std::move(data)));
}

template <typename T, typename... Args>
auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) {
auto ret = iEvent.emplace(token, device(), streamPtr(), std::forward<Args>(args)...);
createEventIfStreamBusy();
const_cast<T&>(*ret).setEvent(event_);
return ret;
return iEvent.emplace(token, device(), streamPtr(), event_, std::forward<Args>(args)...);
}

private:
Expand All @@ -184,9 +176,8 @@ class CUDAScopedContextProduce : public impl::CUDAScopedContextGetterBase {
explicit CUDAScopedContextProduce(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event)
: CUDAScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {}

void createEventIfStreamBusy();

cudautils::SharedEventPtr event_;
// create the CUDA Event upfront to catch possible errors from its creation
cudautils::SharedEventPtr event_ = cudautils::getCUDAEventCache().getCUDAEvent();
};

/**
Expand Down
25 changes: 4 additions & 21 deletions HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "FWCore/Utilities/interface/Exception.h"
#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

Expand Down Expand Up @@ -107,26 +106,10 @@ void CUDAScopedContextAcquire::throwNoState() {
////////////////////

CUDAScopedContextProduce::~CUDAScopedContextProduce() {
if (event_) {
cudaCheck(cudaEventRecord(event_.get(), stream()));
}
}

void CUDAScopedContextProduce::createEventIfStreamBusy() {
if (event_) {
return;
}
auto ret = cudaStreamQuery(stream());
if (ret == cudaSuccess) {
return;
}
if (ret != cudaErrorNotReady) {
// cudaErrorNotReady indicates that the stream is busy, and thus
// is not an error
cudaCheck(ret);
}

event_ = cudautils::getCUDAEventCache().getCUDAEvent();
// Intentionally not checking the return value to avoid throwing
// exceptions. If this call would fail, we should get failures
// elsewhere as well.
cudaEventRecord(event_.get(), stream());
}

////////////////////
Expand Down

0 comments on commit b14cff6

Please sign in to comment.