Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[cudadev][RFC] Prototype (host|device)_unique_ptr API to use lightweight "Context" object instead of CUDA stream #256

Closed
5 changes: 5 additions & 0 deletions src/cudadev/CUDACore/AcquireContext.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#include "CUDACore/AcquireContext.h"

namespace cms::cuda {
void AcquireContext::commit() { holderHelper_.enqueueCallback(stream()); }
}
45 changes: 45 additions & 0 deletions src/cudadev/CUDACore/AcquireContext.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#ifndef HeterogeneousCore_CUDACore_AcquireContext_h
#define HeterogeneousCore_CUDACore_AcquireContext_h

#include "CUDACore/EDGetterContextBase.h"
#include "CUDACore/TaskContext.h"

namespace cms::cuda {
/**
* The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire():
* - setting the current device
* - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
* - synchronizing between CUDA streams if necessary
* Users should not, however, construct it explicitly.
*/
class AcquireContext : public impl::EDGetterContextBase {
public:
explicit AcquireContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
: EDGetterContextBase(streamID), holderHelper_{std::move(waitingTaskHolder), device()} {}
~AcquireContext() = default;

template <typename F>
void pushNextTask(F&& f) {
holderHelper_.pushNextTask(std::forward<F>(f));
}

void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
}

// internal API
void commit();

private:
impl::FwkContextHolderHelper holderHelper_;
};

template <typename F>
void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, F func) {
AcquireContext context(streamID, std::move(holder));
func(context);
context.commit();
}
}

#endif
20 changes: 20 additions & 0 deletions src/cudadev/CUDACore/AnalyzeContext.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#ifndef HeterogeneousCore_CUDACore_AnalyzeContext_h
#define HeterogeneousCore_CUDACore_AnalyzeContext_h

#include "CUDACore/EDGetterContextBase.h"

namespace cms::cuda {
/**
* The aim of this class is to do necessary per-event "initialization" in analyze()
* - setting the current device
* - synchronizing between CUDA streams if necessary
* and enforce that those get done in a proper way in RAII fashion.
*/
class AnalyzeContext : public impl::EDGetterContextBase {
public:
/// Constructor to (possibly) re-use a CUDA stream
explicit AnalyzeContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {}
};
}

#endif
46 changes: 46 additions & 0 deletions src/cudadev/CUDACore/Context.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef HeterogeneousCore_CUDAUtilities_Context_h
#define HeterogeneousCore_CUDAUtilities_Context_h

#include "CUDACore/allocate_device.h"
#include "CUDACore/allocate_host.h"

namespace cms::cuda {
class HostAllocatorContext {
public:
explicit HostAllocatorContext(cudaStream_t stream) : stream_(stream) {}

void *allocate_host(size_t nbytes) const { return cms::cuda::allocate_host(nbytes, stream_); }

void free_host(void *ptr) const { cms::cuda::free_host(ptr); }

private:
cudaStream_t stream_;
};

class DeviceAllocatorContext {
public:
explicit DeviceAllocatorContext(cudaStream_t stream) : stream_(stream) {}

void *allocate_device(size_t nbytes) const { return cms::cuda::allocate_device(nbytes, stream_); }

void free_device(void *ptr) const { cms::cuda::free_device(ptr, stream_); }

private:
cudaStream_t stream_;
};
Comment on lines +8 to +30
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now (and possibly forever in cudadev) the HostAllocatorContext and DeviceAllocatorContext look nearly identical, but in the future (in CMSSW) they could hold a pointer to the CachingHostAllocator/CachingDeviceAllocator objects.


class Context {
public:
explicit Context(cudaStream_t stream) : stream_(stream) {}

cudaStream_t stream() const { return stream_; }

operator HostAllocatorContext() const { return HostAllocatorContext(stream()); }
operator DeviceAllocatorContext() const { return DeviceAllocatorContext(stream()); }

private:
cudaStream_t stream_;
};
} // namespace cms::cuda

#endif
3 changes: 3 additions & 0 deletions src/cudadev/CUDACore/ContextState.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ namespace cms {
friend class ScopedContextAcquire;
friend class ScopedContextProduce;
friend class ScopedContextTask;
friend class AcquireContext;
friend class ProduceContext;
friend class TaskContext;

void set(int device, SharedStreamPtr stream) {
throwIfStream();
Expand Down
27 changes: 27 additions & 0 deletions src/cudadev/CUDACore/EDGetterContextBase.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#include "CUDACore/EDGetterContextBase.h"
#include "CUDACore/cudaCheck.h"

namespace cms::cuda::impl {
void EDGetterContextBase::synchronizeStreams(int dataDevice,
cudaStream_t dataStream,
bool available,
cudaEvent_t dataEvent) {
if (dataDevice != device()) {
// Eventually replace with prefetch to current device (assuming unified memory works)
// If we won't go to unified memory, need to figure out something else...
throw std::runtime_error("Handling data from multiple devices is not yet supported");
}

if (dataStream != stream()) {
// Different streams, need to synchronize
if (not available) {
// Event not yet occurred, so need to add synchronization
// here. Sychronization is done by making the CUDA stream to
// wait for an event, so all subsequent work in the stream
// will run only after the event has "occurred" (i.e. data
// product became available).
cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event");
}
}
}
}
38 changes: 38 additions & 0 deletions src/cudadev/CUDACore/EDGetterContextBase.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#ifndef HeterogeneousCore_CUDACore_EDGetterContextBase_h
#define HeterogeneousCore_CUDACore_EDGetterContextBase_h

#include "CUDACore/FwkContextBase.h"
#include "CUDACore/Product.h"
#include "Framework/EDGetToken.h"

namespace cms::cuda::impl {
/**
* This class is a base class for Context classes that should be
* able to read Event Data products
*/
class EDGetterContextBase : public FwkContextBase {
public:
template <typename T>
const T& get(const Product<T>& data) {
if (not isInitialized()) {
initialize(data);
}
synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
return data.data_;
}

template <typename T>
const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) {
return get(iEvent.get(token));
}

protected:
template <typename... Args>
EDGetterContextBase(Args&&... args) : FwkContextBase(std::forward<Args>(args)...) {}

private:
void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent);
};
}

#endif
17 changes: 17 additions & 0 deletions src/cudadev/CUDACore/EDProducer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#include "CUDACore/EDProducer.h"

namespace cms::cuda {
void EDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) {
runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); });
}

void SynchronizingEDProducer::acquire(edm::Event const& event,
edm::EventSetup const& eventSetup,
edm::WaitingTaskWithArenaHolder holder) {
runAcquire(event.streamID(), std::move(holder), [&](auto& ctx) { acquire(event, eventSetup, ctx); });
}

void SynchronizingEDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) {
runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); });
}
} // namespace cms::cuda
27 changes: 27 additions & 0 deletions src/cudadev/CUDACore/EDProducer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#ifndef HeterogeneousCore_CUDACore_stream_EDProducer_h
#define HeterogeneousCore_CUDACore_stream_EDProducer_h

#include "Framework/EDProducer.h"
#include "CUDACore/AcquireContext.h"
#include "CUDACore/ProduceContext.h"

namespace cms::cuda {
class EDProducer : public edm::EDProducer {
public:
void produce(edm::Event& event, edm::EventSetup const& eventSetup) override;
virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0;
};

class SynchronizingEDProducer : public edm::EDProducerExternalWork {
public:
void acquire(edm::Event const& event,
edm::EventSetup const& eventSetup,
edm::WaitingTaskWithArenaHolder holder) override;
void produce(edm::Event& event, edm::EventSetup const& eventSetup) override;

virtual void acquire(edm::Event const& event, edm::EventSetup const& eventSetup, AcquireContext& context) = 0;
virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0;
};
} // namespace cms::cuda

#endif
28 changes: 28 additions & 0 deletions src/cudadev/CUDACore/FwkContextBase.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#include "CUDACore/FwkContextBase.h"
#include "CUDACore/StreamCache.h"
#include "CUDACore/cudaCheck.h"

#include "chooseDevice.h"

namespace cms::cuda::impl {
FwkContextBase::FwkContextBase(edm::StreamID streamID) : FwkContextBase(chooseDevice(streamID)) {}

FwkContextBase::FwkContextBase(int device) : currentDevice_(device) { cudaCheck(cudaSetDevice(currentDevice_)); }

FwkContextBase::FwkContextBase(int device, SharedStreamPtr stream)
: currentDevice_(device), stream_(std::make_shared<impl::StreamSharingHelper>(std::move(stream))) {
cudaCheck(cudaSetDevice(currentDevice_));
}

void FwkContextBase::initialize() { stream_ = std::make_shared<impl::StreamSharingHelper>(getStreamCache().get()); }

void FwkContextBase::initialize(const ProductBase& data) {
SharedStreamPtr stream;
if (data.mayReuseStream()) {
stream = data.streamPtr();
} else {
stream = getStreamCache().get();
}
stream_ = std::make_shared<impl::StreamSharingHelper>(std::move(stream));
}
}
71 changes: 71 additions & 0 deletions src/cudadev/CUDACore/FwkContextBase.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#ifndef HeterogeneousCore_CUDACore_FwkContextBase_h
#define HeterogeneousCore_CUDACore_FwkContextBase_h

#include "CUDACore/Context.h"
#include "CUDACore/ProductBase.h"
#include "CUDACore/SharedStreamPtr.h"
#include "Framework/Event.h"

namespace cms::cuda::impl {
/**
* This class is a base class for other Context classes for interacting with the framework
*/
class FwkContextBase {
public:
FwkContextBase(FwkContextBase const&) = delete;
FwkContextBase& operator=(FwkContextBase const&) = delete;
FwkContextBase(FwkContextBase&&) = delete;
FwkContextBase& operator=(FwkContextBase&&) = delete;

int device() const { return currentDevice_; }

cudaStream_t stream() {
if (not isInitialized()) {
initialize();
}
return stream_->streamPtr().get();
}
const SharedStreamPtr& streamPtr() {
if (not isInitialized()) {
initialize();
}
return stream_->streamPtr();
}

operator HostAllocatorContext() { return HostAllocatorContext(stream()); }
operator DeviceAllocatorContext() { return DeviceAllocatorContext(stream()); }
operator Context() { return Context(stream()); }

protected:
// The constructors set the current device, but the device
// is not set back to the previous value at the destructor. This
// should be sufficient (and tiny bit faster) as all CUDA API
// functions relying on the current device should be called from
// the scope where this context is. The current device doesn't
// really matter between modules (or across TBB tasks).
explicit FwkContextBase(edm::StreamID streamID);

explicit FwkContextBase(int device);

// meant only for testing
explicit FwkContextBase(int device, SharedStreamPtr stream);

bool isInitialized() const { return bool(stream_); }

void initialize();
void initialize(const ProductBase& data);

const std::shared_ptr<impl::StreamSharingHelper>& streamSharingHelper() {
if (not isInitialized()) {
initialize();
}
return stream_;
}

private:
int currentDevice_ = -1;
std::shared_ptr<impl::StreamSharingHelper> stream_;
};
}

#endif
25 changes: 25 additions & 0 deletions src/cudadev/CUDACore/FwkContextHolderHelper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#ifndef HeterogeneousCore_CUDACore_FwkContextHolderHelper_h
#define HeterogeneousCore_CUDACore_FwkContextHolderHelper_h

namespace cms::cuda::impl {
class FwkContextHolderHelper {
public:
FwkContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device)
: waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {}

template <typename F>
void pushNextTask(F&& f);

void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
waitingTaskHolder_ = std::move(waitingTaskHolder);
}

void enqueueCallback(cudaStream_t stream);

private:
edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
int device_;
};
}

#endif
6 changes: 6 additions & 0 deletions src/cudadev/CUDACore/ProduceContext.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#include "CUDACore/ProduceContext.h"
#include "CUDACore/cudaCheck.h"

namespace cms::cuda {
void ProduceContext::commit() { cudaCheck(cudaEventRecord(event_.get(), stream())); }
}
Loading