-
Notifications
You must be signed in to change notification settings - Fork 35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[cudadev][RFC] Prototype (host|device)_unique_ptr API to use lightweight "Context" object instead of CUDA stream #256
Closed
makortel
wants to merge
12
commits into
cms-patatrack:master
from
makortel:cudadev_next_api_v3_allocator_via_context_v3
Closed
Changes from all commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
06cc8f2
Add Context class hierarchy
makortel 44b1429
Add EDProducer base class, and runAcquire() and runProduce() functions
makortel 70341c6
Migrate code from ScopedContext to Context
makortel df786db
Avoid using ContextState
makortel 8b3187a
Remove ScopedContext as obsolete
makortel 4cd8dab
Share the CUDA stream between Event products from the same EDProducer…
makortel 55effec
Split Context.h into one class per file
makortel e5b8bdd
Migrate code away from Context.h
makortel eaf55b2
Remove Context.h
makortel 744c94f
Add lightweight Context class with memory allocation oriented counter…
makortel 0429ad5
Add TestContext for tests
makortel c91e0e3
Migrate make_(device|host)_unique and all calling code to pass cms::c…
makortel File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#include "CUDACore/AcquireContext.h" | ||
|
||
namespace cms::cuda { | ||
void AcquireContext::commit() { holderHelper_.enqueueCallback(stream()); } | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#ifndef HeterogeneousCore_CUDACore_AcquireContext_h | ||
#define HeterogeneousCore_CUDACore_AcquireContext_h | ||
|
||
#include "CUDACore/EDGetterContextBase.h" | ||
#include "CUDACore/TaskContext.h" | ||
|
||
namespace cms::cuda { | ||
/** | ||
* The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): | ||
* - setting the current device | ||
* - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary | ||
* - synchronizing between CUDA streams if necessary | ||
* Users should not, however, construct it explicitly. | ||
*/ | ||
class AcquireContext : public impl::EDGetterContextBase { | ||
public: | ||
explicit AcquireContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) | ||
: EDGetterContextBase(streamID), holderHelper_{std::move(waitingTaskHolder), device()} {} | ||
~AcquireContext() = default; | ||
|
||
template <typename F> | ||
void pushNextTask(F&& f) { | ||
holderHelper_.pushNextTask(std::forward<F>(f)); | ||
} | ||
|
||
void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { | ||
holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); | ||
} | ||
|
||
// internal API | ||
void commit(); | ||
|
||
private: | ||
impl::FwkContextHolderHelper holderHelper_; | ||
}; | ||
|
||
template <typename F> | ||
void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, F func) { | ||
AcquireContext context(streamID, std::move(holder)); | ||
func(context); | ||
context.commit(); | ||
} | ||
} | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#ifndef HeterogeneousCore_CUDACore_AnalyzeContext_h | ||
#define HeterogeneousCore_CUDACore_AnalyzeContext_h | ||
|
||
#include "CUDACore/EDGetterContextBase.h" | ||
|
||
namespace cms::cuda { | ||
/** | ||
* The aim of this class is to do necessary per-event "initialization" in analyze() | ||
* - setting the current device | ||
* - synchronizing between CUDA streams if necessary | ||
* and enforce that those get done in a proper way in RAII fashion. | ||
*/ | ||
class AnalyzeContext : public impl::EDGetterContextBase { | ||
public: | ||
/// Constructor to (possibly) re-use a CUDA stream | ||
explicit AnalyzeContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {} | ||
}; | ||
} | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
#ifndef HeterogeneousCore_CUDAUtilities_Context_h | ||
#define HeterogeneousCore_CUDAUtilities_Context_h | ||
|
||
#include "CUDACore/allocate_device.h" | ||
#include "CUDACore/allocate_host.h" | ||
|
||
namespace cms::cuda { | ||
class HostAllocatorContext { | ||
public: | ||
explicit HostAllocatorContext(cudaStream_t stream) : stream_(stream) {} | ||
|
||
void *allocate_host(size_t nbytes) const { return cms::cuda::allocate_host(nbytes, stream_); } | ||
|
||
void free_host(void *ptr) const { cms::cuda::free_host(ptr); } | ||
|
||
private: | ||
cudaStream_t stream_; | ||
}; | ||
|
||
class DeviceAllocatorContext { | ||
public: | ||
explicit DeviceAllocatorContext(cudaStream_t stream) : stream_(stream) {} | ||
|
||
void *allocate_device(size_t nbytes) const { return cms::cuda::allocate_device(nbytes, stream_); } | ||
|
||
void free_device(void *ptr) const { cms::cuda::free_device(ptr, stream_); } | ||
|
||
private: | ||
cudaStream_t stream_; | ||
}; | ||
|
||
class Context { | ||
public: | ||
explicit Context(cudaStream_t stream) : stream_(stream) {} | ||
|
||
cudaStream_t stream() const { return stream_; } | ||
|
||
operator HostAllocatorContext() const { return HostAllocatorContext(stream()); } | ||
operator DeviceAllocatorContext() const { return DeviceAllocatorContext(stream()); } | ||
|
||
private: | ||
cudaStream_t stream_; | ||
}; | ||
} // namespace cms::cuda | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#include "CUDACore/EDGetterContextBase.h" | ||
#include "CUDACore/cudaCheck.h" | ||
|
||
namespace cms::cuda::impl { | ||
void EDGetterContextBase::synchronizeStreams(int dataDevice, | ||
cudaStream_t dataStream, | ||
bool available, | ||
cudaEvent_t dataEvent) { | ||
if (dataDevice != device()) { | ||
// Eventually replace with prefetch to current device (assuming unified memory works) | ||
// If we won't go to unified memory, need to figure out something else... | ||
throw std::runtime_error("Handling data from multiple devices is not yet supported"); | ||
} | ||
|
||
if (dataStream != stream()) { | ||
// Different streams, need to synchronize | ||
if (not available) { | ||
// Event not yet occurred, so need to add synchronization | ||
// here. Sychronization is done by making the CUDA stream to | ||
// wait for an event, so all subsequent work in the stream | ||
// will run only after the event has "occurred" (i.e. data | ||
// product became available). | ||
cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event"); | ||
} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
#ifndef HeterogeneousCore_CUDACore_EDGetterContextBase_h | ||
#define HeterogeneousCore_CUDACore_EDGetterContextBase_h | ||
|
||
#include "CUDACore/FwkContextBase.h" | ||
#include "CUDACore/Product.h" | ||
#include "Framework/EDGetToken.h" | ||
|
||
namespace cms::cuda::impl { | ||
/** | ||
* This class is a base class for Context classes that should be | ||
* able to read Event Data products | ||
*/ | ||
class EDGetterContextBase : public FwkContextBase { | ||
public: | ||
template <typename T> | ||
const T& get(const Product<T>& data) { | ||
if (not isInitialized()) { | ||
initialize(data); | ||
} | ||
synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); | ||
return data.data_; | ||
} | ||
|
||
template <typename T> | ||
const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) { | ||
return get(iEvent.get(token)); | ||
} | ||
|
||
protected: | ||
template <typename... Args> | ||
EDGetterContextBase(Args&&... args) : FwkContextBase(std::forward<Args>(args)...) {} | ||
|
||
private: | ||
void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); | ||
}; | ||
} | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#include "CUDACore/EDProducer.h" | ||
|
||
namespace cms::cuda { | ||
void EDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) { | ||
runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); }); | ||
} | ||
|
||
void SynchronizingEDProducer::acquire(edm::Event const& event, | ||
edm::EventSetup const& eventSetup, | ||
edm::WaitingTaskWithArenaHolder holder) { | ||
runAcquire(event.streamID(), std::move(holder), [&](auto& ctx) { acquire(event, eventSetup, ctx); }); | ||
} | ||
|
||
void SynchronizingEDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) { | ||
runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); }); | ||
} | ||
} // namespace cms::cuda |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#ifndef HeterogeneousCore_CUDACore_stream_EDProducer_h | ||
#define HeterogeneousCore_CUDACore_stream_EDProducer_h | ||
|
||
#include "Framework/EDProducer.h" | ||
#include "CUDACore/AcquireContext.h" | ||
#include "CUDACore/ProduceContext.h" | ||
|
||
namespace cms::cuda { | ||
class EDProducer : public edm::EDProducer { | ||
public: | ||
void produce(edm::Event& event, edm::EventSetup const& eventSetup) override; | ||
virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0; | ||
}; | ||
|
||
class SynchronizingEDProducer : public edm::EDProducerExternalWork { | ||
public: | ||
void acquire(edm::Event const& event, | ||
edm::EventSetup const& eventSetup, | ||
edm::WaitingTaskWithArenaHolder holder) override; | ||
void produce(edm::Event& event, edm::EventSetup const& eventSetup) override; | ||
|
||
virtual void acquire(edm::Event const& event, edm::EventSetup const& eventSetup, AcquireContext& context) = 0; | ||
virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0; | ||
}; | ||
} // namespace cms::cuda | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#include "CUDACore/FwkContextBase.h" | ||
#include "CUDACore/StreamCache.h" | ||
#include "CUDACore/cudaCheck.h" | ||
|
||
#include "chooseDevice.h" | ||
|
||
namespace cms::cuda::impl { | ||
FwkContextBase::FwkContextBase(edm::StreamID streamID) : FwkContextBase(chooseDevice(streamID)) {} | ||
|
||
FwkContextBase::FwkContextBase(int device) : currentDevice_(device) { cudaCheck(cudaSetDevice(currentDevice_)); } | ||
|
||
FwkContextBase::FwkContextBase(int device, SharedStreamPtr stream) | ||
: currentDevice_(device), stream_(std::make_shared<impl::StreamSharingHelper>(std::move(stream))) { | ||
cudaCheck(cudaSetDevice(currentDevice_)); | ||
} | ||
|
||
void FwkContextBase::initialize() { stream_ = std::make_shared<impl::StreamSharingHelper>(getStreamCache().get()); } | ||
|
||
void FwkContextBase::initialize(const ProductBase& data) { | ||
SharedStreamPtr stream; | ||
if (data.mayReuseStream()) { | ||
stream = data.streamPtr(); | ||
} else { | ||
stream = getStreamCache().get(); | ||
} | ||
stream_ = std::make_shared<impl::StreamSharingHelper>(std::move(stream)); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
#ifndef HeterogeneousCore_CUDACore_FwkContextBase_h | ||
#define HeterogeneousCore_CUDACore_FwkContextBase_h | ||
|
||
#include "CUDACore/Context.h" | ||
#include "CUDACore/ProductBase.h" | ||
#include "CUDACore/SharedStreamPtr.h" | ||
#include "Framework/Event.h" | ||
|
||
namespace cms::cuda::impl { | ||
/** | ||
* This class is a base class for other Context classes for interacting with the framework | ||
*/ | ||
class FwkContextBase { | ||
public: | ||
FwkContextBase(FwkContextBase const&) = delete; | ||
FwkContextBase& operator=(FwkContextBase const&) = delete; | ||
FwkContextBase(FwkContextBase&&) = delete; | ||
FwkContextBase& operator=(FwkContextBase&&) = delete; | ||
|
||
int device() const { return currentDevice_; } | ||
|
||
cudaStream_t stream() { | ||
if (not isInitialized()) { | ||
initialize(); | ||
} | ||
return stream_->streamPtr().get(); | ||
} | ||
const SharedStreamPtr& streamPtr() { | ||
if (not isInitialized()) { | ||
initialize(); | ||
} | ||
return stream_->streamPtr(); | ||
} | ||
|
||
operator HostAllocatorContext() { return HostAllocatorContext(stream()); } | ||
operator DeviceAllocatorContext() { return DeviceAllocatorContext(stream()); } | ||
operator Context() { return Context(stream()); } | ||
|
||
protected: | ||
// The constructors set the current device, but the device | ||
// is not set back to the previous value at the destructor. This | ||
// should be sufficient (and tiny bit faster) as all CUDA API | ||
// functions relying on the current device should be called from | ||
// the scope where this context is. The current device doesn't | ||
// really matter between modules (or across TBB tasks). | ||
explicit FwkContextBase(edm::StreamID streamID); | ||
|
||
explicit FwkContextBase(int device); | ||
|
||
// meant only for testing | ||
explicit FwkContextBase(int device, SharedStreamPtr stream); | ||
|
||
bool isInitialized() const { return bool(stream_); } | ||
|
||
void initialize(); | ||
void initialize(const ProductBase& data); | ||
|
||
const std::shared_ptr<impl::StreamSharingHelper>& streamSharingHelper() { | ||
if (not isInitialized()) { | ||
initialize(); | ||
} | ||
return stream_; | ||
} | ||
|
||
private: | ||
int currentDevice_ = -1; | ||
std::shared_ptr<impl::StreamSharingHelper> stream_; | ||
}; | ||
} | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#ifndef HeterogeneousCore_CUDACore_FwkContextHolderHelper_h | ||
#define HeterogeneousCore_CUDACore_FwkContextHolderHelper_h | ||
|
||
namespace cms::cuda::impl { | ||
class FwkContextHolderHelper { | ||
public: | ||
FwkContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device) | ||
: waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {} | ||
|
||
template <typename F> | ||
void pushNextTask(F&& f); | ||
|
||
void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { | ||
waitingTaskHolder_ = std::move(waitingTaskHolder); | ||
} | ||
|
||
void enqueueCallback(cudaStream_t stream); | ||
|
||
private: | ||
edm::WaitingTaskWithArenaHolder waitingTaskHolder_; | ||
int device_; | ||
}; | ||
} | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
#include "CUDACore/ProduceContext.h" | ||
#include "CUDACore/cudaCheck.h" | ||
|
||
namespace cms::cuda { | ||
void ProduceContext::commit() { cudaCheck(cudaEventRecord(event_.get(), stream())); } | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right now (and possibly forever in
cudadev
) theHostAllocatorContext
andDeviceAllocatorContext
look nearly identical, but in the future (in CMSSW) they could hold a pointer to theCachingHostAllocator
/CachingDeviceAllocator
objects.