From 06cc8f2eb874c6496ca7b1489a469bfbf4f5ec06 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Mon, 23 Aug 2021 06:44:58 -0700
Subject: [PATCH 01/12] Add Context class hierarchy

---
 src/cudadev/CUDACore/Context.cc     | 103 ++++++++++++
 src/cudadev/CUDACore/Context.h      | 235 ++++++++++++++++++++++++++++
 src/cudadev/CUDACore/ContextState.h |   3 +
 src/cudadev/CUDACore/Product.h      |   5 +-
 src/cudadev/CUDACore/ProductBase.h  |   5 +-
 5 files changed, 349 insertions(+), 2 deletions(-)
 create mode 100644 src/cudadev/CUDACore/Context.cc
 create mode 100644 src/cudadev/CUDACore/Context.h
diff --git a/src/cudadev/CUDACore/Context.cc b/src/cudadev/CUDACore/Context.cc
new file mode 100644
index 000000000..5d1a46dff
--- /dev/null
+++ b/src/cudadev/CUDACore/Context.cc
@@ -0,0 +1,103 @@
+#include "CUDACore/Context.h"
+
+#include "CUDACore/StreamCache.h"
+#include "CUDACore/cudaCheck.h"
+
+#include "chooseDevice.h"
+
+namespace {
+  struct CallbackData {
+    edm::WaitingTaskWithArenaHolder holder;
+    int device;
+  };
+
+  void CUDART_CB cudaContextCallback(cudaStream_t streamId, cudaError_t status, void* data) {
+    std::unique_ptr<CallbackData> guard{reinterpret_cast<CallbackData*>(data)};
+    edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder;
+    int device = guard->device;
+    if (status == cudaSuccess) {
+      //std::cout << " GPU kernel finished (in callback) device " << device << " CUDA stream "
+      //          << streamId << std::endl;
+      waitingTaskHolder.doneWaiting(nullptr);
+    } else {
+      // wrap the exception in a try-catch block to let GDB "catch throw" break on it
+      try {
+        auto error = cudaGetErrorName(status);
+        auto message = cudaGetErrorString(status);
+        throw std::runtime_error("Callback of CUDA stream " +
+                                 std::to_string(reinterpret_cast<unsigned long>(streamId)) + " in device " +
+                                 std::to_string(device) + " error " + std::string(error) + ": " + std::string(message));
+      } catch (std::exception&) {
+        waitingTaskHolder.doneWaiting(std::current_exception());
+      }
+    }
+  }
+}  // namespace
+
+namespace cms::cuda {
+  namespace impl {
+    Context::Context(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) {
+      cudaCheck(cudaSetDevice(currentDevice_));
+    }
+
+    Context::Context(int device, SharedStreamPtr stream) : currentDevice_(device), stream_(std::move(stream)) {
+      cudaCheck(cudaSetDevice(currentDevice_));
+    }
+
+    void Context::initialize() { stream_ = getStreamCache().get(); }
+
+    void Context::initialize(const ProductBase& data) {
+      if (data.mayReuseStream()) {
+        stream_ = data.streamPtr();
+      } else {
+        stream_ = getStreamCache().get();
+      }
+    }
+
+    ////////////////////
+
+    void ContextGetterBase::synchronizeStreams(int dataDevice,
+                                               cudaStream_t dataStream,
+                                               bool available,
+                                               cudaEvent_t dataEvent) {
+      if (dataDevice != device()) {
+        // Eventually replace with prefetch to current device (assuming unified memory works)
+        // If we won't go to unified memory, need to figure out something else...
+        throw std::runtime_error("Handling data from multiple devices is not yet supported");
+      }
+
+      if (dataStream != stream()) {
+        // Different streams, need to synchronize
+        if (not available) {
+          // Event not yet occurred, so need to add synchronization
+          // here. Sychronization is done by making the CUDA stream to
+          // wait for an event, so all subsequent work in the stream
+          // will run only after the event has "occurred" (i.e. data
+          // product became available).
+          cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event");
+        }
+      }
+    }
+
+    ////////////////////
+
+    void ContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) {
+      cudaCheck(cudaStreamAddCallback(stream, cudaContextCallback, new CallbackData{waitingTaskHolder_, device}, 0));
+    }
+  }  // namespace impl
+
+  ////////////////////
+
+  void AcquireContext::commit() {
+    holderHelper_.enqueueCallback(device(), stream());
+    contextState_.set(device(), streamPtr());
+  }
+
+  ////////////////////
+
+  void ProduceContext::commit() { cudaCheck(cudaEventRecord(event_.get(), stream())); }
+
+  ////////////////////
+
+  void TaskContext::commit() { holderHelper_.enqueueCallback(device(), stream()); }
+}  // namespace cms::cuda
diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h
new file mode 100644
index 000000000..6fe87dcc5
--- /dev/null
+++ b/src/cudadev/CUDACore/Context.h
@@ -0,0 +1,235 @@
+#ifndef HeterogeneousCore_CUDACore_Context_h
+#define HeterogeneousCore_CUDACore_Context_h
+
+#include "CUDACore/Product.h"
+#include "Framework/WaitingTaskWithArenaHolder.h"
+#include "Framework/Event.h"
+#include "Framework/EDGetToken.h"
+#include "Framework/EDPutToken.h"
+#include "CUDACore/ContextState.h"
+#include "CUDACore/EventCache.h"
+#include "CUDACore/SharedEventPtr.h"
+#include "CUDACore/SharedStreamPtr.h"
+
+namespace cms::cuda {
+  namespace impl {
+    // This class is intended to be derived by other Context*, not for general use
+    class Context {
+    public:
+      Context(Context const&) = delete;
+      Context& operator=(Context const&) = delete;
+      Context(Context&&) = delete;
+      Context& operator=(Context&&) = delete;
+
+      int device() const { return currentDevice_; }
+
+      // cudaStream_t is a pointer to a thread-safe object, for which a
+      // mutable access is needed even if the Context itself
+      // would be const. Therefore it is ok to return a non-const
+      // pointer from a const method here.
+      cudaStream_t stream() {
+        if (not isInitialized()) {
+          initialize();
+        }
+        return stream_.get();
+      }
+      const SharedStreamPtr& streamPtr() {
+        if (not isInitialized()) {
+          initialize();
+        }
+        return stream_;
+      }
+
+    protected:
+      // The constructors set the current device, but the device
+      // is not set back to the previous value at the destructor. This
+      // should be sufficient (and tiny bit faster) as all CUDA API
+      // functions relying on the current device should be called from
+      // the scope where this context is. The current device doesn't
+      // really matter between modules (or across TBB tasks).
+      explicit Context(edm::StreamID streamID);
+
+      explicit Context(int device, SharedStreamPtr stream);
+
+      bool isInitialized() const { return bool(stream_); }
+
+      void initialize();
+      void initialize(const ProductBase& data);
+
+    private:
+      int currentDevice_ = -1;
+      SharedStreamPtr stream_;
+    };
+
+    class ContextGetterBase : public Context {
+    public:
+      template <typename T>
+      const T& get(const Product<T>& data) {
+        if (not isInitialized()) {
+          initialize(data);
+        }
+        synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
+        return data.data_;
+      }
+
+      template <typename T>
+      const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) {
+        return get(iEvent.get(token));
+      }
+
+    protected:
+      template <typename... Args>
+      ContextGetterBase(Args&&... args) : Context(std::forward<Args>(args)...) {}
+
+    private:
+      void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent);
+    };
+
+    class ContextHolderHelper {
+    public:
+      ContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+          : waitingTaskHolder_{std::move(waitingTaskHolder)} {}
+
+      template <typename F>
+      void pushNextTask(F&& f, ContextState const* state);
+
+      void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+        waitingTaskHolder_ = std::move(waitingTaskHolder);
+      }
+
+      void enqueueCallback(int device, cudaStream_t stream);
+
+    private:
+      edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
+    };
+  }  // namespace impl
+
+  /**
+     * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire():
+     * - setting the current device
+     * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+     * - synchronizing between CUDA streams if necessary
+     * Users should not, however, construct it explicitly.
+     */
+  class AcquireContext : public impl::ContextGetterBase {
+  public:
+    explicit AcquireContext(edm::StreamID streamID,
+                            edm::WaitingTaskWithArenaHolder waitingTaskHolder,
+                            ContextState& state)
+        : ContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{state} {}
+    ~AcquireContext() = default;
+
+    template <typename F>
+    void pushNextTask(F&& f) {
+      holderHelper_.pushNextTask(std::forward<F>(f), contextState_);
+    }
+
+    void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+      holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+    }
+
+    // internal API
+    void commit();
+
+  private:
+    impl::ContextHolderHelper holderHelper_;
+    ContextState& contextState_;
+  };
+
+  /**
+   * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce():
+   * - setting the current device
+   * - synchronizing between CUDA streams if necessary
+   * Users should not, however, construct it explicitly.
+   */
+  class ProduceContext : public impl::ContextGetterBase {
+  public:
+    /// Constructor to create a new CUDA stream (non-ExternalWork module)
+    explicit ProduceContext(edm::StreamID streamID) : ContextGetterBase(streamID) {}
+
+    /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
+    explicit ProduceContext(ContextState& state) : ContextGetterBase(state.device(), state.releaseStreamPtr()) {}
+
+    ~ProduceContext() = default;
+
+    template <typename T>
+    std::unique_ptr<Product<T>> wrap(T data) {
+      // make_unique doesn't work because of private constructor
+      return std::unique_ptr<Product<T>>(new Product<T>(device(), streamPtr(), event_, std::move(data)));
+    }
+
+    template <typename T, typename... Args>
+    auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) {
+      return iEvent.emplace(token, device(), streamPtr(), event_, std::forward<Args>(args)...);
+    }
+
+    // internal API
+    void commit();
+
+  private:
+    // This construcor is only meant for testing
+    explicit ProduceContext(int device, SharedStreamPtr stream, SharedEventPtr event)
+        : ContextGetterBase(device, std::move(stream)), event_{std::move(event)} {}
+
+    // create the CUDA Event upfront to catch possible errors from its creation
+    SharedEventPtr event_ = getEventCache().get();
+  };
+
+  /**
+   * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire():
+   * - setting the current device
+   * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+   */
+  class TaskContext : public impl::Context {
+  public:
+    /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
+    explicit TaskContext(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+        : Context(state->device(), state->streamPtr()),  // don't move, state is re-used afterwards
+          holderHelper_{std::move(waitingTaskHolder)},
+          contextState_{state} {}
+
+    ~TaskContext() = default;
+
+    template <typename F>
+    void pushNextTask(F&& f) {
+      holderHelper_.pushNextTask(std::forward<F>(f), contextState_);
+    }
+
+    void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+      holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+    }
+
+    // Internal API
+    void commit();
+
+  private:
+    impl::ContextHolderHelper holderHelper_;
+    ContextState const* contextState_;
+  };
+
+  /**
+   * The aim of this class is to do necessary per-event "initialization" in analyze()
+   * - setting the current device
+   * - synchronizing between CUDA streams if necessary
+   * and enforce that those get done in a proper way in RAII fashion.
+   */
+  class AnalyzeContext : public impl::ContextGetterBase {
+  public:
+    /// Constructor to (possibly) re-use a CUDA stream
+    explicit AnalyzeContext(edm::StreamID streamID) : ContextGetterBase(streamID) {}
+  };
+
+  namespace impl {
+    template <typename F>
+    void ContextHolderHelper::pushNextTask(F&& f, ContextState const* state) {
+      replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{
+          edm::make_waiting_task_with_holder(tbb::task::allocate_root(),
+                                             std::move(waitingTaskHolder_),
+                                             [state, func = std::forward<F>(f)](edm::WaitingTaskWithArenaHolder h) {
+                                               func(TaskContext{state, std::move(h)});
+                                             })});
+    }
+  }  // namespace impl
+}  // namespace cms::cuda
+
+#endif
diff --git a/src/cudadev/CUDACore/ContextState.h b/src/cudadev/CUDACore/ContextState.h
index 3c73054ab..b0109e33d 100644
--- a/src/cudadev/CUDACore/ContextState.h
+++ b/src/cudadev/CUDACore/ContextState.h
@@ -26,6 +26,9 @@ namespace cms {
       friend class ScopedContextAcquire;
       friend class ScopedContextProduce;
       friend class ScopedContextTask;
+      friend class AcquireContext;
+      friend class ProduceContext;
+      friend class TaskContext;
 
       void set(int device, SharedStreamPtr stream) {
         throwIfStream();
diff --git a/src/cudadev/CUDACore/Product.h b/src/cudadev/CUDACore/Product.h
index c60e994f0..5d9cab481 100644
--- a/src/cudadev/CUDACore/Product.h
+++ b/src/cudadev/CUDACore/Product.h
@@ -14,7 +14,8 @@ namespace cms {
   namespace cuda {
     namespace impl {
       class ScopedContextGetterBase;
-    }
+      class ContextGetterBase;
+    }  // namespace impl
 
     /**
      * The purpose of this class is to wrap CUDA data to edm::Event in a
@@ -43,6 +44,8 @@ namespace cms {
     private:
       friend class impl::ScopedContextGetterBase;
       friend class ScopedContextProduce;
+      friend class impl::ContextGetterBase;
+      friend class ProduceContext;
       friend class edm::Wrapper<Product<T>>;
 
       explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data)
diff --git a/src/cudadev/CUDACore/ProductBase.h b/src/cudadev/CUDACore/ProductBase.h
index cb3fd4db9..00d12b3fb 100644
--- a/src/cudadev/CUDACore/ProductBase.h
+++ b/src/cudadev/CUDACore/ProductBase.h
@@ -11,7 +11,8 @@ namespace cms {
   namespace cuda {
     namespace impl {
       class ScopedContextBase;
-    }
+      class Context;
+    }  // namespace impl
 
     /**
      * Base class for all instantiations of CUDA<T> to hold the
@@ -61,6 +62,8 @@ namespace cms {
     private:
       friend class impl::ScopedContextBase;
       friend class ScopedContextProduce;
+      friend class impl::Context;
+      friend class ProduceContext;
 
       // The following function is intended to be used only from ScopedContext
       const SharedStreamPtr& streamPtr() const { return stream_; }

From 44b14290e229da447659baec257a1ad55583dba2 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Mon, 23 Aug 2021 06:59:14 -0700
Subject: [PATCH 02/12] Add EDProducer base class, and runAcquire() and
 runProduce() functions

Add runAcquire(), runProduce() functions
---
 src/cudadev/CUDACore/Context.h     | 21 +++++++++++++++++++++
 src/cudadev/CUDACore/EDProducer.cc | 17 +++++++++++++++++
 src/cudadev/CUDACore/EDProducer.h  | 29 +++++++++++++++++++++++++++++
 3 files changed, 67 insertions(+)
 create mode 100644 src/cudadev/CUDACore/EDProducer.cc
 create mode 100644 src/cudadev/CUDACore/EDProducer.h

diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h
index 6fe87dcc5..b85d29774 100644
--- a/src/cudadev/CUDACore/Context.h
+++ b/src/cudadev/CUDACore/Context.h
@@ -230,6 +230,27 @@ namespace cms::cuda {
                                              })});
     }
   }  // namespace impl
+
+  template <typename F>
+  void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, ContextState& state, F func) {
+    AcquireContext context(streamID, std::move(holder), state);
+    func(context);
+    context.commit();
+  }
+
+  template <typename F>
+  void runProduce(edm::StreamID streamID, F func) {
+    ProduceContext context(streamID);
+    func(context);
+    context.commit();
+  }
+
+  template <typename F>
+  void runProduce(ContextState& state, F func) {
+    ProduceContext context(state);
+    func(context);
+    context.commit();
+  }
 }  // namespace cms::cuda
 
 #endif
diff --git a/src/cudadev/CUDACore/EDProducer.cc b/src/cudadev/CUDACore/EDProducer.cc
new file mode 100644
index 000000000..d41d77c31
--- /dev/null
+++ b/src/cudadev/CUDACore/EDProducer.cc
@@ -0,0 +1,17 @@
+#include "CUDACore/EDProducer.h"
+
+namespace cms::cuda {
+  void EDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) {
+    runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); });
+  }
+
+  void SynchronizingEDProducer::acquire(edm::Event const& event,
+                                        edm::EventSetup const& eventSetup,
+                                        edm::WaitingTaskWithArenaHolder holder) {
+    runAcquire(event.streamID(), std::move(holder), state_, [&](auto& ctx) { acquire(event, eventSetup, ctx); });
+  }
+
+  void SynchronizingEDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) {
+    runProduce(state_, [&](auto& ctx) { produce(event, eventSetup, ctx); });
+  }
+}  // namespace cms::cuda
diff --git a/src/cudadev/CUDACore/EDProducer.h b/src/cudadev/CUDACore/EDProducer.h
new file mode 100644
index 000000000..3fc917285
--- /dev/null
+++ b/src/cudadev/CUDACore/EDProducer.h
@@ -0,0 +1,29 @@
+#ifndef HeterogeneousCore_CUDACore_stream_EDProducer_h
+#define HeterogeneousCore_CUDACore_stream_EDProducer_h
+
+#include "Framework/EDProducer.h"
+#include "CUDACore/Context.h"
+
+namespace cms::cuda {
+  class EDProducer : public edm::EDProducer {
+  public:
+    void produce(edm::Event& event, edm::EventSetup const& eventSetup) override;
+    virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0;
+  };
+
+  class SynchronizingEDProducer : public edm::EDProducerExternalWork {
+  public:
+    void acquire(edm::Event const& event,
+                 edm::EventSetup const& eventSetup,
+                 edm::WaitingTaskWithArenaHolder holder) override;
+    void produce(edm::Event& event, edm::EventSetup const& eventSetup) override;
+
+    virtual void acquire(edm::Event const& event, edm::EventSetup const& eventSetup, AcquireContext& context) = 0;
+    virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0;
+
+  private:
+    ContextState state_;
+  };
+}  // namespace cms::cuda
+
+#endif

From 70341c6796050443706a0aa8c5ab5d53f2a3c2c4 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Mon, 23 Aug 2021 10:16:19 -0700
Subject: [PATCH 03/12] Migrate code from ScopedContext to Context

---
 .../plugin-BeamSpotProducer/BeamSpotToCUDA.cc | 11 ++----
 .../PixelTrackSoAFromCUDA.cc                  | 20 ++++------
 .../plugin-PixelTriplets/CAHitNtupletCUDA.cc  | 13 +++----
 .../PixelVertexProducerCUDA.cc                | 13 +++----
 .../PixelVertexSoAFromCUDA.cc                 | 20 ++++------
 .../SiPixelRawToClusterCUDA.cc                | 24 +++++-------
 .../SiPixelDigisSoAFromCUDA.cc                | 19 ++++------
 .../SiPixelRecHitCUDA.cc                      | 14 +++----
 .../plugin-Validation/CountValidator.cc       | 38 +++++++++----------
 .../plugin-Validation/HistoValidator.cc       | 20 ++++------
 10 files changed, 78 insertions(+), 114 deletions(-)

diff --git a/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc b/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc
index 48badcabf..b9633a856 100644
--- a/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc
+++ b/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc
@@ -2,23 +2,22 @@
 
 #include <cuda_runtime.h>
 
+#include "CUDACore/EDProducer.h"
 #include "CUDACore/Product.h"
-#include "CUDACore/ScopedContext.h"
 #include "CUDACore/copyAsync.h"
 #include "CUDACore/host_noncached_unique_ptr.h"
 #include "CUDADataFormats/BeamSpotCUDA.h"
 #include "DataFormats/BeamSpotPOD.h"
-#include "Framework/EDProducer.h"
 #include "Framework/Event.h"
 #include "Framework/EventSetup.h"
 #include "Framework/PluginFactory.h"
 
-class BeamSpotToCUDA : public edm::EDProducer {
+class BeamSpotToCUDA : public cms::cuda::EDProducer {
 public:
   explicit BeamSpotToCUDA(edm::ProductRegistry& reg);
   ~BeamSpotToCUDA() override = default;
 
-  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override;
 
 private:
   const edm::EDPutTokenT<cms::cuda::Product<BeamSpotCUDA>> bsPutToken_;
@@ -30,11 +29,9 @@ BeamSpotToCUDA::BeamSpotToCUDA(edm::ProductRegistry& reg)
     : bsPutToken_{reg.produces<cms::cuda::Product<BeamSpotCUDA>>()},
       bsHost{cms::cuda::make_host_noncached_unique<BeamSpotPOD>(cudaHostAllocWriteCombined)} {}
 
-void BeamSpotToCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
+void BeamSpotToCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) {
   *bsHost = iSetup.get<BeamSpotPOD>();
 
-  cms::cuda::ScopedContextProduce ctx{iEvent.streamID()};
-
   BeamSpotCUDA bsDevice(ctx.stream());
   cms::cuda::copyAsync(bsDevice.ptr(), bsHost, ctx.stream());
 
diff --git a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc
index 408450ea8..137bcb073 100644
--- a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc
+++ b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc
@@ -1,27 +1,25 @@
 #include <cuda_runtime.h>
 
+#include "CUDACore/Context.h"
+#include "CUDACore/EDProducer.h"
 #include "CUDACore/Product.h"
 #include "CUDACore/HostProduct.h"
 #include "CUDADataFormats/PixelTrackHeterogeneous.h"
 #include "Framework/EventSetup.h"
 #include "Framework/Event.h"
 #include "Framework/PluginFactory.h"
-#include "Framework/EDProducer.h"
-#include "CUDACore/ScopedContext.h"
 
 // Switch on to enable checks and printout for found tracks
 #undef PIXEL_DEBUG_PRODUCE
 
-class PixelTrackSoAFromCUDA : public edm::EDProducerExternalWork {
+class PixelTrackSoAFromCUDA : public cms::cuda::SynchronizingEDProducer {
 public:
   explicit PixelTrackSoAFromCUDA(edm::ProductRegistry& reg);
   ~PixelTrackSoAFromCUDA() override = default;
 
 private:
-  void acquire(edm::Event const& iEvent,
-               edm::EventSetup const& iSetup,
-               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
-  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+  void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, cms::cuda::AcquireContext& ctx) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) override;
 
   edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenCUDA_;
   edm::EDPutTokenT<PixelTrackHeterogeneous> tokenSOA_;
@@ -35,15 +33,13 @@ PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(edm::ProductRegistry& reg)
 
 void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
                                     edm::EventSetup const& iSetup,
-                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  cms::cuda::Product<PixelTrackHeterogeneous> const& inputDataWrapped = iEvent.get(tokenCUDA_);
-  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  auto const& inputData = ctx.get(inputDataWrapped);
+                                    cms::cuda::AcquireContext& ctx) {
+  auto const& inputData = ctx.get(iEvent, tokenCUDA_);
 
   soa_ = inputData.toHostAsync(ctx.stream());
 }
 
-void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) {
 #ifdef PIXEL_DEBUG_PRODUCE
   auto const& tsoa = *soa_;
   auto maxTracks = tsoa.stride();
diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc
index 94085d784..aca6bd319 100644
--- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc
+++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc
@@ -1,24 +1,23 @@
 #include <cuda_runtime.h>
 
 #include "CUDACore/Product.h"
+#include "CUDACore/EDProducer.h"
 #include "Framework/EventSetup.h"
 #include "Framework/Event.h"
 #include "Framework/PluginFactory.h"
-#include "Framework/EDProducer.h"
 #include "Framework/RunningAverage.h"
-#include "CUDACore/ScopedContext.h"
 
 #include "CAHitNtupletGeneratorOnGPU.h"
 #include "CUDADataFormats/PixelTrackHeterogeneous.h"
 #include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h"
 
-class CAHitNtupletCUDA : public edm::EDProducer {
+class CAHitNtupletCUDA : public cms::cuda::EDProducer {
 public:
   explicit CAHitNtupletCUDA(edm::ProductRegistry& reg);
   ~CAHitNtupletCUDA() override = default;
 
 private:
-  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override;
 
   edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> tokenHitGPU_;
   edm::EDPutTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenTrackGPU_;
@@ -31,12 +30,10 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(edm::ProductRegistry& reg)
       tokenTrackGPU_{reg.produces<cms::cuda::Product<PixelTrackHeterogeneous>>()},
       gpuAlgo_(reg) {}
 
-void CAHitNtupletCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) {
+void CAHitNtupletCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es, cms::cuda::ProduceContext& ctx) {
   auto bf = 0.0114256972711507;  // 1/fieldInGeV
 
-  auto const& phits = iEvent.get(tokenHitGPU_);
-  cms::cuda::ScopedContextProduce ctx{phits};
-  auto const& hits = ctx.get(phits);
+  auto const& hits = ctx.get(iEvent, tokenHitGPU_);
 
   ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream()));
 }
diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc
index 723f7eb4c..50827a71a 100644
--- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc
+++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc
@@ -6,7 +6,7 @@
 #include "Framework/PluginFactory.h"
 #include "Framework/EDProducer.h"
 #include "Framework/RunningAverage.h"
-#include "CUDACore/ScopedContext.h"
+#include "CUDACore/Context.h"
 
 #include "gpuVertexFinder.h"
 
@@ -58,14 +58,13 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(edm::ProductRegistry& reg)
 }
 
 void PixelVertexProducerCUDA::produceOnGPU(edm::Event& iEvent, const edm::EventSetup& iSetup) {
-  auto const& ptracks = iEvent.get(tokenGPUTrack_);
+  cms::cuda::runProduce(iEvent.streamID(), [&](cms::cuda::ProduceContext& ctx) {
+    auto const* tracks = ctx.get(iEvent, tokenGPUTrack_).get();
 
-  cms::cuda::ScopedContextProduce ctx{ptracks};
-  auto const* tracks = ctx.get(ptracks).get();
+    assert(tracks);
 
-  assert(tracks);
-
-  ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_));
+    ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_));
+  });
 }
 
 void PixelVertexProducerCUDA::produceOnCPU(edm::Event& iEvent, const edm::EventSetup& iSetup) {
diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc
index d709f0c5e..79ad0ca91 100644
--- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc
+++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc
@@ -1,25 +1,23 @@
 #include <cuda_runtime.h>
 
+#include "CUDACore/Context.h"
+#include "CUDACore/EDProducer.h"
 #include "CUDACore/Product.h"
 #include "CUDACore/HostProduct.h"
 #include "CUDADataFormats/ZVertexHeterogeneous.h"
 #include "Framework/EventSetup.h"
 #include "Framework/Event.h"
 #include "Framework/PluginFactory.h"
-#include "Framework/EDProducer.h"
 #include "Framework/RunningAverage.h"
-#include "CUDACore/ScopedContext.h"
 
-class PixelVertexSoAFromCUDA : public edm::EDProducerExternalWork {
+class PixelVertexSoAFromCUDA : public cms::cuda::SynchronizingEDProducer {
 public:
   explicit PixelVertexSoAFromCUDA(edm::ProductRegistry& reg);
   ~PixelVertexSoAFromCUDA() override = default;
 
 private:
-  void acquire(edm::Event const& iEvent,
-               edm::EventSetup const& iSetup,
-               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
-  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+  void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, cms::cuda::AcquireContext& ctx) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) override;
 
   edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenCUDA_;
   edm::EDPutTokenT<ZVertexHeterogeneous> tokenSOA_;
@@ -33,15 +31,13 @@ PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(edm::ProductRegistry& reg)
 
 void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent,
                                      edm::EventSetup const& iSetup,
-                                     edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  auto const& inputDataWrapped = iEvent.get(tokenCUDA_);
-  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  auto const& inputData = ctx.get(inputDataWrapped);
+                                     cms::cuda::AcquireContext& ctx) {
+  auto const& inputData = ctx.get(iEvent, tokenCUDA_);
 
   m_soa = inputData.toHostAsync(ctx.stream());
 }
 
-void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) {
   // No copies....
   iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa)));
 }
diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
index a5229b295..7a2875ad9 100644
--- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
+++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
@@ -4,8 +4,9 @@
 #include <vector>
 
 // CMSSW includes
+#include "CUDACore/Context.h"
+#include "CUDACore/EDProducer.h"
 #include "CUDACore/Product.h"
-#include "CUDACore/ScopedContext.h"
 #include "CUDADataFormats/SiPixelClustersCUDA.h"
 #include "CUDADataFormats/SiPixelDigiErrorsCUDA.h"
 #include "CUDADataFormats/SiPixelDigisCUDA.h"
@@ -16,7 +17,6 @@
 #include "DataFormats/FEDRawData.h"
 #include "DataFormats/FEDRawDataCollection.h"
 #include "DataFormats/SiPixelErrorCompact.h"
-#include "Framework/EDProducer.h"
 #include "Framework/Event.h"
 #include "Framework/EventSetup.h"
 #include "Framework/PluginFactory.h"
@@ -26,18 +26,14 @@
 #include "SiPixelClusterThresholds.h"
 #include "SiPixelRawToClusterGPUKernel.h"
 
-class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork {
+class SiPixelRawToClusterCUDA : public cms::cuda::SynchronizingEDProducer {
 public:
   explicit SiPixelRawToClusterCUDA(edm::ProductRegistry& reg);
   ~SiPixelRawToClusterCUDA() override = default;
 
 private:
-  void acquire(const edm::Event& iEvent,
-               const edm::EventSetup& iSetup,
-               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
-  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;
-
-  cms::cuda::ContextState ctxState_;
+  void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) override;
+  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override;
 
   edm::EDGetTokenT<FEDRawDataCollection> rawGetToken_;
   edm::EDPutTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiPutToken_;
@@ -72,9 +68,7 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(edm::ProductRegistry& reg)
 
 void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
                                       const edm::EventSetup& iSetup,
-                                      edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_};
-
+                                      cms::cuda::AcquireContext& ctx) {
   auto const& hgpuMap = iSetup.get<SiPixelROCsStatusAndMappingWrapper>();
   if (hgpuMap.hasQuality() != useQuality_) {
     throw std::runtime_error(
@@ -170,9 +164,9 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
                              ctx.stream());
 }
 
-void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
-  cms::cuda::ScopedContextProduce ctx{ctxState_};
-
+void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent,
+                                      const edm::EventSetup& iSetup,
+                                      cms::cuda::ProduceContext& ctx) {
   auto tmp = gpuAlgo_.getResults();
   ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first));
   ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second));
diff --git a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc
index 448f4b797..8b90c5326 100644
--- a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc
+++ b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc
@@ -1,23 +1,21 @@
+#include "CUDACore/Context.h"
+#include "CUDACore/EDProducer.h"
 #include "CUDACore/Product.h"
 #include "CUDADataFormats/SiPixelDigisCUDA.h"
 #include "DataFormats/SiPixelDigisSoA.h"
 #include "Framework/EventSetup.h"
 #include "Framework/Event.h"
-#include "Framework/EDProducer.h"
 #include "Framework/PluginFactory.h"
-#include "CUDACore/ScopedContext.h"
 #include "CUDACore/host_unique_ptr.h"
 
-class SiPixelDigisSoAFromCUDA : public edm::EDProducerExternalWork {
+class SiPixelDigisSoAFromCUDA : public cms::cuda::SynchronizingEDProducer {
 public:
   explicit SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg);
   ~SiPixelDigisSoAFromCUDA() override = default;
 
 private:
-  void acquire(const edm::Event& iEvent,
-               const edm::EventSetup& iSetup,
-               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
-  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) override;
+  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) override;
 
   edm::EDGetTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiGetToken_;
   edm::EDPutTokenT<SiPixelDigisSoA> digiPutToken_;
@@ -36,10 +34,7 @@ SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg)
 
 void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent,
                                       const edm::EventSetup& iSetup,
-                                      edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  // Do the transfer in a CUDA stream parallel to the computation CUDA stream
-  cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
-
+                                      cms::cuda::AcquireContext& ctx) {
   const auto& gpuDigis = ctx.get(iEvent, digiGetToken_);
 
   nDigis_ = gpuDigis.nDigis();
@@ -49,7 +44,7 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent,
   clus_ = gpuDigis.clusToHostAsync(ctx.stream());
 }
 
-void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
+void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) {
   // The following line copies the data from the pinned host memory to
   // regular host memory. In principle that feels unnecessary (why not
   // just use the pinned host memory?). There are a few arguments for
diff --git a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc
index 413982fc3..133fa8eaf 100644
--- a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc
+++ b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc
@@ -1,6 +1,7 @@
 #include <cuda_runtime.h>
 
 #include "CUDADataFormats/BeamSpotCUDA.h"
+#include "CUDACore/EDProducer.h"
 #include "CUDACore/Product.h"
 #include "CUDADataFormats/SiPixelClustersCUDA.h"
 #include "CUDADataFormats/SiPixelDigisCUDA.h"
@@ -8,19 +9,17 @@
 #include "Framework/EventSetup.h"
 #include "Framework/Event.h"
 #include "Framework/PluginFactory.h"
-#include "Framework/EDProducer.h"
-#include "CUDACore/ScopedContext.h"
 #include "CondFormats/PixelCPEFast.h"
 
 #include "PixelRecHitGPUKernel.h"
 
-class SiPixelRecHitCUDA : public edm::EDProducer {
+class SiPixelRecHitCUDA : public cms::cuda::EDProducer {
 public:
   explicit SiPixelRecHitCUDA(edm::ProductRegistry& reg);
   ~SiPixelRecHitCUDA() override = default;
 
 private:
-  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override;
 
   // The mess with inputs will be cleaned up when migrating to the new framework
   const edm::EDGetTokenT<cms::cuda::Product<BeamSpotCUDA>> tBeamSpot;
@@ -36,13 +35,10 @@ SiPixelRecHitCUDA::SiPixelRecHitCUDA(edm::ProductRegistry& reg)
       tokenDigi_(reg.consumes<cms::cuda::Product<SiPixelDigisCUDA>>()),
       tokenHit_(reg.produces<cms::cuda::Product<TrackingRecHit2DCUDA>>()) {}
 
-void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) {
+void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es, cms::cuda::ProduceContext& ctx) {
   PixelCPEFast const& fcpe = es.get<PixelCPEFast>();
 
-  auto const& pclusters = iEvent.get(token_);
-  cms::cuda::ScopedContextProduce ctx{pclusters};
-
-  auto const& clusters = ctx.get(pclusters);
+  auto const& clusters = ctx.get(iEvent, token_);
   auto const& digis = ctx.get(iEvent, tokenDigi_);
   auto const& bs = ctx.get(iEvent, tBeamSpot);
 
diff --git a/src/cudadev/plugin-Validation/CountValidator.cc b/src/cudadev/plugin-Validation/CountValidator.cc
index 23352f5ba..1993e20eb 100644
--- a/src/cudadev/plugin-Validation/CountValidator.cc
+++ b/src/cudadev/plugin-Validation/CountValidator.cc
@@ -1,5 +1,5 @@
+#include "CUDACore/Context.h"
 #include "CUDACore/Product.h"
-#include "CUDACore/ScopedContext.h"
 #include "CUDADataFormats/PixelTrackHeterogeneous.h"
 #include "CUDADataFormats/SiPixelClustersCUDA.h"
 #include "CUDADataFormats/SiPixelDigisCUDA.h"
@@ -63,24 +63,24 @@ void CountValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup)
   ss << "Event " << iEvent.eventID() << " ";
 
   {
-    auto const& pdigis = iEvent.get(digiToken_);
-    cms::cuda::ScopedContextProduce ctx{pdigis};
-    auto const& count = iEvent.get(digiClusterCountToken_);
-    auto const& digis = ctx.get(iEvent, digiToken_);
-    auto const& clusters = ctx.get(iEvent, clusterToken_);
-
-    if (digis.nModules() != count.nModules()) {
-      ss << "\n N(modules) is " << digis.nModules() << " expected " << count.nModules();
-      ok = false;
-    }
-    if (digis.nDigis() != count.nDigis()) {
-      ss << "\n N(digis) is " << digis.nDigis() << " expected " << count.nDigis();
-      ok = false;
-    }
-    if (clusters.nClusters() != count.nClusters()) {
-      ss << "\n N(clusters) is " << clusters.nClusters() << " expected " << count.nClusters();
-      ok = false;
-    }
+    cms::cuda::runProduce(iEvent.streamID(), [&](cms::cuda::ProduceContext& ctx) {
+      auto const& count = iEvent.get(digiClusterCountToken_);
+      auto const& digis = ctx.get(iEvent, digiToken_);
+      auto const& clusters = ctx.get(iEvent, clusterToken_);
+
+      if (digis.nModules() != count.nModules()) {
+        ss << "\n N(modules) is " << digis.nModules() << " expected " << count.nModules();
+        ok = false;
+      }
+      if (digis.nDigis() != count.nDigis()) {
+        ss << "\n N(digis) is " << digis.nDigis() << " expected " << count.nDigis();
+        ok = false;
+      }
+      if (clusters.nClusters() != count.nClusters()) {
+        ss << "\n N(clusters) is " << clusters.nClusters() << " expected " << count.nClusters();
+        ok = false;
+      }
+    });
   }
 
   {
diff --git a/src/cudadev/plugin-Validation/HistoValidator.cc b/src/cudadev/plugin-Validation/HistoValidator.cc
index 8a888666b..c23bb42af 100644
--- a/src/cudadev/plugin-Validation/HistoValidator.cc
+++ b/src/cudadev/plugin-Validation/HistoValidator.cc
@@ -1,5 +1,6 @@
+#include "CUDACore/Context.h"
+#include "CUDACore/EDProducer.h"
 #include "CUDACore/Product.h"
-#include "CUDACore/ScopedContext.h"
 #include "CUDADataFormats/PixelTrackHeterogeneous.h"
 #include "CUDADataFormats/SiPixelClustersCUDA.h"
 #include "CUDADataFormats/SiPixelDigisCUDA.h"
@@ -8,22 +9,19 @@
 #include "Framework/EventSetup.h"
 #include "Framework/Event.h"
 #include "Framework/PluginFactory.h"
-#include "Framework/EDProducer.h"
 
 #include "SimpleAtomicHisto.h"
 
 #include <map>
 #include <fstream>
 
-class HistoValidator : public edm::EDProducerExternalWork {
+class HistoValidator : public cms::cuda::SynchronizingEDProducer {
 public:
   explicit HistoValidator(edm::ProductRegistry& reg);
 
 private:
-  void acquire(const edm::Event& iEvent,
-               const edm::EventSetup& iSetup,
-               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
-  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) override;
+  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) override;
   void endJob() override;
 
   edm::EDGetTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiToken_;
@@ -88,11 +86,7 @@ HistoValidator::HistoValidator(edm::ProductRegistry& reg)
       trackToken_(reg.consumes<PixelTrackHeterogeneous>()),
       vertexToken_(reg.consumes<ZVertexHeterogeneous>()) {}
 
-void HistoValidator::acquire(const edm::Event& iEvent,
-                             const edm::EventSetup& iSetup,
-                             edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  auto const& pdigis = iEvent.get(digiToken_);
-  cms::cuda::ScopedContextAcquire ctx{pdigis, std::move(waitingTaskHolder)};
+void HistoValidator::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) {
   auto const& digis = ctx.get(iEvent, digiToken_);
   auto const& clusters = ctx.get(iEvent, clusterToken_);
   auto const& hits = ctx.get(iEvent, hitToken_);
@@ -113,7 +107,7 @@ void HistoValidator::acquire(const edm::Event& iEvent,
   h_size = hits.sizeToHostAsync(ctx.stream());
 }
 
-void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
+void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) {
   histos["digi_n"].fill(nDigis);
   for (uint32_t i = 0; i < nDigis; ++i) {
     histos["digi_adc"].fill(h_adc[i]);

From df786db1d4f7405af6a575dec88c79c3a86bbebc Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Tue, 31 Aug 2021 14:37:24 -0700
Subject: [PATCH 04/12] Avoid using ContextState

---
 src/cudadev/CUDACore/Context.cc    | 17 ++++-----
 src/cudadev/CUDACore/Context.h     | 60 ++++++++++++------------------
 src/cudadev/CUDACore/EDProducer.cc |  4 +-
 src/cudadev/CUDACore/EDProducer.h  |  3 --
 src/cudadev/Framework/Event.h      | 27 +++++++++++++-
 5 files changed, 58 insertions(+), 53 deletions(-)

diff --git a/src/cudadev/CUDACore/Context.cc b/src/cudadev/CUDACore/Context.cc
index 5d1a46dff..80452494d 100644
--- a/src/cudadev/CUDACore/Context.cc
+++ b/src/cudadev/CUDACore/Context.cc
@@ -36,9 +36,9 @@ namespace {
 
 namespace cms::cuda {
   namespace impl {
-    Context::Context(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) {
-      cudaCheck(cudaSetDevice(currentDevice_));
-    }
+    Context::Context(edm::StreamID streamID) : Context(chooseDevice(streamID)) {}
+
+    Context::Context(int device) : currentDevice_(device) { cudaCheck(cudaSetDevice(currentDevice_)); }
 
     Context::Context(int device, SharedStreamPtr stream) : currentDevice_(device), stream_(std::move(stream)) {
       cudaCheck(cudaSetDevice(currentDevice_));
@@ -81,17 +81,14 @@ namespace cms::cuda {
 
     ////////////////////
 
-    void ContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) {
-      cudaCheck(cudaStreamAddCallback(stream, cudaContextCallback, new CallbackData{waitingTaskHolder_, device}, 0));
+    void ContextHolderHelper::enqueueCallback(cudaStream_t stream) {
+      cudaCheck(cudaStreamAddCallback(stream, cudaContextCallback, new CallbackData{waitingTaskHolder_, device_}, 0));
     }
   }  // namespace impl
 
   ////////////////////
 
-  void AcquireContext::commit() {
-    holderHelper_.enqueueCallback(device(), stream());
-    contextState_.set(device(), streamPtr());
-  }
+  void AcquireContext::commit() { holderHelper_.enqueueCallback(stream()); }
 
   ////////////////////
 
@@ -99,5 +96,5 @@ namespace cms::cuda {
 
   ////////////////////
 
-  void TaskContext::commit() { holderHelper_.enqueueCallback(device(), stream()); }
+  void TaskContext::commit() { holderHelper_.enqueueCallback(stream()); }
 }  // namespace cms::cuda
diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h
index b85d29774..73139d13c 100644
--- a/src/cudadev/CUDACore/Context.h
+++ b/src/cudadev/CUDACore/Context.h
@@ -6,7 +6,6 @@
 #include "Framework/Event.h"
 #include "Framework/EDGetToken.h"
 #include "Framework/EDPutToken.h"
-#include "CUDACore/ContextState.h"
 #include "CUDACore/EventCache.h"
 #include "CUDACore/SharedEventPtr.h"
 #include "CUDACore/SharedStreamPtr.h"
@@ -49,6 +48,9 @@ namespace cms::cuda {
       // really matter between modules (or across TBB tasks).
       explicit Context(edm::StreamID streamID);
 
+      explicit Context(int device);
+
+      // meant only for testing
       explicit Context(int device, SharedStreamPtr stream);
 
       bool isInitialized() const { return bool(stream_); }
@@ -87,20 +89,21 @@ namespace cms::cuda {
 
     class ContextHolderHelper {
     public:
-      ContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder)
-          : waitingTaskHolder_{std::move(waitingTaskHolder)} {}
+      ContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device)
+          : waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {}
 
       template <typename F>
-      void pushNextTask(F&& f, ContextState const* state);
+      void pushNextTask(F&& f);
 
       void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
         waitingTaskHolder_ = std::move(waitingTaskHolder);
       }
 
-      void enqueueCallback(int device, cudaStream_t stream);
+      void enqueueCallback(cudaStream_t stream);
 
     private:
       edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
+      int device_;
     };
   }  // namespace impl
 
@@ -113,15 +116,13 @@ namespace cms::cuda {
      */
   class AcquireContext : public impl::ContextGetterBase {
   public:
-    explicit AcquireContext(edm::StreamID streamID,
-                            edm::WaitingTaskWithArenaHolder waitingTaskHolder,
-                            ContextState& state)
-        : ContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{state} {}
+    explicit AcquireContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+        : ContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder), device()} {}
     ~AcquireContext() = default;
 
     template <typename F>
     void pushNextTask(F&& f) {
-      holderHelper_.pushNextTask(std::forward<F>(f), contextState_);
+      holderHelper_.pushNextTask(std::forward<F>(f));
     }
 
     void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
@@ -133,7 +134,6 @@ namespace cms::cuda {
 
   private:
     impl::ContextHolderHelper holderHelper_;
-    ContextState& contextState_;
   };
 
   /**
@@ -144,12 +144,8 @@ namespace cms::cuda {
    */
   class ProduceContext : public impl::ContextGetterBase {
   public:
-    /// Constructor to create a new CUDA stream (non-ExternalWork module)
     explicit ProduceContext(edm::StreamID streamID) : ContextGetterBase(streamID) {}
 
-    /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
-    explicit ProduceContext(ContextState& state) : ContextGetterBase(state.device(), state.releaseStreamPtr()) {}
-
     ~ProduceContext() = default;
 
     template <typename T>
@@ -183,16 +179,14 @@ namespace cms::cuda {
   class TaskContext : public impl::Context {
   public:
     /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
-    explicit TaskContext(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
-        : Context(state->device(), state->streamPtr()),  // don't move, state is re-used afterwards
-          holderHelper_{std::move(waitingTaskHolder)},
-          contextState_{state} {}
+    explicit TaskContext(int device, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+        : Context(device), holderHelper_{std::move(waitingTaskHolder), device} {}
 
     ~TaskContext() = default;
 
     template <typename F>
     void pushNextTask(F&& f) {
-      holderHelper_.pushNextTask(std::forward<F>(f), contextState_);
+      holderHelper_.pushNextTask(std::forward<F>(f));
     }
 
     void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
@@ -204,7 +198,6 @@ namespace cms::cuda {
 
   private:
     impl::ContextHolderHelper holderHelper_;
-    ContextState const* contextState_;
   };
 
   /**
@@ -221,19 +214,19 @@ namespace cms::cuda {
 
   namespace impl {
     template <typename F>
-    void ContextHolderHelper::pushNextTask(F&& f, ContextState const* state) {
-      replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{
-          edm::make_waiting_task_with_holder(tbb::task::allocate_root(),
-                                             std::move(waitingTaskHolder_),
-                                             [state, func = std::forward<F>(f)](edm::WaitingTaskWithArenaHolder h) {
-                                               func(TaskContext{state, std::move(h)});
-                                             })});
+    void ContextHolderHelper::pushNextTask(F&& f) {
+      replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{edm::make_waiting_task_with_holder(
+          tbb::task::allocate_root(),
+          std::move(waitingTaskHolder_),
+          [device = device_, func = std::forward<F>(f)](edm::WaitingTaskWithArenaHolder h) {
+            func(TaskContext{device, std::move(h)});
+          })});
     }
   }  // namespace impl
 
   template <typename F>
-  void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, ContextState& state, F func) {
-    AcquireContext context(streamID, std::move(holder), state);
+  void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, F func) {
+    AcquireContext context(streamID, std::move(holder));
     func(context);
     context.commit();
   }
@@ -244,13 +237,6 @@ namespace cms::cuda {
     func(context);
     context.commit();
   }
-
-  template <typename F>
-  void runProduce(ContextState& state, F func) {
-    ProduceContext context(state);
-    func(context);
-    context.commit();
-  }
 }  // namespace cms::cuda
 
 #endif
diff --git a/src/cudadev/CUDACore/EDProducer.cc b/src/cudadev/CUDACore/EDProducer.cc
index d41d77c31..2e08ea2d2 100644
--- a/src/cudadev/CUDACore/EDProducer.cc
+++ b/src/cudadev/CUDACore/EDProducer.cc
@@ -8,10 +8,10 @@ namespace cms::cuda {
   void SynchronizingEDProducer::acquire(edm::Event const& event,
                                         edm::EventSetup const& eventSetup,
                                         edm::WaitingTaskWithArenaHolder holder) {
-    runAcquire(event.streamID(), std::move(holder), state_, [&](auto& ctx) { acquire(event, eventSetup, ctx); });
+    runAcquire(event.streamID(), std::move(holder), [&](auto& ctx) { acquire(event, eventSetup, ctx); });
   }
 
   void SynchronizingEDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) {
-    runProduce(state_, [&](auto& ctx) { produce(event, eventSetup, ctx); });
+    runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); });
   }
 }  // namespace cms::cuda
diff --git a/src/cudadev/CUDACore/EDProducer.h b/src/cudadev/CUDACore/EDProducer.h
index 3fc917285..6fe6962bd 100644
--- a/src/cudadev/CUDACore/EDProducer.h
+++ b/src/cudadev/CUDACore/EDProducer.h
@@ -20,9 +20,6 @@ namespace cms::cuda {
 
     virtual void acquire(edm::Event const& event, edm::EventSetup const& eventSetup, AcquireContext& context) = 0;
     virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0;
-
-  private:
-    ContextState state_;
   };
 }  // namespace cms::cuda
 
diff --git a/src/cudadev/Framework/Event.h b/src/cudadev/Framework/Event.h
index 9f952c492..68a04e729 100644
--- a/src/cudadev/Framework/Event.h
+++ b/src/cudadev/Framework/Event.h
@@ -9,7 +9,32 @@
 
 // type erasure
 namespace edm {
-  using StreamID = int;
+  class Event;
+
+  class StreamID {
+  public:
+    ~StreamID() = default;
+    StreamID() = delete;
+    StreamID(const StreamID&) = default;
+    StreamID& operator=(const StreamID&) = default;
+
+    bool operator==(const StreamID& iID) const { return iID.value_ == value_; }
+
+    operator unsigned int() const { return value_; }
+
+    /** \return value ranging from 0 to one less than max number of streams.
+     */
+    unsigned int value() const { return value_; }
+
+    static StreamID invalidStreamID() { return StreamID(0xFFFFFFFFU); }
+
+  private:
+    ///Only a Event is allowed to create one of these
+    friend class Event;
+    explicit StreamID(unsigned int iValue) : value_(iValue) {}
+
+    unsigned int value_;
+  };
 
   class WrapperBase {
   public:

From 8b3187ad32d15045382674ead24dfd54776ddba8 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Thu, 2 Sep 2021 06:24:17 -0700
Subject: [PATCH 05/12] Remove ScopedContext as obsolete

---
 src/cudadev/CUDACore/Product.h        |   5 +-
 src/cudadev/CUDACore/ProductBase.h    |   9 +-
 src/cudadev/CUDACore/ScopedContext.cc | 116 -------------
 src/cudadev/CUDACore/ScopedContext.h  | 241 --------------------------
 4 files changed, 4 insertions(+), 367 deletions(-)
 delete mode 100644 src/cudadev/CUDACore/ScopedContext.cc
 delete mode 100644 src/cudadev/CUDACore/ScopedContext.h

diff --git a/src/cudadev/CUDACore/Product.h b/src/cudadev/CUDACore/Product.h
index 5d9cab481..ae4123d79 100644
--- a/src/cudadev/CUDACore/Product.h
+++ b/src/cudadev/CUDACore/Product.h
@@ -13,7 +13,6 @@ namespace edm {
 namespace cms {
   namespace cuda {
     namespace impl {
-      class ScopedContextGetterBase;
       class ContextGetterBase;
     }  // namespace impl
 
@@ -21,7 +20,7 @@ namespace cms {
      * The purpose of this class is to wrap CUDA data to edm::Event in a
      * way which forces correct use of various utilities.
      *
-     * The non-default construction has to be done with cms::cuda::ScopedContext
+     * The non-default construction has to be done with cms::cuda::Context
      * (in order to properly register the CUDA event).
      *
      * The default constructor is needed only for the ROOT dictionary generation.
@@ -42,8 +41,6 @@ namespace cms {
       Product& operator=(Product&&) = default;
 
     private:
-      friend class impl::ScopedContextGetterBase;
-      friend class ScopedContextProduce;
       friend class impl::ContextGetterBase;
       friend class ProduceContext;
       friend class edm::Wrapper<Product<T>>;
diff --git a/src/cudadev/CUDACore/ProductBase.h b/src/cudadev/CUDACore/ProductBase.h
index 00d12b3fb..47a7d7a93 100644
--- a/src/cudadev/CUDACore/ProductBase.h
+++ b/src/cudadev/CUDACore/ProductBase.h
@@ -10,7 +10,6 @@
 namespace cms {
   namespace cuda {
     namespace impl {
-      class ScopedContextBase;
       class Context;
     }  // namespace impl
 
@@ -44,13 +43,13 @@ namespace cms {
       int device() const { return device_; }
 
       // cudaStream_t is a pointer to a thread-safe object, for which a
-      // mutable access is needed even if the cms::cuda::ScopedContext itself
+      // mutable access is needed even if the cms::cuda::Context itself
       // would be const. Therefore it is ok to return a non-const
       // pointer from a const method here.
       cudaStream_t stream() const { return stream_.get(); }
 
       // cudaEvent_t is a pointer to a thread-safe object, for which a
-      // mutable access is needed even if the cms::cuda::ScopedContext itself
+      // mutable access is needed even if the cms::cuda::Context itself
       // would be const. Therefore it is ok to return a non-const
       // pointer from a const method here.
       cudaEvent_t event() const { return event_.get(); }
@@ -60,12 +59,10 @@ namespace cms {
           : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {}
 
     private:
-      friend class impl::ScopedContextBase;
-      friend class ScopedContextProduce;
       friend class impl::Context;
       friend class ProduceContext;
 
-      // The following function is intended to be used only from ScopedContext
+      // The following function is intended to be used only from Context
       const SharedStreamPtr& streamPtr() const { return stream_; }
 
       bool mayReuseStream() const {
diff --git a/src/cudadev/CUDACore/ScopedContext.cc b/src/cudadev/CUDACore/ScopedContext.cc
deleted file mode 100644
index 14bff04eb..000000000
--- a/src/cudadev/CUDACore/ScopedContext.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-#include "CUDACore/ScopedContext.h"
-
-#include "CUDACore/StreamCache.h"
-#include "CUDACore/cudaCheck.h"
-
-#include "chooseDevice.h"
-
-namespace {
-  struct CallbackData {
-    edm::WaitingTaskWithArenaHolder holder;
-    int device;
-  };
-
-  void CUDART_CB cudaScopedContextCallback(cudaStream_t streamId, cudaError_t status, void* data) {
-    std::unique_ptr<CallbackData> guard{reinterpret_cast<CallbackData*>(data)};
-    edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder;
-    int device = guard->device;
-    if (status == cudaSuccess) {
-      //std::cout << " GPU kernel finished (in callback) device " << device << " CUDA stream "
-      //          << streamId << std::endl;
-      waitingTaskHolder.doneWaiting(nullptr);
-    } else {
-      // wrap the exception in a try-catch block to let GDB "catch throw" break on it
-      try {
-        auto error = cudaGetErrorName(status);
-        auto message = cudaGetErrorString(status);
-        throw std::runtime_error("Callback of CUDA stream " +
-                                 std::to_string(reinterpret_cast<unsigned long>(streamId)) + " in device " +
-                                 std::to_string(device) + " error " + std::string(error) + ": " + std::string(message));
-      } catch (std::exception&) {
-        waitingTaskHolder.doneWaiting(std::current_exception());
-      }
-    }
-  }
-}  // namespace
-
-namespace cms::cuda {
-  namespace impl {
-    ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) {
-      cudaCheck(cudaSetDevice(currentDevice_));
-      stream_ = getStreamCache().get();
-    }
-
-    ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) {
-      cudaCheck(cudaSetDevice(currentDevice_));
-      if (data.mayReuseStream()) {
-        stream_ = data.streamPtr();
-      } else {
-        stream_ = getStreamCache().get();
-      }
-    }
-
-    ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream)
-        : currentDevice_(device), stream_(std::move(stream)) {
-      cudaCheck(cudaSetDevice(currentDevice_));
-    }
-
-    ////////////////////
-
-    void ScopedContextGetterBase::synchronizeStreams(int dataDevice,
-                                                     cudaStream_t dataStream,
-                                                     bool available,
-                                                     cudaEvent_t dataEvent) {
-      if (dataDevice != device()) {
-        // Eventually replace with prefetch to current device (assuming unified memory works)
-        // If we won't go to unified memory, need to figure out something else...
-        throw std::runtime_error("Handling data from multiple devices is not yet supported");
-      }
-
-      if (dataStream != stream()) {
-        // Different streams, need to synchronize
-        if (not available) {
-          // Event not yet occurred, so need to add synchronization
-          // here. Sychronization is done by making the CUDA stream to
-          // wait for an event, so all subsequent work in the stream
-          // will run only after the event has "occurred" (i.e. data
-          // product became available).
-          cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event");
-        }
-      }
-    }
-
-    void ScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) {
-      cudaCheck(
-          cudaStreamAddCallback(stream, cudaScopedContextCallback, new CallbackData{waitingTaskHolder_, device}, 0));
-    }
-  }  // namespace impl
-
-  ////////////////////
-
-  ScopedContextAcquire::~ScopedContextAcquire() {
-    holderHelper_.enqueueCallback(device(), stream());
-    if (contextState_) {
-      contextState_->set(device(), streamPtr());
-    }
-  }
-
-  void ScopedContextAcquire::throwNoState() {
-    throw std::runtime_error(
-        "Calling ScopedContextAcquire::insertNextTask() requires ScopedContextAcquire to be constructed with "
-        "ContextState, but that was not the case");
-  }
-
-  ////////////////////
-
-  ScopedContextProduce::~ScopedContextProduce() {
-    // Intentionally not checking the return value to avoid throwing
-    // exceptions. If this call would fail, we should get failures
-    // elsewhere as well.
-    cudaEventRecord(event_.get(), stream());
-  }
-
-  ////////////////////
-
-  ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); }
-}  // namespace cms::cuda
diff --git a/src/cudadev/CUDACore/ScopedContext.h b/src/cudadev/CUDACore/ScopedContext.h
deleted file mode 100644
index 4f6669883..000000000
--- a/src/cudadev/CUDACore/ScopedContext.h
+++ /dev/null
@@ -1,241 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_ScopedContext_h
-#define HeterogeneousCore_CUDACore_ScopedContext_h
-
-#include <optional>
-
-#include "CUDACore/Product.h"
-#include "Framework/WaitingTaskWithArenaHolder.h"
-#include "Framework/Event.h"
-#include "Framework/EDGetToken.h"
-#include "Framework/EDPutToken.h"
-#include "CUDACore/ContextState.h"
-#include "CUDACore/EventCache.h"
-#include "CUDACore/SharedEventPtr.h"
-#include "CUDACore/SharedStreamPtr.h"
-
-namespace cms {
-  namespace cudatest {
-    class TestScopedContext;
-  }
-
-  namespace cuda {
-
-    namespace impl {
-      // This class is intended to be derived by other ScopedContext*, not for general use
-      class ScopedContextBase {
-      public:
-        int device() const { return currentDevice_; }
-
-        // cudaStream_t is a pointer to a thread-safe object, for which a
-        // mutable access is needed even if the ScopedContext itself
-        // would be const. Therefore it is ok to return a non-const
-        // pointer from a const method here.
-        cudaStream_t stream() const { return stream_.get(); }
-        const SharedStreamPtr& streamPtr() const { return stream_; }
-
-      protected:
-        // The constructors set the current device, but the device
-        // is not set back to the previous value at the destructor. This
-        // should be sufficient (and tiny bit faster) as all CUDA API
-        // functions relying on the current device should be called from
-        // the scope where this context is. The current device doesn't
-        // really matter between modules (or across TBB tasks).
-        explicit ScopedContextBase(edm::StreamID streamID);
-
-        explicit ScopedContextBase(const ProductBase& data);
-
-        explicit ScopedContextBase(int device, SharedStreamPtr stream);
-
-      private:
-        int currentDevice_;
-        SharedStreamPtr stream_;
-      };
-
-      class ScopedContextGetterBase : public ScopedContextBase {
-      public:
-        template <typename T>
-        const T& get(const Product<T>& data) {
-          synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
-          return data.data_;
-        }
-
-        template <typename T>
-        const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) {
-          return get(iEvent.get(token));
-        }
-
-      protected:
-        template <typename... Args>
-        ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward<Args>(args)...) {}
-
-        void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent);
-      };
-
-      class ScopedContextHolderHelper {
-      public:
-        ScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder)
-            : waitingTaskHolder_{std::move(waitingTaskHolder)} {}
-
-        template <typename F>
-        void pushNextTask(F&& f, ContextState const* state);
-
-        void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-          waitingTaskHolder_ = std::move(waitingTaskHolder);
-        }
-
-        void enqueueCallback(int device, cudaStream_t stream);
-
-      private:
-        edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
-      };
-    }  // namespace impl
-
-    /**
-     * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire():
-     * - setting the current device
-     * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
-     * - synchronizing between CUDA streams if necessary
-     * and enforce that those get done in a proper way in RAII fashion.
-     */
-    class ScopedContextAcquire : public impl::ScopedContextGetterBase {
-    public:
-      /// Constructor to create a new CUDA stream (no need for context beyond acquire())
-      explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
-          : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {}
-
-      /// Constructor to create a new CUDA stream, and the context is needed after acquire()
-      explicit ScopedContextAcquire(edm::StreamID streamID,
-                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder,
-                                    ContextState& state)
-          : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {}
-
-      /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire())
-      explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
-          : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {}
-
-      /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire()
-      explicit ScopedContextAcquire(const ProductBase& data,
-                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder,
-                                    ContextState& state)
-          : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {}
-
-      ~ScopedContextAcquire();
-
-      template <typename F>
-      void pushNextTask(F&& f) {
-        if (contextState_ == nullptr)
-          throwNoState();
-        holderHelper_.pushNextTask(std::forward<F>(f), contextState_);
-      }
-
-      void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-        holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
-      }
-
-    private:
-      void throwNoState();
-
-      impl::ScopedContextHolderHelper holderHelper_;
-      ContextState* contextState_ = nullptr;
-    };
-
-    /**
-     * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce():
-     * - setting the current device
-     * - synchronizing between CUDA streams if necessary
-     * and enforce that those get done in a proper way in RAII fashion.
-     */
-    class ScopedContextProduce : public impl::ScopedContextGetterBase {
-    public:
-      /// Constructor to create a new CUDA stream (non-ExternalWork module)
-      explicit ScopedContextProduce(edm::StreamID streamID) : ScopedContextGetterBase(streamID) {}
-
-      /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module)
-      explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {}
-
-      /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
-      explicit ScopedContextProduce(ContextState& state)
-          : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {}
-
-      /// Record the CUDA event, all asynchronous work must have been queued before the destructor
-      ~ScopedContextProduce();
-
-      template <typename T>
-      std::unique_ptr<Product<T>> wrap(T data) {
-        // make_unique doesn't work because of private constructor
-        return std::unique_ptr<Product<T>>(new Product<T>(device(), streamPtr(), event_, std::move(data)));
-      }
-
-      template <typename T, typename... Args>
-      auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) {
-        return iEvent.emplace(token, device(), streamPtr(), event_, std::forward<Args>(args)...);
-      }
-
-    private:
-      friend class cudatest::TestScopedContext;
-
-      // This construcor is only meant for testing
-      explicit ScopedContextProduce(int device, SharedStreamPtr stream, SharedEventPtr event)
-          : ScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {}
-
-      // create the CUDA Event upfront to catch possible errors from its creation
-      SharedEventPtr event_ = getEventCache().get();
-    };
-
-    /**
-     * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire():
-     * - setting the current device
-     * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
-     * and enforce that those get done in a proper way in RAII fashion.
-     */
-    class ScopedContextTask : public impl::ScopedContextBase {
-    public:
-      /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
-      explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
-          : ScopedContextBase(state->device(), state->streamPtr()),  // don't move, state is re-used afterwards
-            holderHelper_{std::move(waitingTaskHolder)},
-            contextState_{state} {}
-
-      ~ScopedContextTask();
-
-      template <typename F>
-      void pushNextTask(F&& f) {
-        holderHelper_.pushNextTask(std::forward<F>(f), contextState_);
-      }
-
-      void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-        holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
-      }
-
-    private:
-      impl::ScopedContextHolderHelper holderHelper_;
-      ContextState const* contextState_;
-    };
-
-    /**
-     * The aim of this class is to do necessary per-event "initialization" in analyze()
-     * - setting the current device
-     * - synchronizing between CUDA streams if necessary
-     * and enforce that those get done in a proper way in RAII fashion.
-     */
-    class ScopedContextAnalyze : public impl::ScopedContextGetterBase {
-    public:
-      /// Constructor to (possibly) re-use a CUDA stream
-      explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {}
-    };
-
-    namespace impl {
-      template <typename F>
-      void ScopedContextHolderHelper::pushNextTask(F&& f, ContextState const* state) {
-        replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{
-            edm::make_waiting_task_with_holder(tbb::task::allocate_root(),
-                                               std::move(waitingTaskHolder_),
-                                               [state, func = std::forward<F>(f)](edm::WaitingTaskWithArenaHolder h) {
-                                                 func(ScopedContextTask{state, std::move(h)});
-                                               })});
-      }
-    }  // namespace impl
-  }    // namespace cuda
-}  // namespace cms
-
-#endif

From 4cd8dab1134029e269ee0111e11f377210e6a171 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Thu, 2 Sep 2021 08:39:44 -0700
Subject: [PATCH 06/12] Share the CUDA stream between Event products from the
 same EDProducer in better-defined way

---
 src/cudadev/CUDACore/Context.cc    | 11 ++--
 src/cudadev/CUDACore/Context.h     | 17 +++++--
 src/cudadev/CUDACore/Product.h     |  7 ++-
 src/cudadev/CUDACore/ProductBase.h | 81 +++++++++++++++++-------------
 4 files changed, 71 insertions(+), 45 deletions(-)

diff --git a/src/cudadev/CUDACore/Context.cc b/src/cudadev/CUDACore/Context.cc
index 80452494d..b34041e53 100644
--- a/src/cudadev/CUDACore/Context.cc
+++ b/src/cudadev/CUDACore/Context.cc
@@ -40,18 +40,21 @@ namespace cms::cuda {
 
     Context::Context(int device) : currentDevice_(device) { cudaCheck(cudaSetDevice(currentDevice_)); }
 
-    Context::Context(int device, SharedStreamPtr stream) : currentDevice_(device), stream_(std::move(stream)) {
+    Context::Context(int device, SharedStreamPtr stream)
+        : currentDevice_(device), stream_(std::make_shared<impl::StreamSharingHelper>(std::move(stream))) {
       cudaCheck(cudaSetDevice(currentDevice_));
     }
 
-    void Context::initialize() { stream_ = getStreamCache().get(); }
+    void Context::initialize() { stream_ = std::make_shared<impl::StreamSharingHelper>(getStreamCache().get()); }
 
     void Context::initialize(const ProductBase& data) {
+      SharedStreamPtr stream;
       if (data.mayReuseStream()) {
-        stream_ = data.streamPtr();
+        stream = data.streamPtr();
       } else {
-        stream_ = getStreamCache().get();
+        stream = getStreamCache().get();
       }
+      stream_ = std::make_shared<impl::StreamSharingHelper>(std::move(stream));
     }
 
     ////////////////////
diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h
index 73139d13c..d2b7f6343 100644
--- a/src/cudadev/CUDACore/Context.h
+++ b/src/cudadev/CUDACore/Context.h
@@ -30,13 +30,13 @@ namespace cms::cuda {
         if (not isInitialized()) {
           initialize();
         }
-        return stream_.get();
+        return stream_->streamPtr().get();
       }
       const SharedStreamPtr& streamPtr() {
         if (not isInitialized()) {
           initialize();
         }
-        return stream_;
+        return stream_->streamPtr();
       }
 
     protected:
@@ -58,9 +58,16 @@ namespace cms::cuda {
       void initialize();
       void initialize(const ProductBase& data);
 
+      const std::shared_ptr<impl::StreamSharingHelper>& streamSharingHelper() {
+        if (not isInitialized()) {
+          initialize();
+        }
+        return stream_;
+      }
+
     private:
       int currentDevice_ = -1;
-      SharedStreamPtr stream_;
+      std::shared_ptr<impl::StreamSharingHelper> stream_;
     };
 
     class ContextGetterBase : public Context {
@@ -151,12 +158,12 @@ namespace cms::cuda {
     template <typename T>
     std::unique_ptr<Product<T>> wrap(T data) {
       // make_unique doesn't work because of private constructor
-      return std::unique_ptr<Product<T>>(new Product<T>(device(), streamPtr(), event_, std::move(data)));
+      return std::unique_ptr<Product<T>>(new Product<T>(device(), streamSharingHelper(), event_, std::move(data)));
     }
 
     template <typename T, typename... Args>
     auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) {
-      return iEvent.emplace(token, device(), streamPtr(), event_, std::forward<Args>(args)...);
+      return iEvent.emplace(token, device(), streamSharingHelper(), event_, std::forward<Args>(args)...);
     }
 
     // internal API
diff --git a/src/cudadev/CUDACore/Product.h b/src/cudadev/CUDACore/Product.h
index ae4123d79..759079003 100644
--- a/src/cudadev/CUDACore/Product.h
+++ b/src/cudadev/CUDACore/Product.h
@@ -45,11 +45,14 @@ namespace cms {
       friend class ProduceContext;
       friend class edm::Wrapper<Product<T>>;
 
-      explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data)
+      explicit Product(int device, std::shared_ptr<impl::StreamSharingHelper> stream, SharedEventPtr event, T data)
           : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {}
 
       template <typename... Args>
-      explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args)
+      explicit Product(int device,
+                       std::shared_ptr<impl::StreamSharingHelper> stream,
+                       SharedEventPtr event,
+                       Args&&... args)
           : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward<Args>(args)...) {}
 
       T data_;  //!
diff --git a/src/cudadev/CUDACore/ProductBase.h b/src/cudadev/CUDACore/ProductBase.h
index 47a7d7a93..e2a367341 100644
--- a/src/cudadev/CUDACore/ProductBase.h
+++ b/src/cudadev/CUDACore/ProductBase.h
@@ -11,6 +11,40 @@ namespace cms {
   namespace cuda {
     namespace impl {
       class Context;
+
+      /**
+       * The CUDA stream is shared between all the Event products of
+       * the EDProducer. If the stream gets re-used, only one consumer
+       * of all the products should be allowed to use the stream. An
+       * objects of this class is shared between such Event products
+       * and takes care of letting only those consumers get the stream.
+       */
+      class StreamSharingHelper {
+      public:
+        explicit StreamSharingHelper(SharedStreamPtr stream) : stream_(std::move(stream)) {}
+        StreamSharingHelper(const StreamSharingHelper&) = delete;
+        StreamSharingHelper& operator=(const StreamSharingHelper&) = delete;
+        StreamSharingHelper(StreamSharingHelper&&) = delete;
+        StreamSharingHelper& operator=(StreamSharingHelper) = delete;
+
+        const SharedStreamPtr& streamPtr() const { return stream_; }
+
+        bool mayReuseStream() const {
+          bool expected = true;
+          bool changed = mayReuseStream_.compare_exchange_strong(expected, false);
+          // If the current thread is the one flipping the flag, it may
+          // reuse the stream.
+          return changed;
+        }
+
+      private:
+        SharedStreamPtr stream_;
+
+        // This flag tells whether the CUDA stream may be reused by a
+        // consumer or not. The goal is to have a "chain" of modules to
+        // queue their work to the same stream.
+        mutable std::atomic<bool> mayReuseStream_ = true;
+      };
     }  // namespace impl
 
     /**
@@ -24,20 +58,9 @@ namespace cms {
 
       ProductBase(const ProductBase&) = delete;
       ProductBase& operator=(const ProductBase&) = delete;
-      ProductBase(ProductBase&& other)
-          : stream_{std::move(other.stream_)},
-            event_{std::move(other.event_)},
-            mayReuseStream_{other.mayReuseStream_.load()},
-            device_{other.device_} {}
-      ProductBase& operator=(ProductBase&& other) {
-        stream_ = std::move(other.stream_);
-        event_ = std::move(other.event_);
-        mayReuseStream_ = other.mayReuseStream_.load();
-        device_ = other.device_;
-        return *this;
-      }
-
-      bool isValid() const { return stream_.get() != nullptr; }
+      ProductBase(ProductBase&& other) = default;
+      ProductBase& operator=(ProductBase&& other) = default;
+
       bool isAvailable() const;
 
       int device() const { return device_; }
@@ -46,7 +69,7 @@ namespace cms {
       // mutable access is needed even if the cms::cuda::Context itself
       // would be const. Therefore it is ok to return a non-const
       // pointer from a const method here.
-      cudaStream_t stream() const { return stream_.get(); }
+      cudaStream_t stream() const { return stream_->streamPtr().get(); }
 
       // cudaEvent_t is a pointer to a thread-safe object, for which a
       // mutable access is needed even if the cms::cuda::Context itself
@@ -55,7 +78,7 @@ namespace cms {
       cudaEvent_t event() const { return event_.get(); }
 
     protected:
-      explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event)
+      explicit ProductBase(int device, std::shared_ptr<impl::StreamSharingHelper> stream, SharedEventPtr event)
           : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {}
 
     private:
@@ -63,27 +86,17 @@ namespace cms {
       friend class ProduceContext;
 
       // The following function is intended to be used only from Context
-      const SharedStreamPtr& streamPtr() const { return stream_; }
-
-      bool mayReuseStream() const {
-        bool expected = true;
-        bool changed = mayReuseStream_.compare_exchange_strong(expected, false);
-        // If the current thread is the one flipping the flag, it may
-        // reuse the stream.
-        return changed;
-      }
-
-      // The cudaStream_t is really shared among edm::Event products, so
-      // using shared_ptr also here
-      SharedStreamPtr stream_;  //!
+      const SharedStreamPtr& streamPtr() const { return stream_->streamPtr(); }
+
+      bool mayReuseStream() const { return stream_->mayReuseStream(); }
+
+      // Helper shared between all cms::cuda::Product<T> event
+      // products of an EDProducer
+      std::shared_ptr<impl::StreamSharingHelper> stream_;  //!
+
       // shared_ptr because of caching in cms::cuda::EventCache
       SharedEventPtr event_;  //!
 
-      // This flag tells whether the CUDA stream may be reused by a
-      // consumer or not. The goal is to have a "chain" of modules to
-      // queue their work to the same stream.
-      mutable std::atomic<bool> mayReuseStream_ = true;  //!
-
       // The CUDA device associated with this product
       int device_ = -1;  //!
     };

From 55effecbf227aa45b46637ad551efe8b1b842375 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Wed, 8 Sep 2021 18:23:39 -0700
Subject: [PATCH 07/12] Split Context.h into one class per file

---
 src/cudadev/CUDACore/AcquireContext.cc        |   5 +
 src/cudadev/CUDACore/AcquireContext.h         |  45 ++++
 src/cudadev/CUDACore/AnalyzeContext.h         |  20 ++
 src/cudadev/CUDACore/Context.cc               | 103 --------
 src/cudadev/CUDACore/Context.h                | 248 +-----------------
 src/cudadev/CUDACore/EDGetterContextBase.cc   |  27 ++
 src/cudadev/CUDACore/EDGetterContextBase.h    |  38 +++
 src/cudadev/CUDACore/FwkContextBase.cc        |  28 ++
 src/cudadev/CUDACore/FwkContextBase.h         |  66 +++++
 src/cudadev/CUDACore/FwkContextHolderHelper.h |  25 ++
 src/cudadev/CUDACore/ProduceContext.cc        |   6 +
 src/cudadev/CUDACore/ProduceContext.h         |  52 ++++
 src/cudadev/CUDACore/Product.h                |   4 +-
 src/cudadev/CUDACore/ProductBase.h            |   8 +-
 src/cudadev/CUDACore/TaskContext.cc           |  41 +++
 src/cudadev/CUDACore/TaskContext.h            |  71 +++++
 16 files changed, 433 insertions(+), 354 deletions(-)
 create mode 100644 src/cudadev/CUDACore/AcquireContext.cc
 create mode 100644 src/cudadev/CUDACore/AcquireContext.h
 create mode 100644 src/cudadev/CUDACore/AnalyzeContext.h
 delete mode 100644 src/cudadev/CUDACore/Context.cc
 create mode 100644 src/cudadev/CUDACore/EDGetterContextBase.cc
 create mode 100644 src/cudadev/CUDACore/EDGetterContextBase.h
 create mode 100644 src/cudadev/CUDACore/FwkContextBase.cc
 create mode 100644 src/cudadev/CUDACore/FwkContextBase.h
 create mode 100644 src/cudadev/CUDACore/FwkContextHolderHelper.h
 create mode 100644 src/cudadev/CUDACore/ProduceContext.cc
 create mode 100644 src/cudadev/CUDACore/ProduceContext.h
 create mode 100644 src/cudadev/CUDACore/TaskContext.cc
 create mode 100644 src/cudadev/CUDACore/TaskContext.h

diff --git a/src/cudadev/CUDACore/AcquireContext.cc b/src/cudadev/CUDACore/AcquireContext.cc
new file mode 100644
index 000000000..dd6e8d1de
--- /dev/null
+++ b/src/cudadev/CUDACore/AcquireContext.cc
@@ -0,0 +1,5 @@
+#include "CUDACore/AcquireContext.h"
+
+namespace cms::cuda {
+  void AcquireContext::commit() { holderHelper_.enqueueCallback(stream()); }
+}
diff --git a/src/cudadev/CUDACore/AcquireContext.h b/src/cudadev/CUDACore/AcquireContext.h
new file mode 100644
index 000000000..409320abe
--- /dev/null
+++ b/src/cudadev/CUDACore/AcquireContext.h
@@ -0,0 +1,45 @@
+#ifndef HeterogeneousCore_CUDACore_AcquireContext_h
+#define HeterogeneousCore_CUDACore_AcquireContext_h
+
+#include "CUDACore/EDGetterContextBase.h"
+#include "CUDACore/TaskContext.h"
+
+namespace cms::cuda {
+  /**
+   * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire():
+   * - setting the current device
+   * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+   * - synchronizing between CUDA streams if necessary
+   * Users should not, however, construct it explicitly.
+   */
+  class AcquireContext : public impl::EDGetterContextBase {
+  public:
+    explicit AcquireContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+        : EDGetterContextBase(streamID), holderHelper_{std::move(waitingTaskHolder), device()} {}
+    ~AcquireContext() = default;
+
+    template <typename F>
+    void pushNextTask(F&& f) {
+      holderHelper_.pushNextTask(std::forward<F>(f));
+    }
+
+    void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+      holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+    }
+
+    // internal API
+    void commit();
+
+  private:
+    impl::FwkContextHolderHelper holderHelper_;
+  };
+
+  template <typename F>
+  void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, F func) {
+    AcquireContext context(streamID, std::move(holder));
+    func(context);
+    context.commit();
+  }
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/AnalyzeContext.h b/src/cudadev/CUDACore/AnalyzeContext.h
new file mode 100644
index 000000000..cbe9a66dd
--- /dev/null
+++ b/src/cudadev/CUDACore/AnalyzeContext.h
@@ -0,0 +1,20 @@
+#ifndef HeterogeneousCore_CUDACore_AnalyzeContext_h
+#define HeterogeneousCore_CUDACore_AnalyzeContext_h
+
+#include "CUDACore/EDGetterContextBase.h"
+
+namespace cms::cuda {
+  /**
+   * The aim of this class is to do necessary per-event "initialization" in analyze()
+   * - setting the current device
+   * - synchronizing between CUDA streams if necessary
+   * and enforce that those get done in a proper way in RAII fashion.
+   */
+  class AnalyzeContext : public impl::EDGetterContextBase {
+  public:
+    /// Constructor to (possibly) re-use a CUDA stream
+    explicit AnalyzeContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {}
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/Context.cc b/src/cudadev/CUDACore/Context.cc
deleted file mode 100644
index b34041e53..000000000
--- a/src/cudadev/CUDACore/Context.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-#include "CUDACore/Context.h"
-
-#include "CUDACore/StreamCache.h"
-#include "CUDACore/cudaCheck.h"
-
-#include "chooseDevice.h"
-
-namespace {
-  struct CallbackData {
-    edm::WaitingTaskWithArenaHolder holder;
-    int device;
-  };
-
-  void CUDART_CB cudaContextCallback(cudaStream_t streamId, cudaError_t status, void* data) {
-    std::unique_ptr<CallbackData> guard{reinterpret_cast<CallbackData*>(data)};
-    edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder;
-    int device = guard->device;
-    if (status == cudaSuccess) {
-      //std::cout << " GPU kernel finished (in callback) device " << device << " CUDA stream "
-      //          << streamId << std::endl;
-      waitingTaskHolder.doneWaiting(nullptr);
-    } else {
-      // wrap the exception in a try-catch block to let GDB "catch throw" break on it
-      try {
-        auto error = cudaGetErrorName(status);
-        auto message = cudaGetErrorString(status);
-        throw std::runtime_error("Callback of CUDA stream " +
-                                 std::to_string(reinterpret_cast<unsigned long>(streamId)) + " in device " +
-                                 std::to_string(device) + " error " + std::string(error) + ": " + std::string(message));
-      } catch (std::exception&) {
-        waitingTaskHolder.doneWaiting(std::current_exception());
-      }
-    }
-  }
-}  // namespace
-
-namespace cms::cuda {
-  namespace impl {
-    Context::Context(edm::StreamID streamID) : Context(chooseDevice(streamID)) {}
-
-    Context::Context(int device) : currentDevice_(device) { cudaCheck(cudaSetDevice(currentDevice_)); }
-
-    Context::Context(int device, SharedStreamPtr stream)
-        : currentDevice_(device), stream_(std::make_shared<impl::StreamSharingHelper>(std::move(stream))) {
-      cudaCheck(cudaSetDevice(currentDevice_));
-    }
-
-    void Context::initialize() { stream_ = std::make_shared<impl::StreamSharingHelper>(getStreamCache().get()); }
-
-    void Context::initialize(const ProductBase& data) {
-      SharedStreamPtr stream;
-      if (data.mayReuseStream()) {
-        stream = data.streamPtr();
-      } else {
-        stream = getStreamCache().get();
-      }
-      stream_ = std::make_shared<impl::StreamSharingHelper>(std::move(stream));
-    }
-
-    ////////////////////
-
-    void ContextGetterBase::synchronizeStreams(int dataDevice,
-                                               cudaStream_t dataStream,
-                                               bool available,
-                                               cudaEvent_t dataEvent) {
-      if (dataDevice != device()) {
-        // Eventually replace with prefetch to current device (assuming unified memory works)
-        // If we won't go to unified memory, need to figure out something else...
-        throw std::runtime_error("Handling data from multiple devices is not yet supported");
-      }
-
-      if (dataStream != stream()) {
-        // Different streams, need to synchronize
-        if (not available) {
-          // Event not yet occurred, so need to add synchronization
-          // here. Sychronization is done by making the CUDA stream to
-          // wait for an event, so all subsequent work in the stream
-          // will run only after the event has "occurred" (i.e. data
-          // product became available).
-          cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event");
-        }
-      }
-    }
-
-    ////////////////////
-
-    void ContextHolderHelper::enqueueCallback(cudaStream_t stream) {
-      cudaCheck(cudaStreamAddCallback(stream, cudaContextCallback, new CallbackData{waitingTaskHolder_, device_}, 0));
-    }
-  }  // namespace impl
-
-  ////////////////////
-
-  void AcquireContext::commit() { holderHelper_.enqueueCallback(stream()); }
-
-  ////////////////////
-
-  void ProduceContext::commit() { cudaCheck(cudaEventRecord(event_.get(), stream())); }
-
-  ////////////////////
-
-  void TaskContext::commit() { holderHelper_.enqueueCallback(stream()); }
-}  // namespace cms::cuda
diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h
index d2b7f6343..c55fc59fb 100644
--- a/src/cudadev/CUDACore/Context.h
+++ b/src/cudadev/CUDACore/Context.h
@@ -1,249 +1,7 @@
 #ifndef HeterogeneousCore_CUDACore_Context_h
 #define HeterogeneousCore_CUDACore_Context_h
 
-#include "CUDACore/Product.h"
-#include "Framework/WaitingTaskWithArenaHolder.h"
-#include "Framework/Event.h"
-#include "Framework/EDGetToken.h"
-#include "Framework/EDPutToken.h"
-#include "CUDACore/EventCache.h"
-#include "CUDACore/SharedEventPtr.h"
-#include "CUDACore/SharedStreamPtr.h"
-
-namespace cms::cuda {
-  namespace impl {
-    // This class is intended to be derived by other Context*, not for general use
-    class Context {
-    public:
-      Context(Context const&) = delete;
-      Context& operator=(Context const&) = delete;
-      Context(Context&&) = delete;
-      Context& operator=(Context&&) = delete;
-
-      int device() const { return currentDevice_; }
-
-      // cudaStream_t is a pointer to a thread-safe object, for which a
-      // mutable access is needed even if the Context itself
-      // would be const. Therefore it is ok to return a non-const
-      // pointer from a const method here.
-      cudaStream_t stream() {
-        if (not isInitialized()) {
-          initialize();
-        }
-        return stream_->streamPtr().get();
-      }
-      const SharedStreamPtr& streamPtr() {
-        if (not isInitialized()) {
-          initialize();
-        }
-        return stream_->streamPtr();
-      }
-
-    protected:
-      // The constructors set the current device, but the device
-      // is not set back to the previous value at the destructor. This
-      // should be sufficient (and tiny bit faster) as all CUDA API
-      // functions relying on the current device should be called from
-      // the scope where this context is. The current device doesn't
-      // really matter between modules (or across TBB tasks).
-      explicit Context(edm::StreamID streamID);
-
-      explicit Context(int device);
-
-      // meant only for testing
-      explicit Context(int device, SharedStreamPtr stream);
-
-      bool isInitialized() const { return bool(stream_); }
-
-      void initialize();
-      void initialize(const ProductBase& data);
-
-      const std::shared_ptr<impl::StreamSharingHelper>& streamSharingHelper() {
-        if (not isInitialized()) {
-          initialize();
-        }
-        return stream_;
-      }
-
-    private:
-      int currentDevice_ = -1;
-      std::shared_ptr<impl::StreamSharingHelper> stream_;
-    };
-
-    class ContextGetterBase : public Context {
-    public:
-      template <typename T>
-      const T& get(const Product<T>& data) {
-        if (not isInitialized()) {
-          initialize(data);
-        }
-        synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
-        return data.data_;
-      }
-
-      template <typename T>
-      const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) {
-        return get(iEvent.get(token));
-      }
-
-    protected:
-      template <typename... Args>
-      ContextGetterBase(Args&&... args) : Context(std::forward<Args>(args)...) {}
-
-    private:
-      void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent);
-    };
-
-    class ContextHolderHelper {
-    public:
-      ContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device)
-          : waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {}
-
-      template <typename F>
-      void pushNextTask(F&& f);
-
-      void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-        waitingTaskHolder_ = std::move(waitingTaskHolder);
-      }
-
-      void enqueueCallback(cudaStream_t stream);
-
-    private:
-      edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
-      int device_;
-    };
-  }  // namespace impl
-
-  /**
-     * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire():
-     * - setting the current device
-     * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
-     * - synchronizing between CUDA streams if necessary
-     * Users should not, however, construct it explicitly.
-     */
-  class AcquireContext : public impl::ContextGetterBase {
-  public:
-    explicit AcquireContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
-        : ContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder), device()} {}
-    ~AcquireContext() = default;
-
-    template <typename F>
-    void pushNextTask(F&& f) {
-      holderHelper_.pushNextTask(std::forward<F>(f));
-    }
-
-    void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-      holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
-    }
-
-    // internal API
-    void commit();
-
-  private:
-    impl::ContextHolderHelper holderHelper_;
-  };
-
-  /**
-   * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce():
-   * - setting the current device
-   * - synchronizing between CUDA streams if necessary
-   * Users should not, however, construct it explicitly.
-   */
-  class ProduceContext : public impl::ContextGetterBase {
-  public:
-    explicit ProduceContext(edm::StreamID streamID) : ContextGetterBase(streamID) {}
-
-    ~ProduceContext() = default;
-
-    template <typename T>
-    std::unique_ptr<Product<T>> wrap(T data) {
-      // make_unique doesn't work because of private constructor
-      return std::unique_ptr<Product<T>>(new Product<T>(device(), streamSharingHelper(), event_, std::move(data)));
-    }
-
-    template <typename T, typename... Args>
-    auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) {
-      return iEvent.emplace(token, device(), streamSharingHelper(), event_, std::forward<Args>(args)...);
-    }
-
-    // internal API
-    void commit();
-
-  private:
-    // This construcor is only meant for testing
-    explicit ProduceContext(int device, SharedStreamPtr stream, SharedEventPtr event)
-        : ContextGetterBase(device, std::move(stream)), event_{std::move(event)} {}
-
-    // create the CUDA Event upfront to catch possible errors from its creation
-    SharedEventPtr event_ = getEventCache().get();
-  };
-
-  /**
-   * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire():
-   * - setting the current device
-   * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
-   */
-  class TaskContext : public impl::Context {
-  public:
-    /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
-    explicit TaskContext(int device, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
-        : Context(device), holderHelper_{std::move(waitingTaskHolder), device} {}
-
-    ~TaskContext() = default;
-
-    template <typename F>
-    void pushNextTask(F&& f) {
-      holderHelper_.pushNextTask(std::forward<F>(f));
-    }
-
-    void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-      holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
-    }
-
-    // Internal API
-    void commit();
-
-  private:
-    impl::ContextHolderHelper holderHelper_;
-  };
-
-  /**
-   * The aim of this class is to do necessary per-event "initialization" in analyze()
-   * - setting the current device
-   * - synchronizing between CUDA streams if necessary
-   * and enforce that those get done in a proper way in RAII fashion.
-   */
-  class AnalyzeContext : public impl::ContextGetterBase {
-  public:
-    /// Constructor to (possibly) re-use a CUDA stream
-    explicit AnalyzeContext(edm::StreamID streamID) : ContextGetterBase(streamID) {}
-  };
-
-  namespace impl {
-    template <typename F>
-    void ContextHolderHelper::pushNextTask(F&& f) {
-      replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{edm::make_waiting_task_with_holder(
-          tbb::task::allocate_root(),
-          std::move(waitingTaskHolder_),
-          [device = device_, func = std::forward<F>(f)](edm::WaitingTaskWithArenaHolder h) {
-            func(TaskContext{device, std::move(h)});
-          })});
-    }
-  }  // namespace impl
-
-  template <typename F>
-  void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, F func) {
-    AcquireContext context(streamID, std::move(holder));
-    func(context);
-    context.commit();
-  }
-
-  template <typename F>
-  void runProduce(edm::StreamID streamID, F func) {
-    ProduceContext context(streamID);
-    func(context);
-    context.commit();
-  }
-}  // namespace cms::cuda
-
+#include "CUDACore/AcquireContext.h"
+#include "CUDACore/ProduceContext.h"
+#include "CUDACore/AnalyzeContext.h"
 #endif
diff --git a/src/cudadev/CUDACore/EDGetterContextBase.cc b/src/cudadev/CUDACore/EDGetterContextBase.cc
new file mode 100644
index 000000000..cfeccb178
--- /dev/null
+++ b/src/cudadev/CUDACore/EDGetterContextBase.cc
@@ -0,0 +1,27 @@
+#include "CUDACore/EDGetterContextBase.h"
+#include "CUDACore/cudaCheck.h"
+
+namespace cms::cuda::impl {
+  void EDGetterContextBase::synchronizeStreams(int dataDevice,
+                                               cudaStream_t dataStream,
+                                               bool available,
+                                               cudaEvent_t dataEvent) {
+    if (dataDevice != device()) {
+      // Eventually replace with prefetch to current device (assuming unified memory works)
+      // If we won't go to unified memory, need to figure out something else...
+      throw std::runtime_error("Handling data from multiple devices is not yet supported");
+    }
+
+    if (dataStream != stream()) {
+      // Different streams, need to synchronize
+      if (not available) {
+        // Event not yet occurred, so need to add synchronization
+        // here. Sychronization is done by making the CUDA stream to
+        // wait for an event, so all subsequent work in the stream
+        // will run only after the event has "occurred" (i.e. data
+        // product became available).
+        cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event");
+      }
+    }
+  }
+}
diff --git a/src/cudadev/CUDACore/EDGetterContextBase.h b/src/cudadev/CUDACore/EDGetterContextBase.h
new file mode 100644
index 000000000..d17814d69
--- /dev/null
+++ b/src/cudadev/CUDACore/EDGetterContextBase.h
@@ -0,0 +1,38 @@
+#ifndef HeterogeneousCore_CUDACore_EDGetterContextBase_h
+#define HeterogeneousCore_CUDACore_EDGetterContextBase_h
+
+#include "CUDACore/FwkContextBase.h"
+#include "CUDACore/Product.h"
+#include "Framework/EDGetToken.h"
+
+namespace cms::cuda::impl {
+  /**
+   * This class is a base class for Context classes that should be
+   * able to read Event Data products
+   */
+  class EDGetterContextBase : public FwkContextBase {
+  public:
+    template <typename T>
+    const T& get(const Product<T>& data) {
+      if (not isInitialized()) {
+        initialize(data);
+      }
+      synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
+      return data.data_;
+    }
+
+    template <typename T>
+    const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) {
+      return get(iEvent.get(token));
+    }
+
+  protected:
+    template <typename... Args>
+    EDGetterContextBase(Args&&... args) : FwkContextBase(std::forward<Args>(args)...) {}
+
+  private:
+    void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent);
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/FwkContextBase.cc b/src/cudadev/CUDACore/FwkContextBase.cc
new file mode 100644
index 000000000..47e2d8199
--- /dev/null
+++ b/src/cudadev/CUDACore/FwkContextBase.cc
@@ -0,0 +1,28 @@
+#include "CUDACore/FwkContextBase.h"
+#include "CUDACore/StreamCache.h"
+#include "CUDACore/cudaCheck.h"
+
+#include "chooseDevice.h"
+
+namespace cms::cuda::impl {
+  FwkContextBase::FwkContextBase(edm::StreamID streamID) : FwkContextBase(chooseDevice(streamID)) {}
+
+  FwkContextBase::FwkContextBase(int device) : currentDevice_(device) { cudaCheck(cudaSetDevice(currentDevice_)); }
+
+  FwkContextBase::FwkContextBase(int device, SharedStreamPtr stream)
+    : currentDevice_(device), stream_(std::make_shared<impl::StreamSharingHelper>(std::move(stream))) {
+    cudaCheck(cudaSetDevice(currentDevice_));
+  }
+
+  void FwkContextBase::initialize() { stream_ = std::make_shared<impl::StreamSharingHelper>(getStreamCache().get()); }
+
+  void FwkContextBase::initialize(const ProductBase& data) {
+    SharedStreamPtr stream;
+    if (data.mayReuseStream()) {
+      stream = data.streamPtr();
+    } else {
+      stream = getStreamCache().get();
+    }
+    stream_ = std::make_shared<impl::StreamSharingHelper>(std::move(stream));
+  }
+}
diff --git a/src/cudadev/CUDACore/FwkContextBase.h b/src/cudadev/CUDACore/FwkContextBase.h
new file mode 100644
index 000000000..1b329cb03
--- /dev/null
+++ b/src/cudadev/CUDACore/FwkContextBase.h
@@ -0,0 +1,66 @@
+#ifndef HeterogeneousCore_CUDACore_FwkContextBase_h
+#define HeterogeneousCore_CUDACore_FwkContextBase_h
+
+#include "CUDACore/ProductBase.h"
+#include "CUDACore/SharedStreamPtr.h"
+#include "Framework/Event.h"
+
+namespace cms::cuda::impl {
+  /**
+   * This class is a base class for other Context classes for interacting with the framework
+   */
+  class FwkContextBase {
+  public:
+    FwkContextBase(FwkContextBase const&) = delete;
+    FwkContextBase& operator=(FwkContextBase const&) = delete;
+    FwkContextBase(FwkContextBase&&) = delete;
+    FwkContextBase& operator=(FwkContextBase&&) = delete;
+    
+    int device() const { return currentDevice_; }
+
+    cudaStream_t stream() {
+      if (not isInitialized()) {
+        initialize();
+      }
+      return stream_->streamPtr().get();
+    }
+    const SharedStreamPtr& streamPtr() {
+      if (not isInitialized()) {
+        initialize();
+      }
+      return stream_->streamPtr();
+    }
+
+  protected:
+    // The constructors set the current device, but the device
+    // is not set back to the previous value at the destructor. This
+    // should be sufficient (and tiny bit faster) as all CUDA API
+    // functions relying on the current device should be called from
+    // the scope where this context is. The current device doesn't
+    // really matter between modules (or across TBB tasks).
+    explicit FwkContextBase(edm::StreamID streamID);
+
+    explicit FwkContextBase(int device);
+
+    // meant only for testing
+    explicit FwkContextBase(int device, SharedStreamPtr stream);
+
+    bool isInitialized() const { return bool(stream_); }
+
+    void initialize();
+    void initialize(const ProductBase& data);
+
+    const std::shared_ptr<impl::StreamSharingHelper>& streamSharingHelper() {
+      if (not isInitialized()) {
+        initialize();
+      }
+      return stream_;
+    }
+
+  private:
+    int currentDevice_ = -1;
+    std::shared_ptr<impl::StreamSharingHelper> stream_;
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/FwkContextHolderHelper.h b/src/cudadev/CUDACore/FwkContextHolderHelper.h
new file mode 100644
index 000000000..18e465142
--- /dev/null
+++ b/src/cudadev/CUDACore/FwkContextHolderHelper.h
@@ -0,0 +1,25 @@
+#ifndef HeterogeneousCore_CUDACore_FwkContextHolderHelper_h
+#define HeterogeneousCore_CUDACore_FwkContextHolderHelper_h
+
+namespace cms::cuda::impl {
+  class FwkContextHolderHelper {
+  public:
+    FwkContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device)
+      : waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {}
+
+    template <typename F>
+    void pushNextTask(F&& f);
+
+    void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+      waitingTaskHolder_ = std::move(waitingTaskHolder);
+    }
+
+    void enqueueCallback(cudaStream_t stream);
+
+  private:
+    edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
+    int device_;
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/ProduceContext.cc b/src/cudadev/CUDACore/ProduceContext.cc
new file mode 100644
index 000000000..0af59c3a7
--- /dev/null
+++ b/src/cudadev/CUDACore/ProduceContext.cc
@@ -0,0 +1,6 @@
+#include "CUDACore/ProduceContext.h"
+#include "CUDACore/cudaCheck.h"
+
+namespace cms::cuda {
+  void ProduceContext::commit() { cudaCheck(cudaEventRecord(event_.get(), stream())); }
+}
diff --git a/src/cudadev/CUDACore/ProduceContext.h b/src/cudadev/CUDACore/ProduceContext.h
new file mode 100644
index 000000000..980e53377
--- /dev/null
+++ b/src/cudadev/CUDACore/ProduceContext.h
@@ -0,0 +1,52 @@
+#ifndef HeterogeneousCore_CUDACore_ProduceContext_h
+#define HeterogeneousCore_CUDACore_ProduceContext_h
+
+#include "CUDACore/EDGetterContextBase.h"
+#include "CUDACore/EventCache.h"
+#include "Framework/EDPutToken.h"
+
+namespace cms::cuda {
+  /**
+   * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce():
+   * - setting the current device
+   * - synchronizing between CUDA streams if necessary
+   * Users should not, however, construct it explicitly.
+   */
+  class ProduceContext : public impl::EDGetterContextBase {
+  public:
+    explicit ProduceContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {}
+
+    ~ProduceContext() = default;
+
+    template <typename T>
+    std::unique_ptr<Product<T>> wrap(T data) {
+      // make_unique doesn't work because of private constructor
+      return std::unique_ptr<Product<T>>(new Product<T>(device(), streamSharingHelper(), event_, std::move(data)));
+    }
+
+    template <typename T, typename... Args>
+    auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) {
+      return iEvent.emplace(token, device(), streamSharingHelper(), event_, std::forward<Args>(args)...);
+    }
+
+    // internal API
+    void commit();
+
+  private:
+    // This construcor is only meant for testing
+    explicit ProduceContext(int device, SharedStreamPtr stream, SharedEventPtr event)
+        : EDGetterContextBase(device, std::move(stream)), event_{std::move(event)} {}
+
+    // create the CUDA Event upfront to catch possible errors from its creation
+    SharedEventPtr event_ = getEventCache().get();
+  };
+
+  template <typename F>
+  void runProduce(edm::StreamID streamID, F&& func) {
+    ProduceContext context(streamID);
+    func(context);
+    context.commit();
+  }
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/Product.h b/src/cudadev/CUDACore/Product.h
index 759079003..5d9aaf36e 100644
--- a/src/cudadev/CUDACore/Product.h
+++ b/src/cudadev/CUDACore/Product.h
@@ -13,7 +13,7 @@ namespace edm {
 namespace cms {
   namespace cuda {
     namespace impl {
-      class ContextGetterBase;
+      class EDGetterContextBase;
     }  // namespace impl
 
     /**
@@ -41,7 +41,7 @@ namespace cms {
       Product& operator=(Product&&) = default;
 
     private:
-      friend class impl::ContextGetterBase;
+      friend class impl::EDGetterContextBase;
       friend class ProduceContext;
       friend class edm::Wrapper<Product<T>>;
 
diff --git a/src/cudadev/CUDACore/ProductBase.h b/src/cudadev/CUDACore/ProductBase.h
index e2a367341..e1f04ec12 100644
--- a/src/cudadev/CUDACore/ProductBase.h
+++ b/src/cudadev/CUDACore/ProductBase.h
@@ -10,7 +10,7 @@
 namespace cms {
   namespace cuda {
     namespace impl {
-      class Context;
+      class FwkContextBase;
 
       /**
        * The CUDA stream is shared between all the Event products of
@@ -66,13 +66,13 @@ namespace cms {
       int device() const { return device_; }
 
       // cudaStream_t is a pointer to a thread-safe object, for which a
-      // mutable access is needed even if the cms::cuda::Context itself
+      // mutable access is needed even if the ProductBase itself
       // would be const. Therefore it is ok to return a non-const
       // pointer from a const method here.
       cudaStream_t stream() const { return stream_->streamPtr().get(); }
 
       // cudaEvent_t is a pointer to a thread-safe object, for which a
-      // mutable access is needed even if the cms::cuda::Context itself
+      // mutable access is needed even if the ProductBase itself
       // would be const. Therefore it is ok to return a non-const
       // pointer from a const method here.
       cudaEvent_t event() const { return event_.get(); }
@@ -82,7 +82,7 @@ namespace cms {
           : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {}
 
     private:
-      friend class impl::Context;
+      friend class impl::FwkContextBase;
       friend class ProduceContext;
 
       // The following function is intended to be used only from Context
diff --git a/src/cudadev/CUDACore/TaskContext.cc b/src/cudadev/CUDACore/TaskContext.cc
new file mode 100644
index 000000000..a2c5b0194
--- /dev/null
+++ b/src/cudadev/CUDACore/TaskContext.cc
@@ -0,0 +1,41 @@
+#include "CUDACore/TaskContext.h"
+#include "CUDACore/cudaCheck.h"
+
+namespace {
+  struct CallbackData {
+    edm::WaitingTaskWithArenaHolder holder;
+    int device;
+  };
+
+  void CUDART_CB cudaContextCallback(cudaStream_t streamId, cudaError_t status, void* data) {
+    std::unique_ptr<CallbackData> guard{reinterpret_cast<CallbackData*>(data)};
+    edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder;
+    int device = guard->device;
+    if (status == cudaSuccess) {
+      //std::cout << " GPU kernel finished (in callback) device " << device << " CUDA stream "
+      //          << streamId << std::endl;
+      waitingTaskHolder.doneWaiting(nullptr);
+    } else {
+      // wrap the exception in a try-catch block to let GDB "catch throw" break on it
+      try {
+        auto error = cudaGetErrorName(status);
+        auto message = cudaGetErrorString(status);
+        throw std::runtime_error("Callback of CUDA stream " +
+                                 std::to_string(reinterpret_cast<unsigned long>(streamId)) + " in device " +
+                                 std::to_string(device) + " error " + std::string(error) + ": " + std::string(message));
+      } catch (std::exception&) {
+        waitingTaskHolder.doneWaiting(std::current_exception());
+      }
+    }
+  }
+}  // namespace
+
+namespace cms::cuda {
+  namespace impl {
+    void FwkContextHolderHelper::enqueueCallback(cudaStream_t stream) {
+      cudaCheck(cudaStreamAddCallback(stream, cudaContextCallback, new CallbackData{waitingTaskHolder_, device_}, 0));
+    }
+  }
+
+  void TaskContext::commit() { holderHelper_.enqueueCallback(stream()); }
+}
diff --git a/src/cudadev/CUDACore/TaskContext.h b/src/cudadev/CUDACore/TaskContext.h
new file mode 100644
index 000000000..121c9165d
--- /dev/null
+++ b/src/cudadev/CUDACore/TaskContext.h
@@ -0,0 +1,71 @@
+#ifndef HeterogeneousCore_CUDACore_TaskContext_h
+#define HeterogeneousCore_CUDACore_TaskContext_h
+
+#include "CUDACore/FwkContextBase.h"
+#include "Framework/WaitingTaskWithArenaHolder.h"
+
+namespace cms::cuda {
+  namespace impl {
+    class FwkContextHolderHelper {
+    public:
+      FwkContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device)
+        : waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {}
+
+      template <typename F>
+      void pushNextTask(F&& f);
+
+      void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+        waitingTaskHolder_ = std::move(waitingTaskHolder);
+      }
+
+      void enqueueCallback(cudaStream_t stream);
+
+    private:
+      edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
+      int device_;
+    };
+  }
+
+  /**
+   * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire():
+   * - setting the current device
+   * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+   */
+  class TaskContext : public impl::FwkContextBase {
+  public:
+    /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
+    explicit TaskContext(int device, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+        : FwkContextBase(device), holderHelper_{std::move(waitingTaskHolder), device} {}
+
+    ~TaskContext() = default;
+
+    template <typename F>
+    void pushNextTask(F&& f) {
+      holderHelper_.pushNextTask(std::forward<F>(f));
+    }
+
+    void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+      holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+    }
+
+    // Internal API
+    void commit();
+
+  private:
+    impl::FwkContextHolderHelper holderHelper_;
+  };
+
+  namespace impl {
+    template <typename F>
+    void FwkContextHolderHelper::pushNextTask(F&& f) {
+      replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{edm::make_waiting_task_with_holder(
+          tbb::task::allocate_root(),
+          std::move(waitingTaskHolder_),
+          [device = device_, func = std::forward<F>(f)](edm::WaitingTaskWithArenaHolder h) {
+            func(TaskContext{device, std::move(h)});
+          })});
+    }
+  }  // namespace impl
+}
+
+#endif

From e5b8bddea19f8ca28ec2dc859ebe78bc9d2c8ced Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Wed, 8 Sep 2021 18:32:51 -0700
Subject: [PATCH 08/12] Migrate code away from Context.h

---
 src/cudadev/CUDACore/EDProducer.h                              | 3 ++-
 src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc  | 1 -
 .../plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc       | 2 +-
 .../plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc        | 1 -
 .../plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc       | 1 -
 src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc | 1 -
 src/cudadev/plugin-Validation/CountValidator.cc                | 2 +-
 src/cudadev/plugin-Validation/HistoValidator.cc                | 1 -
 8 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/cudadev/CUDACore/EDProducer.h b/src/cudadev/CUDACore/EDProducer.h
index 6fe6962bd..92529c502 100644
--- a/src/cudadev/CUDACore/EDProducer.h
+++ b/src/cudadev/CUDACore/EDProducer.h
@@ -2,7 +2,8 @@
 #define HeterogeneousCore_CUDACore_stream_EDProducer_h
 
 #include "Framework/EDProducer.h"
-#include "CUDACore/Context.h"
+#include "CUDACore/AcquireContext.h"
+#include "CUDACore/ProduceContext.h"
 
 namespace cms::cuda {
   class EDProducer : public edm::EDProducer {
diff --git a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc
index 137bcb073..c9e0cc5d9 100644
--- a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc
+++ b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc
@@ -1,6 +1,5 @@
 #include <cuda_runtime.h>
 
-#include "CUDACore/Context.h"
 #include "CUDACore/EDProducer.h"
 #include "CUDACore/Product.h"
 #include "CUDACore/HostProduct.h"
diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc
index 50827a71a..d4ca2196b 100644
--- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc
+++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc
@@ -1,12 +1,12 @@
 #include <cuda_runtime.h>
 
 #include "CUDACore/Product.h"
+#include "CUDACore/ProduceContext.h"
 #include "Framework/EventSetup.h"
 #include "Framework/Event.h"
 #include "Framework/PluginFactory.h"
 #include "Framework/EDProducer.h"
 #include "Framework/RunningAverage.h"
-#include "CUDACore/Context.h"
 
 #include "gpuVertexFinder.h"
 
diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc
index 79ad0ca91..4f38c0ad8 100644
--- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc
+++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc
@@ -1,6 +1,5 @@
 #include <cuda_runtime.h>
 
-#include "CUDACore/Context.h"
 #include "CUDACore/EDProducer.h"
 #include "CUDACore/Product.h"
 #include "CUDACore/HostProduct.h"
diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
index 7a2875ad9..4f06bf540 100644
--- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
+++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
@@ -4,7 +4,6 @@
 #include <vector>
 
 // CMSSW includes
-#include "CUDACore/Context.h"
 #include "CUDACore/EDProducer.h"
 #include "CUDACore/Product.h"
 #include "CUDADataFormats/SiPixelClustersCUDA.h"
diff --git a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc
index 8b90c5326..8d1f8b3c4 100644
--- a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc
+++ b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc
@@ -1,4 +1,3 @@
-#include "CUDACore/Context.h"
 #include "CUDACore/EDProducer.h"
 #include "CUDACore/Product.h"
 #include "CUDADataFormats/SiPixelDigisCUDA.h"
diff --git a/src/cudadev/plugin-Validation/CountValidator.cc b/src/cudadev/plugin-Validation/CountValidator.cc
index 1993e20eb..bd9f40989 100644
--- a/src/cudadev/plugin-Validation/CountValidator.cc
+++ b/src/cudadev/plugin-Validation/CountValidator.cc
@@ -1,4 +1,4 @@
-#include "CUDACore/Context.h"
+#include "CUDACore/ProduceContext.h"
 #include "CUDACore/Product.h"
 #include "CUDADataFormats/PixelTrackHeterogeneous.h"
 #include "CUDADataFormats/SiPixelClustersCUDA.h"
diff --git a/src/cudadev/plugin-Validation/HistoValidator.cc b/src/cudadev/plugin-Validation/HistoValidator.cc
index c23bb42af..f8af1773d 100644
--- a/src/cudadev/plugin-Validation/HistoValidator.cc
+++ b/src/cudadev/plugin-Validation/HistoValidator.cc
@@ -1,4 +1,3 @@
-#include "CUDACore/Context.h"
 #include "CUDACore/EDProducer.h"
 #include "CUDACore/Product.h"
 #include "CUDADataFormats/PixelTrackHeterogeneous.h"

From eaf55b24d80adc9917a5653f1f5ef62a23ce1971 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Wed, 8 Sep 2021 18:33:11 -0700
Subject: [PATCH 09/12] Remove Context.h

---
 src/cudadev/CUDACore/Context.h | 7 -------
 1 file changed, 7 deletions(-)
 delete mode 100644 src/cudadev/CUDACore/Context.h

diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h
deleted file mode 100644
index c55fc59fb..000000000
--- a/src/cudadev/CUDACore/Context.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_Context_h
-#define HeterogeneousCore_CUDACore_Context_h
-
-#include "CUDACore/AcquireContext.h"
-#include "CUDACore/ProduceContext.h"
-#include "CUDACore/AnalyzeContext.h"
-#endif

From 744c94f783121adbd80cdaeb762d774fcbe0039b Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Fri, 3 Sep 2021 12:18:45 -0700
Subject: [PATCH 10/12] Add lightweight Context class with memory allocation
 oriented counterparts

---
 src/cudadev/CUDACore/Context.h        | 46 +++++++++++++++++++++++++++
 src/cudadev/CUDACore/FwkContextBase.h |  5 +++
 2 files changed, 51 insertions(+)
 create mode 100644 src/cudadev/CUDACore/Context.h

diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h
new file mode 100644
index 000000000..17e814bab
--- /dev/null
+++ b/src/cudadev/CUDACore/Context.h
@@ -0,0 +1,46 @@
+#ifndef HeterogeneousCore_CUDAUtilities_Context_h
+#define HeterogeneousCore_CUDAUtilities_Context_h
+
+#include "CUDACore/allocate_device.h"
+#include "CUDACore/allocate_host.h"
+
+namespace cms::cuda {
+  class HostAllocatorContext {
+  public:
+    explicit HostAllocatorContext(cudaStream_t stream) : stream_(stream) {}
+
+    void *allocate_host(size_t nbytes) const { return cms::cuda::allocate_host(nbytes, stream_); }
+
+    void free_host(void *ptr) const { cms::cuda::free_host(ptr); }
+
+  private:
+    cudaStream_t stream_;
+  };
+
+  class DeviceAllocatorContext {
+  public:
+    explicit DeviceAllocatorContext(cudaStream_t stream) : stream_(stream) {}
+
+    void *allocate_device(size_t nbytes) const { return cms::cuda::allocate_device(nbytes, stream_); }
+
+    void free_device(void *ptr) const { cms::cuda::free_device(ptr, stream_); }
+
+  private:
+    cudaStream_t stream_;
+  };
+
+  class Context {
+  public:
+    explicit Context(cudaStream_t stream) : stream_(stream) {}
+
+    cudaStream_t stream() const { return stream_; }
+
+    operator HostAllocatorContext() const { return HostAllocatorContext(stream()); }
+    operator DeviceAllocatorContext() const { return DeviceAllocatorContext(stream()); }
+
+  private:
+    cudaStream_t stream_;
+  };
+}  // namespace cms::cuda
+
+#endif
diff --git a/src/cudadev/CUDACore/FwkContextBase.h b/src/cudadev/CUDACore/FwkContextBase.h
index 1b329cb03..7fb82478a 100644
--- a/src/cudadev/CUDACore/FwkContextBase.h
+++ b/src/cudadev/CUDACore/FwkContextBase.h
@@ -1,6 +1,7 @@
 #ifndef HeterogeneousCore_CUDACore_FwkContextBase_h
 #define HeterogeneousCore_CUDACore_FwkContextBase_h
 
+#include "CUDACore/Context.h"
 #include "CUDACore/ProductBase.h"
 #include "CUDACore/SharedStreamPtr.h"
 #include "Framework/Event.h"
@@ -31,6 +32,10 @@ namespace cms::cuda::impl {
       return stream_->streamPtr();
     }
 
+    operator HostAllocatorContext() { return HostAllocatorContext(stream()); }
+    operator DeviceAllocatorContext() { return DeviceAllocatorContext(stream()); }
+    operator Context() { return Context(stream()); }
+
   protected:
     // The constructors set the current device, but the device
     // is not set back to the previous value at the destructor. This

From 0429ad5dfb6c4b8941b8753debb9030ba99d34ad Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Tue, 7 Sep 2021 14:06:43 -0700
Subject: [PATCH 11/12] Add TestContext for tests

---
 src/cudadev/CUDACore/TestContext.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 src/cudadev/CUDACore/TestContext.h

diff --git a/src/cudadev/CUDACore/TestContext.h b/src/cudadev/CUDACore/TestContext.h
new file mode 100644
index 000000000..ce0c8c35a
--- /dev/null
+++ b/src/cudadev/CUDACore/TestContext.h
@@ -0,0 +1,22 @@
+#ifndef HeterogeneousCore_CUDAUtilities_TestContext_h
+#define HeterogeneousCore_CUDAUtilities_TestContext_h
+
+#include "CUDACore/Context.h"
+#include "CUDACore/currentDevice.h"
+
+namespace cms::cudatest {
+  class TestContext {
+  public:
+    TestContext() : TestContext(cudaStreamDefault) {}
+    explicit TestContext(cudaStream_t stream) : stream_{stream} {}
+
+    operator cms::cuda::HostAllocatorContext() const { return cms::cuda::HostAllocatorContext(stream_); }
+    operator cms::cuda::DeviceAllocatorContext() const { return cms::cuda::DeviceAllocatorContext(stream_); }
+    operator cms::cuda::Context() const { return cms::cuda::Context(stream_); }
+
+  private:
+    cudaStream_t stream_;
+  };
+}  // namespace cms::cudatest
+
+#endif

From c91e0e309db65d7aca1094021a68011ecdfc4d8b Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Tue, 7 Sep 2021 14:07:36 -0700
Subject: [PATCH 12/12] Migrate make_(device|host)_unique and all calling code
 to pass cms::cuda::Context objects instead of cudaStream_t

---
 src/cudadev/CUDACore/device_unique_ptr.h      |  41 ++---
 src/cudadev/CUDACore/host_unique_ptr.h        |  40 +++--
 src/cudadev/CUDADataFormats/BeamSpotCUDA.h    |   4 +-
 .../CUDADataFormats/HeterogeneousSoA.h        |  66 ++++----
 .../CUDADataFormats/SiPixelClustersCUDA.cc    |  16 +-
 .../CUDADataFormats/SiPixelClustersCUDA.h     |   2 +-
 .../CUDADataFormats/SiPixelDigiErrorsCUDA.cc  |  24 +--
 .../CUDADataFormats/SiPixelDigiErrorsCUDA.h   |   6 +-
 .../CUDADataFormats/SiPixelDigisCUDA.cc       |  46 +++---
 .../CUDADataFormats/SiPixelDigisCUDA.h        |  10 +-
 .../TrackingRecHit2DHeterogeneous.cc          |  38 +++--
 .../TrackingRecHit2DHeterogeneous.h           |  31 ++--
 .../plugin-BeamSpotProducer/BeamSpotToCUDA.cc |   2 +-
 .../PixelTrackSoAFromCUDA.cc                  |   2 +-
 .../BrokenLineFitOnGPU.cu                     |  82 +++++-----
 .../plugin-PixelTriplets/CAHitNtupletCUDA.cc  |   2 +-
 .../CAHitNtupletGeneratorKernels.cc           |   2 +-
 .../CAHitNtupletGeneratorKernels.cu           |  44 ++---
 .../CAHitNtupletGeneratorKernels.h            |   4 +-
 .../CAHitNtupletGeneratorKernelsAlloc.h       |  22 +--
 .../CAHitNtupletGeneratorOnGPU.cc             |  23 +--
 .../CAHitNtupletGeneratorOnGPU.h              |   4 +-
 .../plugin-PixelTriplets/HelixFitOnGPU.h      |  10 +-
 .../plugin-PixelTriplets/RiemannFitOnGPU.cu   | 154 +++++++++---------
 .../PixelVertexProducerCUDA.cc                |   2 +-
 .../PixelVertexSoAFromCUDA.cc                 |   2 +-
 .../gpuVertexFinder.cc                        |  33 ++--
 .../gpuVertexFinder.h                         |   2 +-
 .../SiPixelRawToClusterCUDA.cc                |   2 +-
 .../SiPixelRawToClusterGPUKernel.cu           |  84 +++++-----
 .../SiPixelRawToClusterGPUKernel.h            |   2 +-
 .../SiPixelDigisSoAFromCUDA.cc                |   8 +-
 .../PixelRecHitGPUKernel.cu                   |  18 +-
 .../PixelRecHitGPUKernel.h                    |   2 +-
 .../SiPixelRecHitCUDA.cc                      |   5 +-
 .../plugin-Validation/HistoValidator.cc       |  12 +-
 src/cudadev/test/HistoContainer_t.cu          |   8 +-
 src/cudadev/test/OneHistoContainer_t.cu       |   4 +-
 src/cudadev/test/OneToManyAssoc_t.h           |   7 +-
 src/cudadev/test/TrackingRecHit2DCUDA_t.cu    |   4 +-
 src/cudadev/test/VertexFinder_t.h             |   6 +-
 src/cudadev/test/gpuClustering_t.h            |  18 +-
 src/cudadev/test/radixSort_t.cu               |  10 +-
 43 files changed, 477 insertions(+), 427 deletions(-)

diff --git a/src/cudadev/CUDACore/device_unique_ptr.h b/src/cudadev/CUDACore/device_unique_ptr.h
index ab5d6bc25..4736bdaad 100644
--- a/src/cudadev/CUDACore/device_unique_ptr.h
+++ b/src/cudadev/CUDACore/device_unique_ptr.h
@@ -3,11 +3,9 @@
 
 #include <functional>
 #include <memory>
+#include <optional>
 
-#include <cuda_runtime.h>
-
-#include "CUDACore/allocate_device.h"
-#include "CUDACore/currentDevice.h"
+#include "CUDACore/Context.h"
 
 namespace cms {
   namespace cuda {
@@ -17,14 +15,16 @@ namespace cms {
         class DeviceDeleter {
         public:
           DeviceDeleter() = default;  // for edm::Wrapper
-          DeviceDeleter(cudaStream_t stream) : stream_{stream} {}
+          DeviceDeleter(DeviceAllocatorContext const &ctx) : ctx_(ctx) {}
 
           void operator()(void *ptr) {
-            free_device(ptr, stream_);
+            if (ctx_) {
+              ctx_->free_device(ptr);
+            }
           }
 
         private:
-          cudaStream_t stream_ = cudaStreamDefault;
+          std::optional<DeviceAllocatorContext> ctx_;
         };
       }  // namespace impl
 
@@ -48,23 +48,24 @@ namespace cms {
     }    // namespace device
 
     template <typename T>
-    typename device::impl::make_device_unique_selector<T>::non_array make_device_unique(cudaStream_t stream) {
+    typename device::impl::make_device_unique_selector<T>::non_array make_device_unique(
+        DeviceAllocatorContext const &ctx) {
       static_assert(std::is_trivially_constructible<T>::value,
                     "Allocating with non-trivial constructor on the device memory is not supported");
-      void *mem = allocate_device(sizeof(T), stream);
+      void *mem = ctx.allocate_device(sizeof(T));
       return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
-                                                                              device::impl::DeviceDeleter{stream}};
+                                                                              device::impl::DeviceDeleter{ctx}};
     }
 
     template <typename T>
-    typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(size_t n,
-                                                                                              cudaStream_t stream) {
+    typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(
+        size_t n, DeviceAllocatorContext const &ctx) {
       using element_type = typename std::remove_extent<T>::type;
       static_assert(std::is_trivially_constructible<element_type>::value,
                     "Allocating with non-trivial constructor on the device memory is not supported");
-      void *mem = allocate_device(n * sizeof(element_type), stream);
+      void *mem = ctx.allocate_device(n * sizeof(element_type));
       return typename device::impl::make_device_unique_selector<T>::unbounded_array{
-          reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{stream}};
+          reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{ctx}};
     }
 
     template <typename T, typename... Args>
@@ -73,19 +74,19 @@ namespace cms {
     // No check for the trivial constructor, make it clear in the interface
     template <typename T>
     typename device::impl::make_device_unique_selector<T>::non_array make_device_unique_uninitialized(
-        cudaStream_t stream) {
-      void *mem = allocate_device(sizeof(T), stream);
+        DeviceAllocatorContext const &ctx) {
+      void *mem = ctx.allocate_device(sizeof(T));
       return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
-                                                                              device::impl::DeviceDeleter{stream}};
+                                                                              device::impl::DeviceDeleter{ctx}};
     }
 
     template <typename T>
     typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique_uninitialized(
-        size_t n, cudaStream_t stream) {
+        size_t n, DeviceAllocatorContext const &ctx) {
       using element_type = typename std::remove_extent<T>::type;
-      void *mem = allocate_device(n * sizeof(element_type), stream);
+      void *mem = ctx.allocate_device(n * sizeof(element_type));
       return typename device::impl::make_device_unique_selector<T>::unbounded_array{
-          reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{stream}};
+          reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{ctx}};
     }
 
     template <typename T, typename... Args>
diff --git a/src/cudadev/CUDACore/host_unique_ptr.h b/src/cudadev/CUDACore/host_unique_ptr.h
index f34798da3..2e8240476 100644
--- a/src/cudadev/CUDACore/host_unique_ptr.h
+++ b/src/cudadev/CUDACore/host_unique_ptr.h
@@ -3,8 +3,9 @@
 
 #include <memory>
 #include <functional>
+#include <optional>
 
-#include "CUDACore/allocate_host.h"
+#include "CUDACore/Context.h"
 
 namespace cms {
   namespace cuda {
@@ -13,7 +14,12 @@ namespace cms {
         // Additional layer of types to distinguish from host::unique_ptr
         class HostDeleter {
         public:
-          void operator()(void *ptr) { cms::cuda::free_host(ptr); }
+          HostDeleter() = default;  // for edm::Wrapper
+          HostDeleter(HostAllocatorContext const &ctx) : ctx_(ctx) {}
+          void operator()(void *ptr) { ctx_->free_host(ptr); }
+
+        private:
+          std::optional<HostAllocatorContext> ctx_;
         };
       }  // namespace impl
 
@@ -38,20 +44,23 @@ namespace cms {
 
     // Allocate pinned host memory
     template <typename T>
-    typename host::impl::make_host_unique_selector<T>::non_array make_host_unique(cudaStream_t stream) {
+    typename host::impl::make_host_unique_selector<T>::non_array make_host_unique(HostAllocatorContext const &ctx) {
       static_assert(std::is_trivially_constructible<T>::value,
                     "Allocating with non-trivial constructor on the pinned host memory is not supported");
-      void *mem = allocate_host(sizeof(T), stream);
-      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem)};
+      void *mem = ctx.allocate_host(sizeof(T));
+      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
+                                                                          host::impl::HostDeleter{ctx}};
     }
 
     template <typename T>
-    typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique(size_t n, cudaStream_t stream) {
+    typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique(
+        size_t n, HostAllocatorContext const &ctx) {
       using element_type = typename std::remove_extent<T>::type;
       static_assert(std::is_trivially_constructible<element_type>::value,
                     "Allocating with non-trivial constructor on the pinned host memory is not supported");
-      void *mem = allocate_host(n * sizeof(element_type), stream);
-      return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem)};
+      void *mem = ctx.allocate_host(n * sizeof(element_type));
+      return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem),
+                                                                                host::impl::HostDeleter{ctx}};
     }
 
     template <typename T, typename... Args>
@@ -59,17 +68,20 @@ namespace cms {
 
     // No check for the trivial constructor, make it clear in the interface
     template <typename T>
-    typename host::impl::make_host_unique_selector<T>::non_array make_host_unique_uninitialized(cudaStream_t stream) {
-      void *mem = allocate_host(sizeof(T), stream);
-      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem)};
+    typename host::impl::make_host_unique_selector<T>::non_array make_host_unique_uninitialized(
+        HostAllocatorContext const &ctx) {
+      void *mem = ctx.allocate_host(sizeof(T));
+      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
+                                                                          host::impl::HostDeleter{ctx}};
     }
 
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique_uninitialized(
-        size_t n, cudaStream_t stream) {
+        size_t n, HostAllocatorContext const &ctx) {
       using element_type = typename std::remove_extent<T>::type;
-      void *mem = allocate_host(n * sizeof(element_type), stream);
-      return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem)};
+      void *mem = ctx.allocate_host(n * sizeof(element_type));
+      return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem),
+                                                                                host::impl::HostDeleter{ctx}};
     }
 
     template <typename T, typename... Args>
diff --git a/src/cudadev/CUDADataFormats/BeamSpotCUDA.h b/src/cudadev/CUDADataFormats/BeamSpotCUDA.h
index a090ef347..0e9311ea1 100644
--- a/src/cudadev/CUDADataFormats/BeamSpotCUDA.h
+++ b/src/cudadev/CUDADataFormats/BeamSpotCUDA.h
@@ -12,7 +12,9 @@ class BeamSpotCUDA {
   BeamSpotCUDA() = default;
 
   // constructor that allocates cached device memory on the given CUDA stream
-  BeamSpotCUDA(cudaStream_t stream) { data_d_ = cms::cuda::make_device_unique<BeamSpotPOD>(stream); }
+  BeamSpotCUDA(cms::cuda::DeviceAllocatorContext const& ctx) {
+    data_d_ = cms::cuda::make_device_unique<BeamSpotPOD>(ctx);
+  }
 
   // movable, non-copiable
   BeamSpotCUDA(BeamSpotCUDA const&) = delete;
diff --git a/src/cudadev/CUDADataFormats/HeterogeneousSoA.h b/src/cudadev/CUDADataFormats/HeterogeneousSoA.h
index cfaad449c..96ade9feb 100644
--- a/src/cudadev/CUDADataFormats/HeterogeneousSoA.h
+++ b/src/cudadev/CUDADataFormats/HeterogeneousSoA.h
@@ -36,10 +36,10 @@ class HeterogeneousSoA {
   auto *operator->() { return get(); }
 
   // in reality valid only for GPU version...
-  cms::cuda::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const {
+  cms::cuda::host::unique_ptr<T> toHostAsync(cms::cuda::Context const &ctx) const {
     assert(dm_ptr);
-    auto ret = cms::cuda::make_host_unique<T>(stream);
-    cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream));
+    auto ret = cms::cuda::make_host_unique<T>(ctx);
+    cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, ctx.stream()));
     return ret;
   }
 
@@ -58,28 +58,28 @@ namespace cms {
       using unique_ptr = cms::cuda::device::unique_ptr<T>;
 
       template <typename T>
-      static auto make_unique(cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(stream);
+      static auto make_unique(cms::cuda::DeviceAllocatorContext const &ctx) {
+        return cms::cuda::make_device_unique<T>(ctx);
       }
 
       template <typename T>
-      static auto make_unique(size_t size, cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(size, stream);
+      static auto make_unique(size_t size, cms::cuda::DeviceAllocatorContext const &ctx) {
+        return cms::cuda::make_device_unique<T>(size, ctx);
       }
 
       template <typename T>
-      static auto make_host_unique(cudaStream_t stream) {
-        return cms::cuda::make_host_unique<T>(stream);
+      static auto make_host_unique(cms::cuda::HostAllocatorContext const &ctx) {
+        return cms::cuda::make_host_unique<T>(ctx);
       }
 
       template <typename T>
-      static auto make_device_unique(cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(stream);
+      static auto make_device_unique(cms::cuda::DeviceAllocatorContext const &ctx) {
+        return cms::cuda::make_device_unique<T>(ctx);
       }
 
       template <typename T>
-      static auto make_device_unique(size_t size, cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(size, stream);
+      static auto make_device_unique(size_t size, cms::cuda::DeviceAllocatorContext const &ctx) {
+        return cms::cuda::make_device_unique<T>(size, ctx);
       }
     };
 
@@ -88,23 +88,23 @@ namespace cms {
       using unique_ptr = cms::cuda::host::unique_ptr<T>;
 
       template <typename T>
-      static auto make_unique(cudaStream_t stream) {
-        return cms::cuda::make_host_unique<T>(stream);
+      static auto make_unique(cms::cuda::HostAllocatorContext const &ctx) {
+        return cms::cuda::make_host_unique<T>(ctx);
       }
 
       template <typename T>
-      static auto make_host_unique(cudaStream_t stream) {
-        return cms::cuda::make_host_unique<T>(stream);
+      static auto make_host_unique(cms::cuda::HostAllocatorContext const &ctx) {
+        return cms::cuda::make_host_unique<T>(ctx);
       }
 
       template <typename T>
-      static auto make_device_unique(cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(stream);
+      static auto make_device_unique(cms::cuda::DeviceAllocatorContext const &ctx) {
+        return cms::cuda::make_device_unique<T>(ctx);
       }
 
       template <typename T>
-      static auto make_device_unique(size_t size, cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(size, stream);
+      static auto make_device_unique(size_t size, cms::cuda::DeviceAllocatorContext const &ctx) {
+        return cms::cuda::make_device_unique<T>(size, ctx);
       }
     };
 
@@ -113,27 +113,27 @@ namespace cms {
       using unique_ptr = std::unique_ptr<T>;
 
       template <typename T>
-      static auto make_unique(cudaStream_t) {
+      static auto make_unique(cms::cuda::DeviceAllocatorContext const &) {
         return std::make_unique<T>();
       }
 
       template <typename T>
-      static auto make_unique(size_t size, cudaStream_t) {
+      static auto make_unique(size_t size, cms::cuda::DeviceAllocatorContext const &) {
         return std::make_unique<T>(size);
       }
 
       template <typename T>
-      static auto make_host_unique(cudaStream_t) {
+      static auto make_host_unique(cms::cuda::HostAllocatorContext const &) {
         return std::make_unique<T>();
       }
 
       template <typename T>
-      static auto make_device_unique(cudaStream_t) {
+      static auto make_device_unique(cms::cuda::DeviceAllocatorContext const &) {
         return std::make_unique<T>();
       }
 
       template <typename T>
-      static auto make_device_unique(size_t size, cudaStream_t) {
+      static auto make_device_unique(size_t size, cms::cuda::DeviceAllocatorContext const &) {
         return std::make_unique<T>(size);
       }
     };
@@ -154,28 +154,28 @@ class HeterogeneousSoAImpl {
   HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default;
 
   explicit HeterogeneousSoAImpl(unique_ptr<T> &&p) : m_ptr(std::move(p)) {}
-  explicit HeterogeneousSoAImpl(cudaStream_t stream);
+  explicit HeterogeneousSoAImpl(cms::cuda::DeviceAllocatorContext const &ctx);
 
   T const *get() const { return m_ptr.get(); }
 
   T *get() { return m_ptr.get(); }
 
-  cms::cuda::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const;
+  cms::cuda::host::unique_ptr<T> toHostAsync(cms::cuda::Context const &ctx) const;
 
 private:
   unique_ptr<T> m_ptr;  //!
 };
 
 template <typename T, typename Traits>
-HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl(cudaStream_t stream) {
-  m_ptr = Traits::template make_unique<T>(stream);
+HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl(cms::cuda::DeviceAllocatorContext const &ctx) {
+  m_ptr = Traits::template make_unique<T>(ctx);
 }
 
 // in reality valid only for GPU version...
 template <typename T, typename Traits>
-cms::cuda::host::unique_ptr<T> HeterogeneousSoAImpl<T, Traits>::toHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<T>(stream);
-  cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream));
+cms::cuda::host::unique_ptr<T> HeterogeneousSoAImpl<T, Traits>::toHostAsync(cms::cuda::Context const &ctx) const {
+  auto ret = cms::cuda::make_host_unique<T>(ctx);
+  cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, ctx.stream()));
   return ret;
 }
 
diff --git a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc
index a9feabb92..3d17c8371 100644
--- a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc
+++ b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc
@@ -3,17 +3,17 @@
 #include "CUDACore/host_unique_ptr.h"
 #include "CUDADataFormats/SiPixelClustersCUDA.h"
 
-SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream)
-    : moduleStart_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules + 1, stream)),
-      clusInModule_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules, stream)),
-      moduleId_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules, stream)),
-      clusModuleStart_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules + 1, stream)) {
-  auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
+SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cms::cuda::Context const& ctx)
+    : moduleStart_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules + 1, ctx)),
+      clusInModule_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules, ctx)),
+      moduleId_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules, ctx)),
+      clusModuleStart_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules + 1, ctx)) {
+  auto view = cms::cuda::make_host_unique<DeviceConstView>(ctx);
   view->moduleStart_ = moduleStart_d.get();
   view->clusInModule_ = clusInModule_d.get();
   view->moduleId_ = moduleId_d.get();
   view->clusModuleStart_ = clusModuleStart_d.get();
 
-  view_d = cms::cuda::make_device_unique<DeviceConstView>(stream);
-  cms::cuda::copyAsync(view_d, view, stream);
+  view_d = cms::cuda::make_device_unique<DeviceConstView>(ctx);
+  cms::cuda::copyAsync(view_d, view, ctx.stream());
 }
diff --git a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h
index e93b742cf..ac06f9fb0 100644
--- a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h
+++ b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h
@@ -10,7 +10,7 @@
 class SiPixelClustersCUDA {
 public:
   SiPixelClustersCUDA() = default;
-  explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream);
+  explicit SiPixelClustersCUDA(size_t maxModules, cms::cuda::Context const &ctx);
   ~SiPixelClustersCUDA() = default;
 
   SiPixelClustersCUDA(const SiPixelClustersCUDA &) = delete;
diff --git a/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.cc
index d79942608..c2fe7bf8c 100644
--- a/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.cc
+++ b/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.cc
@@ -6,33 +6,35 @@
 #include "CUDACore/memsetAsync.h"
 #include "CUDADataFormats/SiPixelDigiErrorsCUDA.h"
 
-SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream)
-    : data_d(cms::cuda::make_device_unique<SiPixelErrorCompact[]>(maxFedWords, stream)),
-      error_d(cms::cuda::make_device_unique<SiPixelErrorCompactVector>(stream)),
-      error_h(cms::cuda::make_host_unique<SiPixelErrorCompactVector>(stream)),
+SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords,
+                                             SiPixelFormatterErrors errors,
+                                             cms::cuda::Context const& ctx)
+    : data_d(cms::cuda::make_device_unique<SiPixelErrorCompact[]>(maxFedWords, ctx)),
+      error_d(cms::cuda::make_device_unique<SiPixelErrorCompactVector>(ctx)),
+      error_h(cms::cuda::make_host_unique<SiPixelErrorCompactVector>(ctx)),
       formatterErrors_h(std::move(errors)) {
-  cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, stream);
+  cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, ctx.stream());
 
   cms::cuda::make_SimpleVector(error_h.get(), maxFedWords, data_d.get());
   assert(error_h->empty());
   assert(error_h->capacity() == static_cast<int>(maxFedWords));
 
-  cms::cuda::copyAsync(error_d, error_h, stream);
+  cms::cuda::copyAsync(error_d, error_h, ctx.stream());
 }
 
-void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cudaStream_t stream) {
-  cms::cuda::copyAsync(error_h, error_d, stream);
+void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cms::cuda::Context const& ctx) {
+  cms::cuda::copyAsync(error_h, error_d, ctx.stream());
 }
 
-SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cudaStream_t stream) const {
+SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cms::cuda::Context const& ctx) const {
   // On one hand size() could be sufficient. On the other hand, if
   // someone copies the SimpleVector<>, (s)he might expect the data
   // buffer to actually have space for capacity() elements.
-  auto data = cms::cuda::make_host_unique<SiPixelErrorCompact[]>(error_h->capacity(), stream);
+  auto data = cms::cuda::make_host_unique<SiPixelErrorCompact[]>(error_h->capacity(), ctx);
 
   // but transfer only the required amount
   if (not error_h->empty()) {
-    cms::cuda::copyAsync(data, data_d, error_h->size(), stream);
+    cms::cuda::copyAsync(data, data_d, error_h->size(), ctx.stream());
   }
   auto err = *error_h;
   err.set_data(data.get());
diff --git a/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.h b/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.h
index 442a66f92..4c277d001 100644
--- a/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.h
+++ b/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.h
@@ -14,7 +14,7 @@ class SiPixelDigiErrorsCUDA {
   using SiPixelErrorCompactVector = cms::cuda::SimpleVector<SiPixelErrorCompact>;
 
   SiPixelDigiErrorsCUDA() = default;
-  explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream);
+  explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cms::cuda::Context const& ctx);
   ~SiPixelDigiErrorsCUDA() = default;
 
   SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete;
@@ -28,9 +28,9 @@ class SiPixelDigiErrorsCUDA {
   SiPixelErrorCompactVector const* error() const { return error_d.get(); }
 
   using HostDataError = std::pair<SiPixelErrorCompactVector, cms::cuda::host::unique_ptr<SiPixelErrorCompact[]>>;
-  HostDataError dataErrorToHostAsync(cudaStream_t stream) const;
+  HostDataError dataErrorToHostAsync(cms::cuda::Context const& ctx) const;
 
-  void copyErrorToHostAsync(cudaStream_t stream);
+  void copyErrorToHostAsync(cms::cuda::Context const& ctx);
 
 private:
   cms::cuda::device::unique_ptr<SiPixelErrorCompact[]> data_d;
diff --git a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc
index fd87fee56..9be592dc1 100644
--- a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc
+++ b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc
@@ -4,45 +4,45 @@
 #include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/host_unique_ptr.h"
 
-SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream)
-    : xx_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream)),
-      yy_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream)),
-      adc_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream)),
-      moduleInd_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream)),
-      clus_d(cms::cuda::make_device_unique<int32_t[]>(maxFedWords, stream)),
-      view_d(cms::cuda::make_device_unique<DeviceConstView>(stream)),
-      pdigi_d(cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream)),
-      rawIdArr_d(cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream)) {
-  auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
+SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cms::cuda::Context const& ctx)
+    : xx_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, ctx)),
+      yy_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, ctx)),
+      adc_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, ctx)),
+      moduleInd_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, ctx)),
+      clus_d(cms::cuda::make_device_unique<int32_t[]>(maxFedWords, ctx)),
+      view_d(cms::cuda::make_device_unique<DeviceConstView>(ctx)),
+      pdigi_d(cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, ctx)),
+      rawIdArr_d(cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, ctx)) {
+  auto view = cms::cuda::make_host_unique<DeviceConstView>(ctx);
   view->xx_ = xx_d.get();
   view->yy_ = yy_d.get();
   view->adc_ = adc_d.get();
   view->moduleInd_ = moduleInd_d.get();
   view->clus_ = clus_d.get();
 
-  cms::cuda::copyAsync(view_d, view, stream);
+  cms::cuda::copyAsync(view_d, view, ctx.stream());
 }
 
-cms::cuda::host::unique_ptr<uint16_t[]> SiPixelDigisCUDA::adcToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<uint16_t[]>(nDigis(), stream);
-  cms::cuda::copyAsync(ret, adc_d, nDigis(), stream);
+cms::cuda::host::unique_ptr<uint16_t[]> SiPixelDigisCUDA::adcToHostAsync(cms::cuda::Context const& ctx) const {
+  auto ret = cms::cuda::make_host_unique<uint16_t[]>(nDigis(), ctx);
+  cms::cuda::copyAsync(ret, adc_d, nDigis(), ctx.stream());
   return ret;
 }
 
-cms::cuda::host::unique_ptr<int32_t[]> SiPixelDigisCUDA::clusToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<int32_t[]>(nDigis(), stream);
-  cms::cuda::copyAsync(ret, clus_d, nDigis(), stream);
+cms::cuda::host::unique_ptr<int32_t[]> SiPixelDigisCUDA::clusToHostAsync(cms::cuda::Context const& ctx) const {
+  auto ret = cms::cuda::make_host_unique<int32_t[]>(nDigis(), ctx);
+  cms::cuda::copyAsync(ret, clus_d, nDigis(), ctx.stream());
   return ret;
 }
 
-cms::cuda::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::pdigiToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<uint32_t[]>(nDigis(), stream);
-  cms::cuda::copyAsync(ret, pdigi_d, nDigis(), stream);
+cms::cuda::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::pdigiToHostAsync(cms::cuda::Context const& ctx) const {
+  auto ret = cms::cuda::make_host_unique<uint32_t[]>(nDigis(), ctx);
+  cms::cuda::copyAsync(ret, pdigi_d, nDigis(), ctx.stream());
   return ret;
 }
 
-cms::cuda::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::rawIdArrToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<uint32_t[]>(nDigis(), stream);
-  cms::cuda::copyAsync(ret, rawIdArr_d, nDigis(), stream);
+cms::cuda::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::rawIdArrToHostAsync(cms::cuda::Context const& ctx) const {
+  auto ret = cms::cuda::make_host_unique<uint32_t[]>(nDigis(), ctx);
+  cms::cuda::copyAsync(ret, rawIdArr_d, nDigis(), ctx.stream());
   return ret;
 }
diff --git a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h
index 03ae6639a..e0f95ca34 100644
--- a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h
+++ b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h
@@ -10,7 +10,7 @@
 class SiPixelDigisCUDA {
 public:
   SiPixelDigisCUDA() = default;
-  explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream);
+  explicit SiPixelDigisCUDA(size_t maxFedWords, cms::cuda::Context const &ctx);
   ~SiPixelDigisCUDA() = default;
 
   SiPixelDigisCUDA(const SiPixelDigisCUDA &) = delete;
@@ -42,10 +42,10 @@ class SiPixelDigisCUDA {
   uint32_t const *pdigi() const { return pdigi_d.get(); }
   uint32_t const *rawIdArr() const { return rawIdArr_d.get(); }
 
-  cms::cuda::host::unique_ptr<uint16_t[]> adcToHostAsync(cudaStream_t stream) const;
-  cms::cuda::host::unique_ptr<int32_t[]> clusToHostAsync(cudaStream_t stream) const;
-  cms::cuda::host::unique_ptr<uint32_t[]> pdigiToHostAsync(cudaStream_t stream) const;
-  cms::cuda::host::unique_ptr<uint32_t[]> rawIdArrToHostAsync(cudaStream_t stream) const;
+  cms::cuda::host::unique_ptr<uint16_t[]> adcToHostAsync(cms::cuda::Context const &ctx) const;
+  cms::cuda::host::unique_ptr<int32_t[]> clusToHostAsync(cms::cuda::Context const &ctx) const;
+  cms::cuda::host::unique_ptr<uint32_t[]> pdigiToHostAsync(cms::cuda::Context const &ctx) const;
+  cms::cuda::host::unique_ptr<uint32_t[]> rawIdArrToHostAsync(cms::cuda::Context const &ctx) const;
 
   class DeviceConstView {
   public:
diff --git a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc
index 5c1aacaf4..611bbb101 100644
--- a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc
+++ b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc
@@ -5,40 +5,44 @@
 #include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h"
 
 template <>
-cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DCUDA::localCoordToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<float[]>(4 * nHits(), stream);
-  cms::cuda::copyAsync(ret, m_store32, 4 * nHits(), stream);
+cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DCUDA::localCoordToHostAsync(cms::cuda::Context const& ctx) const {
+  auto ret = cms::cuda::make_host_unique<float[]>(4 * nHits(), ctx);
+  cms::cuda::copyAsync(ret, m_store32, 4 * nHits(), ctx.stream());
   return ret;
 }
 
 template <>
-cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<uint32_t[]>(gpuClustering::maxNumModules + 1, stream);
-  cudaCheck(cudaMemcpyAsync(
-      ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (gpuClustering::maxNumModules + 1), cudaMemcpyDefault, stream));
+cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(
+    cms::cuda::Context const& ctx) const {
+  auto ret = cms::cuda::make_host_unique<uint32_t[]>(gpuClustering::maxNumModules + 1, ctx);
+  cudaCheck(cudaMemcpyAsync(ret.get(),
+                            m_hitsModuleStart,
+                            sizeof(uint32_t) * (gpuClustering::maxNumModules + 1),
+                            cudaMemcpyDefault,
+                            ctx.stream()));
   return ret;
 }
 
 template <>
-cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DCUDA::globalCoordToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<float[]>(4 * nHits(), stream);
+cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DCUDA::globalCoordToHostAsync(cms::cuda::Context const& ctx) const {
+  auto ret = cms::cuda::make_host_unique<float[]>(4 * nHits(), ctx);
   cudaCheck(cudaMemcpyAsync(
-      ret.get(), m_store32.get() + 4 * nHits(), 4 * nHits() * sizeof(float), cudaMemcpyDefault, stream));
+      ret.get(), m_store32.get() + 4 * nHits(), 4 * nHits() * sizeof(float), cudaMemcpyDefault, ctx.stream()));
   return ret;
 }
 
 template <>
-cms::cuda::host::unique_ptr<int32_t[]> TrackingRecHit2DCUDA::chargeToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<int32_t[]>(nHits(), stream);
-  cudaCheck(
-      cudaMemcpyAsync(ret.get(), m_store32.get() + 8 * nHits(), nHits() * sizeof(int32_t), cudaMemcpyDefault, stream));
+cms::cuda::host::unique_ptr<int32_t[]> TrackingRecHit2DCUDA::chargeToHostAsync(cms::cuda::Context const& ctx) const {
+  auto ret = cms::cuda::make_host_unique<int32_t[]>(nHits(), ctx);
+  cudaCheck(cudaMemcpyAsync(
+      ret.get(), m_store32.get() + 8 * nHits(), nHits() * sizeof(int32_t), cudaMemcpyDefault, ctx.stream()));
   return ret;
 }
 
 template <>
-cms::cuda::host::unique_ptr<int16_t[]> TrackingRecHit2DCUDA::sizeToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<int16_t[]>(2 * nHits(), stream);
+cms::cuda::host::unique_ptr<int16_t[]> TrackingRecHit2DCUDA::sizeToHostAsync(cms::cuda::Context const& ctx) const {
+  auto ret = cms::cuda::make_host_unique<int16_t[]>(2 * nHits(), ctx);
   cudaCheck(cudaMemcpyAsync(
-      ret.get(), m_store16.get() + 2 * nHits(), 2 * nHits() * sizeof(int16_t), cudaMemcpyDefault, stream));
+      ret.get(), m_store16.get() + 2 * nHits(), 2 * nHits() * sizeof(int16_t), cudaMemcpyDefault, ctx.stream()));
   return ret;
 }
diff --git a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h
index 7a19299a9..e3e706b08 100644
--- a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h
+++ b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h
@@ -17,7 +17,7 @@ class TrackingRecHit2DHeterogeneous {
   explicit TrackingRecHit2DHeterogeneous(uint32_t nHits,
                                          pixelCPEforGPU::ParamsOnGPU const* cpeParams,
                                          uint32_t const* hitsModuleStart,
-                                         cudaStream_t stream);
+                                         cms::cuda::Context const& ctx);
 
   ~TrackingRecHit2DHeterogeneous() = default;
 
@@ -38,13 +38,13 @@ class TrackingRecHit2DHeterogeneous {
   auto iphi() { return m_iphi; }
 
   // only the local coord and detector index
-  cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const;
-  cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;
+  cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cms::cuda::Context const& ctx) const;
+  cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cms::cuda::Context const& ctx) const;
 
   // for validation
-  cms::cuda::host::unique_ptr<float[]> globalCoordToHostAsync(cudaStream_t stream) const;
-  cms::cuda::host::unique_ptr<int32_t[]> chargeToHostAsync(cudaStream_t stream) const;
-  cms::cuda::host::unique_ptr<int16_t[]> sizeToHostAsync(cudaStream_t stream) const;
+  cms::cuda::host::unique_ptr<float[]> globalCoordToHostAsync(cms::cuda::Context const& ctx) const;
+  cms::cuda::host::unique_ptr<int32_t[]> chargeToHostAsync(cms::cuda::Context const& ctx) const;
+  cms::cuda::host::unique_ptr<int16_t[]> sizeToHostAsync(cms::cuda::Context const& ctx) const;
 
 private:
   static constexpr uint32_t n16 = 4;                 // number of elements in m_store16
@@ -77,13 +77,13 @@ template <typename Traits>
 TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nHits,
                                                                      pixelCPEforGPU::ParamsOnGPU const* cpeParams,
                                                                      uint32_t const* hitsModuleStart,
-                                                                     cudaStream_t stream)
+                                                                     cms::cuda::Context const& ctx)
     : m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) {
-  auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
+  auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(ctx);
 
   view->m_nHits = nHits;
-  m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>(stream);
-  m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
+  m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>(ctx);
+  m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>(ctx);
   view->m_averageGeometry = m_AverageGeometryStore.get();
   view->m_cpeParams = cpeParams;
   view->m_hitsModuleStart = hitsModuleStart;
@@ -91,7 +91,7 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
   // if empy do not bother
   if (0 == nHits) {
     if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
-      cms::cuda::copyAsync(m_view, view, stream);
+      cms::cuda::copyAsync(m_view, view, ctx.stream());
     } else {
       m_view.reset(view.release());  // NOLINT: std::move() breaks CUDA version
     }
@@ -103,10 +103,9 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
   // if ordering is relevant they may have to be stored phi-ordered by layer or so
   // this will break 1to1 correspondence with cluster and module locality
   // so unless proven VERY inefficient we keep it ordered as generated
-  m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16, stream);
-  m_store32 =
-      Traits::template make_device_unique<float[]>(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream);
-  m_PhiBinnerStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
+  m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16, ctx);
+  m_store32 = Traits::template make_device_unique<float[]>(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, ctx);
+  m_PhiBinnerStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::PhiBinner>(ctx);
 
   static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
   static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));
@@ -140,7 +139,7 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
 
   // transfer view
   if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
-    cms::cuda::copyAsync(m_view, view, stream);
+    cms::cuda::copyAsync(m_view, view, ctx.stream());
   } else {
     m_view.reset(view.release());  // NOLINT: std::move() breaks CUDA version
   }
diff --git a/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc b/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc
index b9633a856..819ecb4d7 100644
--- a/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc
+++ b/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc
@@ -32,7 +32,7 @@ BeamSpotToCUDA::BeamSpotToCUDA(edm::ProductRegistry& reg)
 void BeamSpotToCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) {
   *bsHost = iSetup.get<BeamSpotPOD>();
 
-  BeamSpotCUDA bsDevice(ctx.stream());
+  BeamSpotCUDA bsDevice(ctx);
   cms::cuda::copyAsync(bsDevice.ptr(), bsHost, ctx.stream());
 
   ctx.emplace(iEvent, bsPutToken_, std::move(bsDevice));
diff --git a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc
index c9e0cc5d9..e3b525f28 100644
--- a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc
+++ b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc
@@ -35,7 +35,7 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
                                     cms::cuda::AcquireContext& ctx) {
   auto const& inputData = ctx.get(iEvent, tokenCUDA_);
 
-  soa_ = inputData.toHostAsync(ctx.stream());
+  soa_ = inputData.toHostAsync(ctx);
 }
 
 void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) {
diff --git a/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cu b/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cu
index deadb1a3c..0dcbb9b72 100644
--- a/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cu
+++ b/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cu
@@ -1,10 +1,10 @@
 #include "BrokenLineFitOnGPU.h"
 #include "CUDACore/device_unique_ptr.h"
 
-void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,
+void HelixFitOnGPU::launchBrokenLineKernels(HitsView const* hv,
                                             uint32_t hitsInFit,
                                             uint32_t maxNumberOfTuples,
-                                            cudaStream_t stream) {
+                                            cms::cuda::Context const& ctx) {
   assert(tuples_);
 
   auto blockSize = 64;
@@ -12,72 +12,72 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,
 
   //  Fit internals
   auto hitsGPU_ = cms::cuda::make_device_unique<double[]>(
-      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), stream);
+      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), ctx);
   auto hits_geGPU_ = cms::cuda::make_device_unique<float[]>(
-      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), stream);
+      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), ctx);
   auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique<double[]>(
-      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream);
+      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), ctx);
 
   for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
     // fit triplets
-    kernel_BLFastFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(
+    kernel_BLFastFit<3><<<numberOfBlocks, blockSize, 0, ctx.stream()>>>(
         tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset);
     cudaCheck(cudaGetLastError());
 
-    kernel_BLFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(tupleMultiplicity_,
-                                                              bField_,
-                                                              outputSoa_,
-                                                              hitsGPU_.get(),
-                                                              hits_geGPU_.get(),
-                                                              fast_fit_resultsGPU_.get(),
-                                                              3,
-                                                              offset);
+    kernel_BLFit<3><<<numberOfBlocks, blockSize, 0, ctx.stream()>>>(tupleMultiplicity_,
+                                                                    bField_,
+                                                                    outputSoa_,
+                                                                    hitsGPU_.get(),
+                                                                    hits_geGPU_.get(),
+                                                                    fast_fit_resultsGPU_.get(),
+                                                                    3,
+                                                                    offset);
     cudaCheck(cudaGetLastError());
 
     // fit quads
-    kernel_BLFastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
+    kernel_BLFastFit<4><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(
         tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset);
     cudaCheck(cudaGetLastError());
 
-    kernel_BLFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
-                                                                  bField_,
-                                                                  outputSoa_,
-                                                                  hitsGPU_.get(),
-                                                                  hits_geGPU_.get(),
-                                                                  fast_fit_resultsGPU_.get(),
-                                                                  4,
-                                                                  offset);
+    kernel_BLFit<4><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(tupleMultiplicity_,
+                                                                        bField_,
+                                                                        outputSoa_,
+                                                                        hitsGPU_.get(),
+                                                                        hits_geGPU_.get(),
+                                                                        fast_fit_resultsGPU_.get(),
+                                                                        4,
+                                                                        offset);
     cudaCheck(cudaGetLastError());
 
     if (fit5as4_) {
       // fit penta (only first 4)
-      kernel_BLFastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
+      kernel_BLFastFit<4><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(
           tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset);
       cudaCheck(cudaGetLastError());
 
-      kernel_BLFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
-                                                                    bField_,
-                                                                    outputSoa_,
-                                                                    hitsGPU_.get(),
-                                                                    hits_geGPU_.get(),
-                                                                    fast_fit_resultsGPU_.get(),
-                                                                    5,
-                                                                    offset);
+      kernel_BLFit<4><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(tupleMultiplicity_,
+                                                                          bField_,
+                                                                          outputSoa_,
+                                                                          hitsGPU_.get(),
+                                                                          hits_geGPU_.get(),
+                                                                          fast_fit_resultsGPU_.get(),
+                                                                          5,
+                                                                          offset);
       cudaCheck(cudaGetLastError());
     } else {
       // fit penta (all 5)
-      kernel_BLFastFit<5><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
+      kernel_BLFastFit<5><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(
           tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset);
       cudaCheck(cudaGetLastError());
 
-      kernel_BLFit<5><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
-                                                                    bField_,
-                                                                    outputSoa_,
-                                                                    hitsGPU_.get(),
-                                                                    hits_geGPU_.get(),
-                                                                    fast_fit_resultsGPU_.get(),
-                                                                    5,
-                                                                    offset);
+      kernel_BLFit<5><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(tupleMultiplicity_,
+                                                                          bField_,
+                                                                          outputSoa_,
+                                                                          hitsGPU_.get(),
+                                                                          hits_geGPU_.get(),
+                                                                          fast_fit_resultsGPU_.get(),
+                                                                          5,
+                                                                          offset);
       cudaCheck(cudaGetLastError());
     }
 
diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc
index aca6bd319..c1d7be1a3 100644
--- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc
+++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc
@@ -35,7 +35,7 @@ void CAHitNtupletCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es, cm
 
   auto const& hits = ctx.get(iEvent, tokenHitGPU_);
 
-  ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream()));
+  ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx));
 }
 
 DEFINE_FWK_MODULE(CAHitNtupletCUDA);
diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc
index f2805d018..9e6f59a53 100644
--- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc
+++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc
@@ -11,7 +11,7 @@ void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSo
 }
 
 template <>
-void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
+void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cms::cuda::Context const &ctx) {
   auto nhits = hh.nHits();
 
 #ifdef NTUPLE_DEBUG
diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu
index edc1eb49b..620ce551e 100644
--- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu
+++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu
@@ -137,7 +137,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
 }
 
 template <>
-void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
+void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cms::cuda::Context const &ctx) {
   int32_t nhits = hh.nHits();
 
 #ifdef NTUPLE_DEBUG
@@ -150,13 +150,13 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
 #endif
 
   // in principle we can use "nhits" to heuristically dimension the workspace...
-  device_isOuterHitOfCell_ = cms::cuda::make_device_unique<GPUCACell::OuterHitOfCell[]>(std::max(1, nhits), stream);
+  device_isOuterHitOfCell_ = cms::cuda::make_device_unique<GPUCACell::OuterHitOfCell[]>(std::max(1, nhits), ctx);
   assert(device_isOuterHitOfCell_.get());
 
   cellStorage_ = cms::cuda::make_device_unique<unsigned char[]>(
       caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
           caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks),
-      stream);
+      ctx);
   device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
   device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets *
                                                                                       sizeof(GPUCACell::CellNeighbors));
@@ -165,16 +165,16 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
     int threadsPerBlock = 128;
     // at least one block!
     int blocks = (std::max(1, nhits) + threadsPerBlock - 1) / threadsPerBlock;
-    gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(device_isOuterHitOfCell_.get(),
-                                                                           nhits,
-                                                                           device_theCellNeighbors_.get(),
-                                                                           device_theCellNeighborsContainer_,
-                                                                           device_theCellTracks_.get(),
-                                                                           device_theCellTracksContainer_);
+    gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, ctx.stream()>>>(device_isOuterHitOfCell_.get(),
+                                                                                 nhits,
+                                                                                 device_theCellNeighbors_.get(),
+                                                                                 device_theCellNeighborsContainer_,
+                                                                                 device_theCellTracks_.get(),
+                                                                                 device_theCellTracksContainer_);
     cudaCheck(cudaGetLastError());
   }
 
-  device_theCells_ = cms::cuda::make_device_unique<GPUCACell[]>(params_.maxNumberOfDoublets_, stream);
+  device_theCells_ = cms::cuda::make_device_unique<GPUCACell[]>(params_.maxNumberOfDoublets_, ctx);
 
 #ifdef GPU_DEBUG
   cudaDeviceSynchronize();
@@ -201,18 +201,18 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
   int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock;
   dim3 blks(1, blocks, 1);
   dim3 thrs(stride, threadsPerBlock, 1);
-  gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(device_theCells_.get(),
-                                                                    device_nCells_,
-                                                                    device_theCellNeighbors_.get(),
-                                                                    device_theCellTracks_.get(),
-                                                                    hh.view(),
-                                                                    device_isOuterHitOfCell_.get(),
-                                                                    nActualPairs,
-                                                                    params_.idealConditions_,
-                                                                    params_.doClusterCut_,
-                                                                    params_.doZ0Cut_,
-                                                                    params_.doPtCut_,
-                                                                    params_.maxNumberOfDoublets_);
+  gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, ctx.stream()>>>(device_theCells_.get(),
+                                                                          device_nCells_,
+                                                                          device_theCellNeighbors_.get(),
+                                                                          device_theCellTracks_.get(),
+                                                                          hh.view(),
+                                                                          device_isOuterHitOfCell_.get(),
+                                                                          nActualPairs,
+                                                                          params_.idealConditions_,
+                                                                          params_.doClusterCut_,
+                                                                          params_.doZ0Cut_,
+                                                                          params_.doPtCut_,
+                                                                          params_.maxNumberOfDoublets_);
   cudaCheck(cudaGetLastError());
 
 #ifdef GPU_DEBUG
diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h
index dd87597a4..c57f97456 100644
--- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h
+++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h
@@ -180,8 +180,8 @@ class CAHitNtupletGeneratorKernels {
 
   void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream);
 
-  void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream);
-  void allocateOnGPU(int32_t nHits, cudaStream_t stream);
+  void buildDoublets(HitsOnCPU const& hh, cms::cuda::Context const& ctx);
+  void allocateOnGPU(int32_t nHits, cms::cuda::Context const& ctx);
   void cleanup(cudaStream_t cudaStream);
 
   static void printCounters(Counters const* counters);
diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h
index 929677a44..1f8c3f931 100644
--- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h
+++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h
@@ -4,16 +4,16 @@
 
 template <>
 #ifdef __CUDACC__
-void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) {
+void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(int32_t nHits, cms::cuda::Context const& ctx) {
 #else
-void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) {
+void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cms::cuda::Context const& ctx) {
 #endif
   //////////////////////////////////////////////////////////
   // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER)
   //////////////////////////////////////////////////////////
 
-  device_theCellNeighbors_ = Traits::template make_unique<caConstants::CellNeighborsVector>(stream);
-  device_theCellTracks_ = Traits::template make_unique<caConstants::CellTracksVector>(stream);
+  device_theCellNeighbors_ = Traits::template make_unique<caConstants::CellNeighborsVector>(ctx);
+  device_theCellTracks_ = Traits::template make_unique<caConstants::CellTracksVector>(ctx);
 
 #ifdef GPU_DEBUG
   std::cout << "Allocation for tuple building. N hits " << nHits << std::endl;
@@ -21,15 +21,15 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t
 
   nHits++;  // storage requires one more counter;
   assert(nHits > 0);
-  device_hitToTuple_ = Traits::template make_unique<HitToTuple>(stream);
-  device_hitToTupleStorage_ = Traits::template make_unique<HitToTuple::Counter[]>(nHits, stream);
+  device_hitToTuple_ = Traits::template make_unique<HitToTuple>(ctx);
+  device_hitToTupleStorage_ = Traits::template make_unique<HitToTuple::Counter[]>(nHits, ctx);
   hitToTupleView_.assoc = device_hitToTuple_.get();
   hitToTupleView_.offStorage = device_hitToTupleStorage_.get();
   hitToTupleView_.offSize = nHits;
 
-  device_tupleMultiplicity_ = Traits::template make_unique<TupleMultiplicity>(stream);
+  device_tupleMultiplicity_ = Traits::template make_unique<TupleMultiplicity>(ctx);
 
-  device_storage_ = Traits::template make_unique<cms::cuda::AtomicPairCounter::c_type[]>(3, stream);
+  device_storage_ = Traits::template make_unique<cms::cuda::AtomicPairCounter::c_type[]>(3, ctx);
 
   device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get();
   device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1;
@@ -37,12 +37,12 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t
 
   // FIXME: consider collapsing these 3 in one adhoc kernel
   if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
-    cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream));
+    cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), ctx.stream()));
   } else {
     *device_nCells_ = 0;
   }
-  cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream);
-  cms::cuda::launchZero(hitToTupleView_, stream);  // we may wish to keep it in the edm
+  cms::cuda::launchZero(device_tupleMultiplicity_.get(), ctx.stream());
+  cms::cuda::launchZero(hitToTupleView_, ctx.stream());  // we may wish to keep it in the edm
 #ifdef GPU_DEBUG
   cudaDeviceSynchronize();
   cudaCheck(cudaGetLastError());
diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc
index 714748cc1..7bddc5322 100644
--- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc
+++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc
@@ -112,28 +112,28 @@ CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU() {
 
 PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d,
                                                                     float bfield,
-                                                                    cudaStream_t stream) const {
-  PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique<pixelTrack::TrackSoA>(stream));
+                                                                    cms::cuda::Context const& ctx) const {
+  PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique<pixelTrack::TrackSoA>(ctx));
 
   auto* soa = tracks.get();
   assert(soa);
 
   CAHitNtupletGeneratorKernelsGPU kernels(m_params);
   kernels.setCounters(m_counters);
-  kernels.allocateOnGPU(hits_d.nHits(), stream);
+  kernels.allocateOnGPU(hits_d.nHits(), ctx);
 
-  kernels.buildDoublets(hits_d, stream);
-  kernels.launchKernels(hits_d, soa, stream);
-  kernels.fillHitDetIndices(hits_d.view(), soa, stream);  // in principle needed only if Hits not "available"
+  kernels.buildDoublets(hits_d, ctx);
+  kernels.launchKernels(hits_d, soa, ctx.stream());
+  kernels.fillHitDetIndices(hits_d.view(), soa, ctx.stream());  // in principle needed only if Hits not "available"
 
   HelixFitOnGPU fitter(bfield, m_params.fit5as4_);
   fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa);
   if (m_params.useRiemannFit_) {
-    fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream);
+    fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, ctx);
   } else {
-    fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream);
+    fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, ctx);
   }
-  kernels.classifyTuples(hits_d, soa, stream);
+  kernels.classifyTuples(hits_d, soa, ctx.stream());
 
 #ifdef GPU_DEBUG
   cudaDeviceSynchronize();
@@ -152,9 +152,10 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC
 
   CAHitNtupletGeneratorKernelsCPU kernels(m_params);
   kernels.setCounters(m_counters);
-  kernels.allocateOnGPU(hits_d.nHits(), nullptr);
+  cms::cuda::Context dummy{nullptr};
+  kernels.allocateOnGPU(hits_d.nHits(), dummy);
 
-  kernels.buildDoublets(hits_d, nullptr);
+  kernels.buildDoublets(hits_d, dummy);
   kernels.launchKernels(hits_d, soa, nullptr);
   kernels.fillHitDetIndices(hits_d.view(), soa, nullptr);  // in principle needed only if Hits not "available"
 
diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h
index f42bb301b..c6b10c574 100644
--- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h
+++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h
@@ -37,7 +37,9 @@ class CAHitNtupletGeneratorOnGPU {
 
   ~CAHitNtupletGeneratorOnGPU();
 
-  PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const;
+  PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d,
+                                          float bfield,
+                                          cms::cuda::Context const& ctx) const;
 
   PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const;
 
diff --git a/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h b/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h
index fee0f8dae..d74599bf1 100644
--- a/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h
+++ b/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h
@@ -44,8 +44,14 @@ class HelixFitOnGPU {
   ~HelixFitOnGPU() { deallocateOnGPU(); }
 
   void setBField(double bField) { bField_ = bField; }
-  void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream);
-  void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream);
+  void launchRiemannKernels(HitsView const *hv,
+                            uint32_t nhits,
+                            uint32_t maxNumberOfTuples,
+                            cms::cuda::Context const &ctx);
+  void launchBrokenLineKernels(HitsView const *hv,
+                               uint32_t nhits,
+                               uint32_t maxNumberOfTuples,
+                               cms::cuda::Context const &ctx);
 
   void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
   void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
diff --git a/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cu b/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cu
index 8e6061611..ba5514a52 100644
--- a/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cu
+++ b/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cu
@@ -4,7 +4,7 @@
 void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv,
                                          uint32_t nhits,
                                          uint32_t maxNumberOfTuples,
-                                         cudaStream_t stream) {
+                                         cms::cuda::Context const &ctx) {
   assert(tuples_);
 
   auto blockSize = 64;
@@ -12,50 +12,35 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv,
 
   //  Fit internals
   auto hitsGPU = cms::cuda::make_device_unique<double[]>(
-      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), stream);
+      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), ctx);
   auto hits_geGPU = cms::cuda::make_device_unique<float[]>(
-      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), stream);
+      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), ctx);
   auto fast_fit_resultsGPU = cms::cuda::make_device_unique<double[]>(
-      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream);
+      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), ctx);
   auto circle_fit_resultsGPU_holder =
-      cms::cuda::make_device_unique<char[]>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit), stream);
+      cms::cuda::make_device_unique<char[]>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit), ctx);
   riemannFit::CircleFit *circle_fit_resultsGPU_ = (riemannFit::CircleFit *)(circle_fit_resultsGPU_holder.get());
 
   for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
     // triplets
-    kernel_FastFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(
+    kernel_FastFit<3><<<numberOfBlocks, blockSize, 0, ctx.stream()>>>(
         tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
     cudaCheck(cudaGetLastError());
 
-    kernel_CircleFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(tupleMultiplicity_,
-                                                                  3,
-                                                                  bField_,
-                                                                  hitsGPU.get(),
-                                                                  hits_geGPU.get(),
-                                                                  fast_fit_resultsGPU.get(),
-                                                                  circle_fit_resultsGPU_,
-                                                                  offset);
-    cudaCheck(cudaGetLastError());
-
-    kernel_LineFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(tupleMultiplicity_,
-                                                                3,
-                                                                bField_,
-                                                                outputSoa_,
-                                                                hitsGPU.get(),
-                                                                hits_geGPU.get(),
-                                                                fast_fit_resultsGPU.get(),
-                                                                circle_fit_resultsGPU_,
-                                                                offset);
-    cudaCheck(cudaGetLastError());
-
-    // quads
-    kernel_FastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
-        tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
+    kernel_CircleFit<3><<<numberOfBlocks, blockSize, 0, ctx.stream()>>>(tupleMultiplicity_,
+                                                                        3,
+                                                                        bField_,
+                                                                        hitsGPU.get(),
+                                                                        hits_geGPU.get(),
+                                                                        fast_fit_resultsGPU.get(),
+                                                                        circle_fit_resultsGPU_,
+                                                                        offset);
     cudaCheck(cudaGetLastError());
 
-    kernel_CircleFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
-                                                                      4,
+    kernel_LineFit<3><<<numberOfBlocks, blockSize, 0, ctx.stream()>>>(tupleMultiplicity_,
+                                                                      3,
                                                                       bField_,
+                                                                      outputSoa_,
                                                                       hitsGPU.get(),
                                                                       hits_geGPU.get(),
                                                                       fast_fit_resultsGPU.get(),
@@ -63,68 +48,83 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv,
                                                                       offset);
     cudaCheck(cudaGetLastError());
 
-    kernel_LineFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
-                                                                    4,
-                                                                    bField_,
-                                                                    outputSoa_,
-                                                                    hitsGPU.get(),
-                                                                    hits_geGPU.get(),
-                                                                    fast_fit_resultsGPU.get(),
-                                                                    circle_fit_resultsGPU_,
-                                                                    offset);
+    // quads
+    kernel_FastFit<4><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(
+        tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
+    cudaCheck(cudaGetLastError());
+
+    kernel_CircleFit<4><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(tupleMultiplicity_,
+                                                                            4,
+                                                                            bField_,
+                                                                            hitsGPU.get(),
+                                                                            hits_geGPU.get(),
+                                                                            fast_fit_resultsGPU.get(),
+                                                                            circle_fit_resultsGPU_,
+                                                                            offset);
+    cudaCheck(cudaGetLastError());
+
+    kernel_LineFit<4><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(tupleMultiplicity_,
+                                                                          4,
+                                                                          bField_,
+                                                                          outputSoa_,
+                                                                          hitsGPU.get(),
+                                                                          hits_geGPU.get(),
+                                                                          fast_fit_resultsGPU.get(),
+                                                                          circle_fit_resultsGPU_,
+                                                                          offset);
     cudaCheck(cudaGetLastError());
 
     if (fit5as4_) {
       // penta
-      kernel_FastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
+      kernel_FastFit<4><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(
           tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
       cudaCheck(cudaGetLastError());
 
-      kernel_CircleFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
-                                                                        5,
-                                                                        bField_,
-                                                                        hitsGPU.get(),
-                                                                        hits_geGPU.get(),
-                                                                        fast_fit_resultsGPU.get(),
-                                                                        circle_fit_resultsGPU_,
-                                                                        offset);
+      kernel_CircleFit<4><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(tupleMultiplicity_,
+                                                                              5,
+                                                                              bField_,
+                                                                              hitsGPU.get(),
+                                                                              hits_geGPU.get(),
+                                                                              fast_fit_resultsGPU.get(),
+                                                                              circle_fit_resultsGPU_,
+                                                                              offset);
       cudaCheck(cudaGetLastError());
 
-      kernel_LineFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
-                                                                      5,
-                                                                      bField_,
-                                                                      outputSoa_,
-                                                                      hitsGPU.get(),
-                                                                      hits_geGPU.get(),
-                                                                      fast_fit_resultsGPU.get(),
-                                                                      circle_fit_resultsGPU_,
-                                                                      offset);
+      kernel_LineFit<4><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(tupleMultiplicity_,
+                                                                            5,
+                                                                            bField_,
+                                                                            outputSoa_,
+                                                                            hitsGPU.get(),
+                                                                            hits_geGPU.get(),
+                                                                            fast_fit_resultsGPU.get(),
+                                                                            circle_fit_resultsGPU_,
+                                                                            offset);
       cudaCheck(cudaGetLastError());
     } else {
       // penta all 5
-      kernel_FastFit<5><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
+      kernel_FastFit<5><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(
           tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
       cudaCheck(cudaGetLastError());
 
-      kernel_CircleFit<5><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
-                                                                        5,
-                                                                        bField_,
-                                                                        hitsGPU.get(),
-                                                                        hits_geGPU.get(),
-                                                                        fast_fit_resultsGPU.get(),
-                                                                        circle_fit_resultsGPU_,
-                                                                        offset);
+      kernel_CircleFit<5><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(tupleMultiplicity_,
+                                                                              5,
+                                                                              bField_,
+                                                                              hitsGPU.get(),
+                                                                              hits_geGPU.get(),
+                                                                              fast_fit_resultsGPU.get(),
+                                                                              circle_fit_resultsGPU_,
+                                                                              offset);
       cudaCheck(cudaGetLastError());
 
-      kernel_LineFit<5><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
-                                                                      5,
-                                                                      bField_,
-                                                                      outputSoa_,
-                                                                      hitsGPU.get(),
-                                                                      hits_geGPU.get(),
-                                                                      fast_fit_resultsGPU.get(),
-                                                                      circle_fit_resultsGPU_,
-                                                                      offset);
+      kernel_LineFit<5><<<numberOfBlocks / 4, blockSize, 0, ctx.stream()>>>(tupleMultiplicity_,
+                                                                            5,
+                                                                            bField_,
+                                                                            outputSoa_,
+                                                                            hitsGPU.get(),
+                                                                            hits_geGPU.get(),
+                                                                            fast_fit_resultsGPU.get(),
+                                                                            circle_fit_resultsGPU_,
+                                                                            offset);
       cudaCheck(cudaGetLastError());
     }
   }
diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc
index d4ca2196b..c9334747c 100644
--- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc
+++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc
@@ -63,7 +63,7 @@ void PixelVertexProducerCUDA::produceOnGPU(edm::Event& iEvent, const edm::EventS
 
     assert(tracks);
 
-    ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_));
+    ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx, tracks, ptMin_));
   });
 }
 
diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc
index 4f38c0ad8..454e6b32a 100644
--- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc
+++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc
@@ -33,7 +33,7 @@ void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent,
                                      cms::cuda::AcquireContext& ctx) {
   auto const& inputData = ctx.get(iEvent, tokenCUDA_);
 
-  m_soa = inputData.toHostAsync(ctx.stream());
+  m_soa = inputData.toHostAsync(ctx);
 }
 
 void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) {
diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cc b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cc
index 608d4efc0..c05e31dde 100644
--- a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cc
+++ b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cc
@@ -93,11 +93,11 @@ namespace gpuVertexFinder {
 #endif
 
 #ifdef __CUDACC__
-  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const {
+  ZVertexHeterogeneous Producer::makeAsync(cms::cuda::Context const& ctx, TkSoA const* tksoa, float ptMin) const {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "producing Vertices on GPU" << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
-    ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
+    ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(ctx));
 #else
   ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
@@ -110,16 +110,16 @@ namespace gpuVertexFinder {
     assert(soa);
 
 #ifdef __CUDACC__
-    auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
+    auto ws_d = cms::cuda::make_device_unique<WorkSpace>(ctx);
 #else
     auto ws_d = std::make_unique<WorkSpace>();
 #endif
 
 #ifdef __CUDACC__
-    init<<<1, 1, 0, stream>>>(soa, ws_d.get());
+    init<<<1, 1, 0, ctx.stream()>>>(soa, ws_d.get());
     auto blockSize = 128;
     auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
-    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
+    loadTracks<<<numberOfBlocks, blockSize, 0, ctx.stream()>>>(tksoa, soa, ws_d.get(), ptMin);
     cudaCheck(cudaGetLastError());
 #else
     init(soa, ws_d.get());
@@ -135,32 +135,33 @@ namespace gpuVertexFinder {
     if (oneKernel_) {
       // implemented only for density clustesrs
 #ifndef THREE_KERNELS
-      vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
 #else
-      vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      vertexFinderKernel1<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
       cudaCheck(cudaGetLastError());
       // one block per vertex...
-      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, ctx.stream()>>>(soa, ws_d.get(), maxChi2ForSplit);
       cudaCheck(cudaGetLastError());
-      vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
+      vertexFinderKernel2<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get());
 #endif
     } else {  // five kernels
       if (useDensity_) {
-        clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+        clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(
+            soa, ws_d.get(), minT, eps, errmax, chi2max);
       } else if (useDBSCAN_) {
-        clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+        clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
       } else if (useIterative_) {
-        clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+        clusterTracksIterative<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
       }
       cudaCheck(cudaGetLastError());
-      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit);
+      fitVerticesKernel<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get(), maxChi2ForFirstFit);
       cudaCheck(cudaGetLastError());
       // one block per vertex...
-      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, ctx.stream()>>>(soa, ws_d.get(), maxChi2ForSplit);
       cudaCheck(cudaGetLastError());
-      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit);
+      fitVerticesKernel<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get(), maxChi2ForFinalFit);
       cudaCheck(cudaGetLastError());
-      sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
+      sortByPt2Kernel<<<1, maxThreadsForPrint, 0, ctx.stream()>>>(soa, ws_d.get());
     }
     cudaCheck(cudaGetLastError());
 #else  // __CUDACC__
diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.h b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.h
index b9b8b35d7..8e569654e 100644
--- a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.h
+++ b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.h
@@ -63,7 +63,7 @@ namespace gpuVertexFinder {
 
     ~Producer() = default;
 
-    ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const;
+    ZVertexHeterogeneous makeAsync(cms::cuda::Context const& ctx, TkSoA const* tksoa, float ptMin) const;
     ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const;
 
   private:
diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
index 4f06bf540..17674d46e 100644
--- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
+++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
@@ -160,7 +160,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
                              useQuality_,
                              includeErrors_,
                              false,  // debug
-                             ctx.stream());
+                             ctx);
 }
 
 void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent,
diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu
index aaa72c5e0..dc8591063 100644
--- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu
+++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu
@@ -509,20 +509,20 @@ namespace pixelgpudetails {
                                                        bool useQualityInfo,
                                                        bool includeErrors,
                                                        bool debug,
-                                                       cudaStream_t stream) {
+                                                       cms::cuda::Context const &ctx) {
     nDigis = wordCounter;
 
 #ifdef GPU_DEBUG
     std::cout << "decoding " << wordCounter << " digis. Max is " << pixelgpudetails::MAX_FED_WORDS << std::endl;
 #endif
 
-    digis_d = SiPixelDigisCUDA(pixelgpudetails::MAX_FED_WORDS, stream);
+    digis_d = SiPixelDigisCUDA(pixelgpudetails::MAX_FED_WORDS, ctx);
     if (includeErrors) {
-      digiErrors_d = SiPixelDigiErrorsCUDA(pixelgpudetails::MAX_FED_WORDS, std::move(errors), stream);
+      digiErrors_d = SiPixelDigiErrorsCUDA(pixelgpudetails::MAX_FED_WORDS, std::move(errors), ctx);
     }
-    clusters_d = SiPixelClustersCUDA(gpuClustering::maxNumModules, stream);
+    clusters_d = SiPixelClustersCUDA(gpuClustering::maxNumModules, ctx);
 
-    nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(2, stream);
+    nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(2, ctx);
 
     if (wordCounter)  // protect in case of empty event....
     {
@@ -531,16 +531,16 @@ namespace pixelgpudetails {
 
       assert(0 == wordCounter % 2);
       // wordCounter is the total no of words in each event to be trasfered on device
-      auto word_d = cms::cuda::make_device_unique<uint32_t[]>(wordCounter, stream);
-      auto fedId_d = cms::cuda::make_device_unique<uint8_t[]>(wordCounter, stream);
+      auto word_d = cms::cuda::make_device_unique<uint32_t[]>(wordCounter, ctx);
+      auto fedId_d = cms::cuda::make_device_unique<uint8_t[]>(wordCounter, ctx);
 
-      cudaCheck(
-          cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter * sizeof(uint32_t), cudaMemcpyDefault, stream));
       cudaCheck(cudaMemcpyAsync(
-          fedId_d.get(), wordFed.fedId(), wordCounter * sizeof(uint8_t) / 2, cudaMemcpyDefault, stream));
+          word_d.get(), wordFed.word(), wordCounter * sizeof(uint32_t), cudaMemcpyDefault, ctx.stream()));
+      cudaCheck(cudaMemcpyAsync(
+          fedId_d.get(), wordFed.fedId(), wordCounter * sizeof(uint8_t) / 2, cudaMemcpyDefault, ctx.stream()));
 
       // Launch rawToDigi kernel
-      RawToDigi_kernel<<<blocks, threadsPerBlock, 0, stream>>>(
+      RawToDigi_kernel<<<blocks, threadsPerBlock, 0, ctx.stream()>>>(
           cablingMap,
           modToUnp,
           wordCounter,
@@ -563,7 +563,7 @@ namespace pixelgpudetails {
 #endif
 
       if (includeErrors) {
-        digiErrors_d.copyErrorToHostAsync(stream);
+        digiErrors_d.copyErrorToHostAsync(ctx);
       }
     }
     // End of Raw2Digi and passing data for clustering
@@ -575,16 +575,16 @@ namespace pixelgpudetails {
       int blocks =
           (std::max(int(wordCounter), int(gpuClustering::maxNumModules)) + threadsPerBlock - 1) / threadsPerBlock;
 
-      gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, stream>>>(isRun2,
-                                                                        digis_d.moduleInd(),
-                                                                        digis_d.xx(),
-                                                                        digis_d.yy(),
-                                                                        digis_d.adc(),
-                                                                        gains,
-                                                                        wordCounter,
-                                                                        clusters_d.moduleStart(),
-                                                                        clusters_d.clusInModule(),
-                                                                        clusters_d.clusModuleStart());
+      gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, ctx.stream()>>>(isRun2,
+                                                                              digis_d.moduleInd(),
+                                                                              digis_d.xx(),
+                                                                              digis_d.yy(),
+                                                                              digis_d.adc(),
+                                                                              gains,
+                                                                              wordCounter,
+                                                                              clusters_d.moduleStart(),
+                                                                              clusters_d.clusInModule(),
+                                                                              clusters_d.clusModuleStart());
       cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
       cudaDeviceSynchronize();
@@ -596,27 +596,27 @@ namespace pixelgpudetails {
                 << " threads\n";
 #endif
 
-      countModules<<<blocks, threadsPerBlock, 0, stream>>>(
+      countModules<<<blocks, threadsPerBlock, 0, ctx.stream()>>>(
           digis_d.moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter);
       cudaCheck(cudaGetLastError());
 
       // read the number of modules into a data member, used by getProduct())
       cudaCheck(cudaMemcpyAsync(
-          &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream));
+          &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, ctx.stream()));
 
       threadsPerBlock = 256;
       blocks = maxNumModules;
 #ifdef GPU_DEBUG
       std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
 #endif
-      findClus<<<blocks, threadsPerBlock, 0, stream>>>(digis_d.moduleInd(),
-                                                       digis_d.xx(),
-                                                       digis_d.yy(),
-                                                       clusters_d.moduleStart(),
-                                                       clusters_d.clusInModule(),
-                                                       clusters_d.moduleId(),
-                                                       digis_d.clus(),
-                                                       wordCounter);
+      findClus<<<blocks, threadsPerBlock, 0, ctx.stream()>>>(digis_d.moduleInd(),
+                                                             digis_d.xx(),
+                                                             digis_d.yy(),
+                                                             clusters_d.moduleStart(),
+                                                             clusters_d.clusInModule(),
+                                                             clusters_d.moduleId(),
+                                                             digis_d.clus(),
+                                                             wordCounter);
       cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
       cudaDeviceSynchronize();
@@ -624,14 +624,14 @@ namespace pixelgpudetails {
 #endif
 
       // apply charge cut
-      clusterChargeCut<<<blocks, threadsPerBlock, 0, stream>>>(clusterThresholds,
-                                                               digis_d.moduleInd(),
-                                                               digis_d.adc(),
-                                                               clusters_d.moduleStart(),
-                                                               clusters_d.clusInModule(),
-                                                               clusters_d.moduleId(),
-                                                               digis_d.clus(),
-                                                               wordCounter);
+      clusterChargeCut<<<blocks, threadsPerBlock, 0, ctx.stream()>>>(clusterThresholds,
+                                                                     digis_d.moduleInd(),
+                                                                     digis_d.adc(),
+                                                                     clusters_d.moduleStart(),
+                                                                     clusters_d.clusInModule(),
+                                                                     clusters_d.moduleId(),
+                                                                     digis_d.clus(),
+                                                                     wordCounter);
       cudaCheck(cudaGetLastError());
 
       // count the module start indices already here (instead of
@@ -640,14 +640,14 @@ namespace pixelgpudetails {
       // synchronization/ExternalWork
 
       // MUST be ONE block
-      fillHitsModuleStart<<<1, 1024, 0, stream>>>(clusters_d.clusInModule(), clusters_d.clusModuleStart());
+      fillHitsModuleStart<<<1, 1024, 0, ctx.stream()>>>(clusters_d.clusInModule(), clusters_d.clusModuleStart());
 
       // last element holds the number of all clusters
       cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]),
                                 clusters_d.clusModuleStart() + gpuClustering::maxNumModules,
                                 sizeof(uint32_t),
                                 cudaMemcpyDefault,
-                                stream));
+                                ctx.stream()));
 
 #ifdef GPU_DEBUG
       cudaDeviceSynchronize();
diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h
index 04e8b99b9..ec70894c4 100644
--- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h
+++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h
@@ -151,7 +151,7 @@ namespace pixelgpudetails {
                            bool useQualityInfo,
                            bool includeErrors,
                            bool debug,
-                           cudaStream_t stream);
+                           const cms::cuda::Context& ctx);
 
     std::pair<SiPixelDigisCUDA, SiPixelClustersCUDA> getResults() {
       digis_d.setNModulesDigis(nModules_Clusters_h[0], nDigis);
diff --git a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc
index 8d1f8b3c4..bd07cf430 100644
--- a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc
+++ b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc
@@ -37,10 +37,10 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent,
   const auto& gpuDigis = ctx.get(iEvent, digiGetToken_);
 
   nDigis_ = gpuDigis.nDigis();
-  pdigi_ = gpuDigis.pdigiToHostAsync(ctx.stream());
-  rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx.stream());
-  adc_ = gpuDigis.adcToHostAsync(ctx.stream());
-  clus_ = gpuDigis.clusToHostAsync(ctx.stream());
+  pdigi_ = gpuDigis.pdigiToHostAsync(ctx);
+  rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx);
+  adc_ = gpuDigis.adcToHostAsync(ctx);
+  clus_ = gpuDigis.clusToHostAsync(ctx);
 }
 
 void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) {
diff --git a/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu
index ba62da1b5..520a395ef 100644
--- a/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu
+++ b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu
@@ -37,9 +37,9 @@ namespace pixelgpudetails {
                                                            SiPixelClustersCUDA const& clusters_d,
                                                            BeamSpotCUDA const& bs_d,
                                                            pixelCPEforGPU::ParamsOnGPU const* cpeParams,
-                                                           cudaStream_t stream) const {
+                                                           cms::cuda::Context const& ctx) const {
     auto nHits = clusters_d.nClusters();
-    TrackingRecHit2DCUDA hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), stream);
+    TrackingRecHit2DCUDA hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), ctx);
 
     int threadsPerBlock = 128;
     int blocks = digis_d.nModules();  // active modules (with digis)
@@ -49,7 +49,7 @@ namespace pixelgpudetails {
 #endif
     // protect from empty events
     if (blocks) {
-      gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream>>>(
+      gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, ctx.stream()>>>(
           cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view());
       cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
@@ -59,11 +59,17 @@ namespace pixelgpudetails {
 
     // assuming full warp of threads is better than a smaller number...
     if (nHits) {
-      setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart());
+      setHitsLayerStart<<<1, 32, 0, ctx.stream()>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart());
       cudaCheck(cudaGetLastError());
 
-      cms::cuda::fillManyFromVector(
-          hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, hits_d.phiBinnerStorage(), stream);
+      cms::cuda::fillManyFromVector(hits_d.phiBinner(),
+                                    10,
+                                    hits_d.iphi(),
+                                    hits_d.hitsLayerStart(),
+                                    nHits,
+                                    256,
+                                    hits_d.phiBinnerStorage(),
+                                    ctx.stream());
       cudaCheck(cudaGetLastError());
 
 #ifdef GPU_DEBUG
diff --git a/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.h b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.h
index 7b0a38a15..f4735489d 100644
--- a/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.h
+++ b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.h
@@ -26,7 +26,7 @@ namespace pixelgpudetails {
                                        SiPixelClustersCUDA const& clusters_d,
                                        BeamSpotCUDA const& bs_d,
                                        pixelCPEforGPU::ParamsOnGPU const* cpeParams,
-                                       cudaStream_t stream) const;
+                                       cms::cuda::Context const& ctx) const;
   };
 }  // namespace pixelgpudetails
 
diff --git a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc
index 133fa8eaf..c363a6a67 100644
--- a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc
+++ b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc
@@ -42,9 +42,8 @@ void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es, c
   auto const& digis = ctx.get(iEvent, tokenDigi_);
   auto const& bs = ctx.get(iEvent, tBeamSpot);
 
-  ctx.emplace(iEvent,
-              tokenHit_,
-              gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.getGPUProductAsync(ctx.stream()), ctx.stream()));
+  ctx.emplace(
+      iEvent, tokenHit_, gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.getGPUProductAsync(ctx.stream()), ctx));
 }
 
 DEFINE_FWK_MODULE(SiPixelRecHitCUDA);
diff --git a/src/cudadev/plugin-Validation/HistoValidator.cc b/src/cudadev/plugin-Validation/HistoValidator.cc
index f8af1773d..e45e27f47 100644
--- a/src/cudadev/plugin-Validation/HistoValidator.cc
+++ b/src/cudadev/plugin-Validation/HistoValidator.cc
@@ -92,18 +92,18 @@ void HistoValidator::acquire(const edm::Event& iEvent, const edm::EventSetup& iS
 
   nDigis = digis.nDigis();
   nModules = digis.nModules();
-  h_adc = digis.adcToHostAsync(ctx.stream());
+  h_adc = digis.adcToHostAsync(ctx);
 
   nClusters = clusters.nClusters();
-  h_clusInModule = cms::cuda::make_host_unique<uint32_t[]>(nModules, ctx.stream());
+  h_clusInModule = cms::cuda::make_host_unique<uint32_t[]>(nModules, ctx);
   cudaCheck(cudaMemcpyAsync(
       h_clusInModule.get(), clusters.clusInModule(), sizeof(uint32_t) * nModules, cudaMemcpyDefault, ctx.stream()));
 
   nHits = hits.nHits();
-  h_localCoord = hits.localCoordToHostAsync(ctx.stream());
-  h_globalCoord = hits.globalCoordToHostAsync(ctx.stream());
-  h_charge = hits.chargeToHostAsync(ctx.stream());
-  h_size = hits.sizeToHostAsync(ctx.stream());
+  h_localCoord = hits.localCoordToHostAsync(ctx);
+  h_globalCoord = hits.globalCoordToHostAsync(ctx);
+  h_charge = hits.chargeToHostAsync(ctx);
+  h_size = hits.sizeToHostAsync(ctx);
 }
 
 void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) {
diff --git a/src/cudadev/test/HistoContainer_t.cu b/src/cudadev/test/HistoContainer_t.cu
index 4f5a4ee54..4263a7e82 100644
--- a/src/cudadev/test/HistoContainer_t.cu
+++ b/src/cudadev/test/HistoContainer_t.cu
@@ -8,6 +8,7 @@
 #include "CUDACore/cudaCheck.h"
 #include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/requireDevices.h"
+#include "CUDACore/TestContext.h"
 
 using namespace cms::cuda;
 
@@ -15,10 +16,11 @@ template <typename T>
 void go() {
   std::mt19937 eng;
   std::uniform_int_distribution<T> rgen(std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
+  cms::cudatest::TestContext ctx;
 
   constexpr int N = 12000;
   T v[N];
-  auto v_d = make_device_unique<T[]>(N, nullptr);
+  auto v_d = make_device_unique<T[]>(N, ctx);
 
   cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
 
@@ -34,9 +36,9 @@ void go() {
   assert(Hist::totbins() == Hist::ctNOnes());
 
   Hist h;
-  auto h_d = make_device_unique<Hist[]>(1, nullptr);
+  auto h_d = make_device_unique<Hist[]>(1, ctx);
 
-  auto off_d = make_device_unique<uint32_t[]>(nParts + 1, nullptr);
+  auto off_d = make_device_unique<uint32_t[]>(nParts + 1, ctx);
 
   for (int it = 0; it < 5; ++it) {
     offsets[0] = 0;
diff --git a/src/cudadev/test/OneHistoContainer_t.cu b/src/cudadev/test/OneHistoContainer_t.cu
index 89fed087a..e37b21b9c 100644
--- a/src/cudadev/test/OneHistoContainer_t.cu
+++ b/src/cudadev/test/OneHistoContainer_t.cu
@@ -9,6 +9,7 @@
 #include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/launch.h"
 #include "CUDACore/requireDevices.h"
+#include "CUDACore/TestContext.h"
 
 using namespace cms::cuda;
 
@@ -104,11 +105,12 @@ void go() {
   }
 
   std::uniform_int_distribution<T> rgen(rmin, rmax);
+  cms::cudatest::TestContext ctx;
 
   constexpr int N = 12000;
   T v[N];
 
-  auto v_d = make_device_unique<T[]>(N, nullptr);
+  auto v_d = make_device_unique<T[]>(N, ctx);
   assert(v_d.get());
 
   using Hist = HistoContainer<T, NBINS, N, S>;
diff --git a/src/cudadev/test/OneToManyAssoc_t.h b/src/cudadev/test/OneToManyAssoc_t.h
index 50bf6fb38..a80b71a1c 100644
--- a/src/cudadev/test/OneToManyAssoc_t.h
+++ b/src/cudadev/test/OneToManyAssoc_t.h
@@ -11,6 +11,7 @@
 #include "CUDACore/cudaCheck.h"
 #include "CUDACore/requireDevices.h"
 #include "CUDACore/currentDevice.h"
+#include "CUDACore/TestContext.h"
 #endif
 
 #include "CUDACore/OneToManyAssoc.h"
@@ -141,7 +142,8 @@ __global__ void verifyFinal(Assoc const* __restrict__ la, int N) {
 template <typename T>
 auto make_unique(std::size_t size) {
 #ifdef __CUDACC__
-  return cms::cuda::make_device_unique<T>(size, 0);
+  cms::cudatest::TestContext ctx;
+  return cms::cuda::make_device_unique<T>(size, ctx);
 #else
   return std::make_unique<T>(size);
 #endif
@@ -219,7 +221,8 @@ int main() {
   auto a_d = make_unique<Assoc[]>(1);
   auto sa_d = make_unique<SmallAssoc[]>(1);
 #ifdef __CUDACC__
-  auto v_d = cms::cuda::make_device_unique<std::array<uint16_t, 4>[]>(N, nullptr);
+  cms::cudatest::TestContext ctx;
+  auto v_d = cms::cuda::make_device_unique<std::array<uint16_t, 4>[]>(N, ctx);
   assert(v_d.get());
   cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice));
 #else
diff --git a/src/cudadev/test/TrackingRecHit2DCUDA_t.cu b/src/cudadev/test/TrackingRecHit2DCUDA_t.cu
index 5f3a26391..86abafbe6 100644
--- a/src/cudadev/test/TrackingRecHit2DCUDA_t.cu
+++ b/src/cudadev/test/TrackingRecHit2DCUDA_t.cu
@@ -1,6 +1,7 @@
 #include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h"
 #include "CUDACore/copyAsync.h"
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/TestContext.h"
 
 namespace testTrackingRecHit2D {
 
@@ -41,11 +42,12 @@ namespace testTrackingRecHit2D {
 int main() {
   cudaStream_t stream;
   cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+  cms::cudatest::TestContext ctx{stream};
 
   // inner scope to deallocate memory before destroying the stream
   {
     auto nHits = 200;
-    TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, stream);
+    TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, ctx);
 
     testTrackingRecHit2D::runKernels(tkhit.view());
   }
diff --git a/src/cudadev/test/VertexFinder_t.h b/src/cudadev/test/VertexFinder_t.h
index aed660c0d..cb9ba942e 100644
--- a/src/cudadev/test/VertexFinder_t.h
+++ b/src/cudadev/test/VertexFinder_t.h
@@ -7,6 +7,7 @@
 #include "CUDACore/cudaCheck.h"
 #include "CUDACore/requireDevices.h"
 #include "CUDACore/launch.h"
+#include "CUDACore/TestContext.h"
 #ifdef USE_DBSCAN
 #include "plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h"
 #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN
@@ -114,8 +115,9 @@ int main() {
 #ifdef __CUDACC__
   cms::cudatest::requireDevices();
 
-  auto onGPU_d = cms::cuda::make_device_unique<gpuVertexFinder::ZVertices[]>(1, nullptr);
-  auto ws_d = cms::cuda::make_device_unique<gpuVertexFinder::WorkSpace[]>(1, nullptr);
+  cms::cudatest::TestContext ctx;
+  auto onGPU_d = cms::cuda::make_device_unique<gpuVertexFinder::ZVertices[]>(1, ctx);
+  auto ws_d = cms::cuda::make_device_unique<gpuVertexFinder::WorkSpace[]>(1, ctx);
 #else
   auto onGPU_d = std::make_unique<gpuVertexFinder::ZVertices>();
   auto ws_d = std::make_unique<gpuVertexFinder::WorkSpace>();
diff --git a/src/cudadev/test/gpuClustering_t.h b/src/cudadev/test/gpuClustering_t.h
index 4f273e57a..03d090ce7 100644
--- a/src/cudadev/test/gpuClustering_t.h
+++ b/src/cudadev/test/gpuClustering_t.h
@@ -14,6 +14,7 @@
 #include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/launch.h"
 #include "CUDACore/requireDevices.h"
+#include "CUDACore/TestContext.h"
 #endif  // __CUDACC__
 
 // dirty, but works
@@ -39,14 +40,15 @@ int main(void) {
   auto h_clus = std::make_unique<int[]>(numElements);
 
 #ifdef __CUDACC__
-  auto d_id = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
-  auto d_x = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
-  auto d_y = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
-  auto d_adc = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
-  auto d_clus = cms::cuda::make_device_unique<int[]>(numElements, nullptr);
-  auto d_moduleStart = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules + 1, nullptr);
-  auto d_clusInModule = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
-  auto d_moduleId = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
+  cms::cudatest::TestContext ctx;
+  auto d_id = cms::cuda::make_device_unique<uint16_t[]>(numElements, ctx);
+  auto d_x = cms::cuda::make_device_unique<uint16_t[]>(numElements, ctx);
+  auto d_y = cms::cuda::make_device_unique<uint16_t[]>(numElements, ctx);
+  auto d_adc = cms::cuda::make_device_unique<uint16_t[]>(numElements, ctx);
+  auto d_clus = cms::cuda::make_device_unique<int[]>(numElements, ctx);
+  auto d_moduleStart = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules + 1, ctx);
+  auto d_clusInModule = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, ctx);
+  auto d_moduleId = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, ctx);
 #else   // __CUDACC__
   auto h_moduleStart = std::make_unique<uint32_t[]>(maxNumModules + 1);
   auto h_clusInModule = std::make_unique<uint32_t[]>(maxNumModules);
diff --git a/src/cudadev/test/radixSort_t.cu b/src/cudadev/test/radixSort_t.cu
index d2c469510..c94273fce 100644
--- a/src/cudadev/test/radixSort_t.cu
+++ b/src/cudadev/test/radixSort_t.cu
@@ -15,6 +15,7 @@
 #include "CUDACore/launch.h"
 #include "CUDACore/radixSort.h"
 #include "CUDACore/requireDevices.h"
+#include "CUDACore/TestContext.h"
 
 using namespace cms::cuda;
 
@@ -137,10 +138,11 @@ void go(bool useShared) {
 
     std::random_shuffle(v, v + N);
 
-    auto v_d = cms::cuda::make_device_unique<U[]>(N, nullptr);
-    auto ind_d = cms::cuda::make_device_unique<uint16_t[]>(N, nullptr);
-    auto ws_d = cms::cuda::make_device_unique<uint16_t[]>(N, nullptr);
-    auto off_d = cms::cuda::make_device_unique<uint32_t[]>(blocks + 1, nullptr);
+    cms::cudatest::TestContext ctx;
+    auto v_d = cms::cuda::make_device_unique<U[]>(N, ctx);
+    auto ind_d = cms::cuda::make_device_unique<uint16_t[]>(N, ctx);
+    auto ws_d = cms::cuda::make_device_unique<uint16_t[]>(N, ctx);
+    auto off_d = cms::cuda::make_device_unique<uint32_t[]>(blocks + 1, ctx);
 
     cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
     cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice));