From c8cd6cae4095b7b0539fad1c101c40e58e4b5fbc Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 7 Nov 2019 23:54:58 +0100 Subject: [PATCH 1/2] Remove cuda::device::current::scoped_override_t from CUDAScopedContext --- .../CUDACore/interface/CUDAScopedContext.h | 7 ++++++- HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc | 10 +++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index 2f71ae5c54026..f17fc37e4b949 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -33,6 +33,12 @@ namespace impl { const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } protected: + // The constructors set the current device device, but the device + // is not set back to the previous value at the destructor. This + // should be sufficient (and tiny bit faster) as all CUDA API + // functions relying on the current device should be called from + // the scope where this context is. The current device doesn't + // really matter between modules (or across TBB tasks). explicit CUDAScopedContextBase(edm::StreamID streamID); explicit CUDAScopedContextBase(const CUDAProductBase& data); @@ -41,7 +47,6 @@ namespace impl { private: int currentDevice_; - cuda::device::current::scoped_override_t<> setDeviceForThisScope_; cudautils::SharedStreamPtr stream_; }; diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc index 54dafc26d19d3..227c1f9928037 100644 --- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -39,12 +39,14 @@ namespace { namespace impl { CUDAScopedContextBase::CUDAScopedContextBase(edm::StreamID streamID) - : currentDevice_(cudacore::chooseCUDADevice(streamID)), setDeviceForThisScope_(currentDevice_) { + : currentDevice_(cudacore::chooseCUDADevice(streamID)) { + cudaCheck(cudaSetDevice(currentDevice_)); stream_ = cudautils::getCUDAStreamCache().getCUDAStream(); } CUDAScopedContextBase::CUDAScopedContextBase(const CUDAProductBase& data) - : currentDevice_(data.device()), setDeviceForThisScope_(currentDevice_) { + : currentDevice_(data.device()) { + cudaCheck(cudaSetDevice(currentDevice_)); if (data.mayReuseStream()) { stream_ = data.streamPtr(); } else { @@ -53,7 +55,9 @@ namespace impl { } CUDAScopedContextBase::CUDAScopedContextBase(int device, cudautils::SharedStreamPtr stream) - : currentDevice_(device), setDeviceForThisScope_(device), stream_(std::move(stream)) {} + : currentDevice_(device), stream_(std::move(stream)) { + cudaCheck(cudaSetDevice(currentDevice_)); + } //////////////////// From 3613acd3cb22baef2fc6ef63638bbfc7daa15b15 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 8 Nov 2019 00:15:04 +0100 Subject: [PATCH 2/2] Replace cuda::device::current::scoped_override_t wth cudautils::ScopedSetDevice --- HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc | 3 ++- HeterogeneousCore/CUDAUtilities/src/allocate_device.cc | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index 1888319af2698..fde80b3c7cc0f 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -11,6 +11,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" #include "test_CUDAScopedContextKernels.h" @@ -84,7 +85,7 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { } SECTION("Joining multiple CUDA streams") { - cuda::device::current::scoped_override_t<> setDeviceForThisScope(defaultDevice); + cudautils::ScopedSetDevice setDeviceForThisScope(defaultDevice); auto current_device = cuda::device::current::get(); // Mimick a producer on the first CUDA stream diff --git a/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc b/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc index 425c16bcf287b..f0bab1b57aa1b 100644 --- a/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc +++ b/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc @@ -1,5 +1,6 @@ #include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" #include "FWCore/Utilities/interface/Likely.h" #include "getCachingDeviceAllocator.h" @@ -23,7 +24,7 @@ namespace cudautils { } cuda::throw_if_error(cudautils::allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, stream)); } else { - cuda::device::current::scoped_override_t<> setDeviceForThisScope(dev); + ScopedSetDevice setDeviceForThisScope(dev); cuda::throw_if_error(cudaMalloc(&ptr, nbytes)); } return ptr; @@ -33,7 +34,7 @@ namespace cudautils { if constexpr (cudautils::allocator::useCaching) { cuda::throw_if_error(cudautils::allocator::getCachingDeviceAllocator().DeviceFree(device, ptr)); } else { - cuda::device::current::scoped_override_t<> setDeviceForThisScope(device); + ScopedSetDevice setDeviceForThisScope(device); cuda::throw_if_error(cudaFree(ptr)); } }