diff --git a/HeterogeneousCore/CUDACore/src/GPUCuda.cc b/HeterogeneousCore/CUDACore/src/GPUCuda.cc
index d712dcba75a8b..c26ecce0f854a 100644
--- a/HeterogeneousCore/CUDACore/src/GPUCuda.cc
+++ b/HeterogeneousCore/CUDACore/src/GPUCuda.cc
@@ -1,4 +1,5 @@
#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
+#include "chooseCUDADevice.h"
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
@@ -34,20 +35,14 @@ namespace heterogeneous {
return;
}
- // For startes we "statically" assign the device based on
- // edm::Stream number. This is suboptimal if the number of
- // edm::Streams is not a multiple of the number of CUDA devices
- // (and even then there is no load balancing).
- //
- // TODO: improve. Possible ideas include
+ // TODO: possible ideas to improve the "assignment" logic include
// - allocate M (< N(edm::Streams)) buffers per device per module, choose dynamically which (buffer, device) to use
// * the first module of a chain dictates the device for the rest of the chain
// - our own CUDA memory allocator
// * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
// * would probably still need some buffer space/device to hold e.g. conditions data
// - for conditions, how to handle multiple lumis per job?
- deviceId_ = id % cudaService->numberOfDevices();
-
+ deviceId_ = cudacore::chooseCUDADevice(id);
cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);
// Create the CUDA stream for this module-edm::Stream pair
diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
index a582ed2f72866..ce487507500cc 100644
--- a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
+++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
@@ -13,6 +13,6 @@ namespace cudacore {
// (and even then there is no load balancing).
//
// TODO: improve the "assignment" logic
- return id % cudaService->numberOfDevices();
+ return cudaService->devices()[id % cudaService->numberOfDevices()];
}
}
diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
index 041ed25ba134a..58ce8cc807515 100644
--- a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
+++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
@@ -4,4 +4,5 @@
+
diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
index b24f05adb2213..db6e7dd141c19 100644
--- a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
+++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
@@ -1,31 +1,7 @@
-#include <algorithm>
-#include <array>
#include <cstdlib>
-#include <iostream>
-#include <cuda_runtime.h>
+#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
int main() {
- int devices = 0;
- auto status = cudaGetDeviceCount(& devices);
- if (status != cudaSuccess) {
- return EXIT_FAILURE;
- }
-
- int minimumMajor = 6; // min minor is implicitly 0
-
- // This approach (requiring all devices are supported) is rather
- // conservative. In principle we could consider just dropping the
- // unsupported devices. Currently that would be easiest to achieve
- // in CUDAService though.
- for (int i = 0; i < devices; ++i) {
- cudaDeviceProp properties;
- cudaGetDeviceProperties(&properties, i);
-
- if(properties.major < minimumMajor) {
- return EXIT_FAILURE;
- }
- }
-
- return EXIT_SUCCESS;
+ return supportedCUDADevices().empty() ? EXIT_FAILURE : EXIT_SUCCESS;
}
diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h
index e54ec1be8ad20..7125b2c0dcf6e 100644
--- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h
+++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h
@@ -52,6 +52,9 @@ class CUDAService {
int numberOfDevices() const { return numberOfDevices_; }
+ // devices supported by the CUDA configuration and compilation flags
+ std::vector<int> const& devices() const { return supportedDevices_; }
+
// major, minor
std::pair<int, int> computeCapability(int device) { return computeCapabilities_.at(device); }
@@ -152,6 +155,7 @@ class CUDAService {
std::unique_ptr<CUDAEventCache> cudaEventCache_;
int numberOfDevices_ = 0;
+ std::vector<int> supportedDevices_;
std::vector<std::pair<int, int>> computeCapabilities_;
bool enabled_ = false;
};
diff --git a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
index 7b7711c63c502..5c1d042a6420b 100644
--- a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
+++ b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
@@ -30,7 +30,7 @@ class CUDAMonitoringService {
void postEvent(edm::StreamContext const& sc);
private:
- int numberOfDevices_ = 0;
+ std::vector<int> devices_;
};
CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
@@ -38,7 +38,7 @@ CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, ed
edm::Service<CUDAService> cudaService;
if(!cudaService->enabled())
return;
- numberOfDevices_ = cudaService->numberOfDevices();
+ devices_ = cudaService->devices();
if(config.getUntrackedParameter<bool>("memoryConstruction")) {
registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
@@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de
// activity handlers
namespace {
template <typename T>
- void dumpUsedMemory(T& log, int num) {
+ void dumpUsedMemory(T& log, std::vector<int> const& devices) {
int old = 0;
cudaCheck(cudaGetDevice(&old));
- for(int i = 0; i < num; ++i) {
+ for(int i: devices) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -82,19 +82,19 @@ namespace {
void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
- dumpUsedMemory(log, numberOfDevices_);
+ dumpUsedMemory(log, devices_);
}
void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")";
- dumpUsedMemory(log, numberOfDevices_);
+ dumpUsedMemory(log, devices_);
}
void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after event";
- dumpUsedMemory(log, numberOfDevices_);
+ dumpUsedMemory(log, devices_);
}
DEFINE_FWK_SERVICE(CUDAMonitoringService);
diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
index 9db5d89de1f83..2d6e0bfc6fd09 100644
--- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc
+++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -12,6 +12,7 @@
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
#include "CachingDeviceAllocator.h"
#include "CachingHostAllocator.h"
@@ -94,10 +95,10 @@ namespace {
}
}
- void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
+ void devicePreallocate(CUDAService& cs, const std::vector<unsigned int>& bufferSizes) {
int device;
cudaCheck(cudaGetDevice(&device));
- for(int i=0; i<numberOfDevices; ++i) {
+ for(int i: cs.devices()) {
cudaCheck(cudaSetDevice(i));
preallocate<cuda::memory::device::unique_ptr<char[]> >([&](size_t size, cuda::stream_t<>& stream) {
return cs.make_device_unique<char[]>(size, stream);
@@ -121,14 +122,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
return;
}
- auto status = cudaGetDeviceCount(&numberOfDevices_);
- if (cudaSuccess != status) {
+ supportedDevices_ = supportedCUDADevices();
+ numberOfDevices_ = supportedDevices_.size();
+ if (numberOfDevices_ == 0) {
edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService.";
return;
}
edm::LogInfo log("CUDAService");
- computeCapabilities_.reserve(numberOfDevices_);
- log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
+ log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n";
auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
@@ -137,7 +138,9 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
- for (int i = 0; i < numberOfDevices_; ++i) {
+ int lastDevice = supportedDevices_.back();
+ computeCapabilities_.resize(lastDevice + 1, std::make_pair(0, 0));
+ for (int i: supportedDevices_) {
// read information about the compute device.
// see the documentation of cudaGetDeviceProperties() for more information.
cudaDeviceProp properties;
@@ -145,10 +148,10 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
log << "CUDA device " << i << ": " << properties.name << '\n';
// compute capabilities
+ computeCapabilities_[i] = std::make_pair(properties.major, properties.minor);
log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n";
- computeCapabilities_.emplace_back(properties.major, properties.minor);
log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
- log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n';
+ log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";
// compute mode
@@ -291,7 +294,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
size_t minCachedBytes = std::numeric_limits<size_t>::max();
int currentDevice;
cudaCheck(cudaGetDevice(¤tDevice));
- for (int i = 0; i < numberOfDevices_; ++i) {
+ for (int i: supportedDevices_) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -331,8 +334,8 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
log << "cub::CachingDeviceAllocator disabled\n";
}
- cudaStreamCache_ = std::make_unique<CUDAStreamCache>(numberOfDevices_);
- cudaEventCache_ = std::make_unique<CUDAEventCache>(numberOfDevices_);
+ cudaStreamCache_ = std::make_unique<CUDAStreamCache>(lastDevice+1);
+ cudaEventCache_ = std::make_unique<CUDAEventCache>(lastDevice+1);
log << "\n";
@@ -340,7 +343,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
enabled_ = true;
// Preallocate buffers if asked to
- devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
+ devicePreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
hostPreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
}
@@ -353,7 +356,7 @@ CUDAService::~CUDAService() {
cudaEventCache_.reset();
cudaStreamCache_.reset();
- for (int i = 0; i < numberOfDevices_; ++i) {
+ for (int i: supportedDevices_) {
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaDeviceSynchronize());
// Explicitly destroys and cleans up all resources associated with the current device in the
@@ -398,7 +401,7 @@ int CUDAService::deviceWithMostFreeMemory() const {
size_t maxFreeMemory = 0;
int device = -1;
- for(int i = 0; i < numberOfDevices_; ++i) {
+ for (int i: supportedDevices_) {
/*
// TODO: understand why the api-wrappers version gives same value for all devices
auto device = cuda::device::get(i);
@@ -432,9 +435,6 @@ struct CUDAService::Allocator {
template <typename ...Args>
Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward<Args>(args)...) {}
- void devicePreallocate(int numberOfDevices, const std::vector& bytes);
- void hostPreallocate(int numberOfDevices, const std::vector& bytes);
-
size_t maxAllocation;
notcub::CachingDeviceAllocator deviceAllocator;
notcub::CachingHostAllocator hostAllocator;
diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
index 95768bdbd4b58..5e1bc65645841 100644
--- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
+++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
@@ -14,6 +14,7 @@
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
#include "FWCore/Utilities/interface/Exception.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
namespace {
CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) {
@@ -29,13 +30,10 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
// Test setup: check if a simple CUDA runtime API call fails:
// if so, skip the test with the CUDAService enabled
- int deviceCount = 0;
- auto ret = cudaGetDeviceCount( &deviceCount );
+ int deviceCount = supportedCUDADevices().size();
- if( ret != cudaSuccess ) {
- WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
- << ret << ") " << cudaGetErrorString( ret )
- << ". Running only tests not requiring devices.");
+ if (deviceCount == 0) {
+ WARN("No supported CUDA devices available. Running only tests not requiring devices.");
}
SECTION("CUDAService enabled") {
@@ -58,6 +56,7 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
}
auto cs = makeCUDAService(ps, ar);
+ cudaError_t ret;
SECTION("CUDA Queries") {
int driverVersion = 0, runtimeVersion = 0;
diff --git a/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h b/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h
new file mode 100644
index 0000000000000..53d984dd2beaa
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h
@@ -0,0 +1,8 @@
+#ifndef HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
+#define HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
+
+#include <vector>
+
+std::vector<int> supportedCUDADevices();
+
+#endif // HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
diff --git a/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc b/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
index 2d166e5c62840..c20f43c5ec794 100644
--- a/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
+++ b/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
@@ -4,6 +4,7 @@
#include
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
void exitSansCUDADevices() {
int devices = 0;
@@ -16,4 +17,9 @@ void exitSansCUDADevices() {
std::cerr << "No CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
+ int supported = supportedCUDADevices().size();
+ if (supported == 0) {
+ std::cerr << "No supported CUDA devices available, the test will be skipped." << "\n";
+ exit(EXIT_SUCCESS);
+ }
}
diff --git a/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu b/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu
new file mode 100644
index 0000000000000..9d629d2fc7554
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu
@@ -0,0 +1,42 @@
+#include <vector>
+
+#include <cuda_runtime.h>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
+
+__global__
+void isSupported(bool * result) {
+ * result = true;
+}
+
+std::vector<int> supportedCUDADevices() {
+ int devices = 0;
+ auto status = cudaGetDeviceCount(&devices);
+ if (status != cudaSuccess or devices == 0) {
+ return {};
+ }
+
+ std::vector<int> supportedDevices;
+ supportedDevices.reserve(devices);
+
+ for (int i = 0; i < devices; ++i) {
+ cudaCheck(cudaSetDevice(i));
+ bool supported = false;
+ bool * supported_d;
+ cudaCheck(cudaMalloc(&supported_d, sizeof(bool)));
+ cudaCheck(cudaMemset(supported_d, 0x00, sizeof(bool)));
+ isSupported<<<1,1>>>(supported_d);
+ // swallow any eventual error from launching the kernel on an unsupported device
+ cudaGetLastError();
+ cudaCheck(cudaDeviceSynchronize());
+ cudaCheck(cudaMemcpy(& supported, supported_d, sizeof(bool), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaFree(supported_d));
+ if (supported) {
+ supportedDevices.push_back(i);
+ }
+ cudaCheck(cudaDeviceReset());
+ }
+
+ return supportedDevices;
+}