diff --git a/HeterogeneousCore/CUDACore/src/GPUCuda.cc b/HeterogeneousCore/CUDACore/src/GPUCuda.cc index d712dcba75a8b..c26ecce0f854a 100644 --- a/HeterogeneousCore/CUDACore/src/GPUCuda.cc +++ b/HeterogeneousCore/CUDACore/src/GPUCuda.cc @@ -1,4 +1,5 @@ #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "chooseCUDADevice.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/ServiceRegistry/interface/Service.h" @@ -34,20 +35,14 @@ namespace heterogeneous { return; } - // For startes we "statically" assign the device based on - // edm::Stream number. This is suboptimal if the number of - // edm::Streams is not a multiple of the number of CUDA devices - // (and even then there is no load balancing). - // - // TODO: improve. Possible ideas include + // TODO: possible ideas to improve the "assignment" logic include // - allocate M (< N(edm::Streams)) buffers per device per module, choose dynamically which (buffer, device) to use // * the first module of a chain dictates the device for the rest of the chain // - our own CUDA memory allocator // * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load // * would probably still need some buffer space/device to hold e.g. conditions data // - for conditions, how to handle multiple lumis per job? - deviceId_ = id % cudaService->numberOfDevices(); - + deviceId_ = cudacore::chooseCUDADevice(id); cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_); // Create the CUDA stream for this module-edm::Stream pair diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc index a582ed2f72866..ce487507500cc 100644 --- a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc +++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc @@ -13,6 +13,6 @@ namespace cudacore { // (and even then there is no load balancing). 
// // TODO: improve the "assignment" logic - return id % cudaService->numberOfDevices(); + return cudaService->devices()[id % cudaService->numberOfDevices()]; } } diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml index 041ed25ba134a..58ce8cc807515 100644 --- a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml @@ -4,4 +4,5 @@ + diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp index b24f05adb2213..db6e7dd141c19 100644 --- a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp +++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp @@ -1,31 +1,7 @@ -#include -#include #include -#include -#include +#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h" int main() { - int devices = 0; - auto status = cudaGetDeviceCount(& devices); - if (status != cudaSuccess) { - return EXIT_FAILURE; - } - - int minimumMajor = 6; // min minor is implicitly 0 - - // This approach (requiring all devices are supported) is rather - // conservative. In principle we could consider just dropping the - // unsupported devices. Currently that would be easiest to achieve - // in CUDAService though. - for (int i = 0; i < devices; ++i) { - cudaDeviceProp properties; - cudaGetDeviceProperties(&properties, i); - - if(properties.major < minimumMajor) { - return EXIT_FAILURE; - } - } - - return EXIT_SUCCESS; + return supportedCUDADevices().empty() ? 
EXIT_FAILURE : EXIT_SUCCESS; } diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index e54ec1be8ad20..7125b2c0dcf6e 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -52,6 +52,9 @@ class CUDAService { int numberOfDevices() const { return numberOfDevices_; } + // devices supported by the CUDA configuration and compilation flags + std::vector<int> const& devices() const { return supportedDevices_; } + // major, minor std::pair<int, int> computeCapability(int device) { return computeCapabilities_.at(device); } @@ -152,6 +155,7 @@ class CUDAService { std::unique_ptr<CUDAEventCache> cudaEventCache_; int numberOfDevices_ = 0; + std::vector<int> supportedDevices_; std::vector<std::pair<int, int>> computeCapabilities_; bool enabled_ = false; }; diff --git a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc index 7b7711c63c502..5c1d042a6420b 100644 --- a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc +++ b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc @@ -30,7 +30,7 @@ class CUDAMonitoringService { void postEvent(edm::StreamContext const& sc); private: - int numberOfDevices_ = 0; + std::vector<int> devices_; }; CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) { @@ -38,7 +38,7 @@ CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, ed edm::Service<CUDAService> cudaService; if(!cudaService->enabled()) return; - numberOfDevices_ = cudaService->numberOfDevices(); + devices_ = cudaService->devices(); if(config.getUntrackedParameter<bool>("memoryConstruction")) { registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction); @@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de // activity handlers namespace { template <typename T> - void
dumpUsedMemory(T& log, int num) { + void dumpUsedMemory(T& log, std::vector<int> const& devices) { int old = 0; cudaCheck(cudaGetDevice(&old)); - for(int i = 0; i < num; ++i) { + for(int i: devices) { size_t freeMemory, totalMemory; cudaCheck(cudaSetDevice(i)); cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); @@ -82,19 +82,19 @@ namespace { void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) { auto log = edm::LogPrint("CUDAMonitoringService"); log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")"; - dumpUsedMemory(log, numberOfDevices_); + dumpUsedMemory(log, devices_); } void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) { auto log = edm::LogPrint("CUDAMonitoringService"); log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")"; - dumpUsedMemory(log, numberOfDevices_); + dumpUsedMemory(log, devices_); } void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) { auto log = edm::LogPrint("CUDAMonitoringService"); log << "CUDA device memory after event"; - dumpUsedMemory(log, numberOfDevices_); + dumpUsedMemory(log, devices_); } DEFINE_FWK_SERVICE(CUDAMonitoringService); diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 9db5d89de1f83..2d6e0bfc6fd09 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -12,6 +12,7 @@ #include "FWCore/Utilities/interface/ReusableObjectHolder.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h" #include "CachingDeviceAllocator.h" #include "CachingHostAllocator.h" @@ -94,10 +95,10 @@ namespace
{ } } - void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector& bufferSizes) { + void devicePreallocate(CUDAService& cs, const std::vector& bufferSizes) { int device; cudaCheck(cudaGetDevice(&device)); - for(int i=0; i([&](size_t size, cuda::stream_t<>& stream) { return cs.make_device_unique(size, stream); @@ -121,14 +122,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& return; } - auto status = cudaGetDeviceCount(&numberOfDevices_); - if (cudaSuccess != status) { + supportedDevices_ = supportedCUDADevices(); + numberOfDevices_ = supportedDevices_.size(); + if (numberOfDevices_ == 0) { edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService."; return; } edm::LogInfo log("CUDAService"); - computeCapabilities_.reserve(numberOfDevices_); - log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n"; + log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n"; auto const& limits = config.getUntrackedParameter("limits"); auto printfFifoSize = limits.getUntrackedParameter("cudaLimitPrintfFifoSize"); @@ -137,7 +138,9 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& auto devRuntimeSyncDepth = limits.getUntrackedParameter("cudaLimitDevRuntimeSyncDepth"); auto devRuntimePendingLaunchCount = limits.getUntrackedParameter("cudaLimitDevRuntimePendingLaunchCount"); - for (int i = 0; i < numberOfDevices_; ++i) { + int lastDevice = supportedDevices_.back(); + computeCapabilities_.resize(lastDevice + 1, std::make_pair(0, 0)); + for (int i: supportedDevices_) { // read information about the compute device. // see the documentation of cudaGetDeviceProperties() for more information. 
cudaDeviceProp properties; @@ -145,10 +148,10 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& log << "CUDA device " << i << ": " << properties.name << '\n'; // compute capabilities + computeCapabilities_[i] = std::make_pair(properties.major, properties.minor); log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n"; - computeCapabilities_.emplace_back(properties.major, properties.minor); log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n'; - log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n'; + log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n'; log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n"; // compute mode @@ -291,7 +294,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& size_t minCachedBytes = std::numeric_limits::max(); int currentDevice; cudaCheck(cudaGetDevice(¤tDevice)); - for (int i = 0; i < numberOfDevices_; ++i) { + for (int i: supportedDevices_) { size_t freeMemory, totalMemory; cudaCheck(cudaSetDevice(i)); cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); @@ -331,8 +334,8 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& log << "cub::CachingDeviceAllocator disabled\n"; } - cudaStreamCache_ = std::make_unique(numberOfDevices_); - cudaEventCache_ = std::make_unique(numberOfDevices_); + cudaStreamCache_ = std::make_unique(lastDevice+1); + cudaEventCache_ = std::make_unique(lastDevice+1); log << "\n"; @@ -340,7 +343,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& enabled_ = true; // Preallocate buffers if asked to - devicePreallocate(*this, 
numberOfDevices_, allocator.getUntrackedParameter >("devicePreallocate")); + devicePreallocate(*this, allocator.getUntrackedParameter >("devicePreallocate")); hostPreallocate(*this, allocator.getUntrackedParameter >("hostPreallocate")); } @@ -353,7 +356,7 @@ CUDAService::~CUDAService() { cudaEventCache_.reset(); cudaStreamCache_.reset(); - for (int i = 0; i < numberOfDevices_; ++i) { + for (int i: supportedDevices_) { cudaCheck(cudaSetDevice(i)); cudaCheck(cudaDeviceSynchronize()); // Explicitly destroys and cleans up all resources associated with the current device in the @@ -398,7 +401,7 @@ int CUDAService::deviceWithMostFreeMemory() const { size_t maxFreeMemory = 0; int device = -1; - for(int i = 0; i < numberOfDevices_; ++i) { + for (int i: supportedDevices_) { /* // TODO: understand why the api-wrappers version gives same value for all devices auto device = cuda::device::get(i); @@ -432,9 +435,6 @@ struct CUDAService::Allocator { template Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward(args)...) 
{} - void devicePreallocate(int numberOfDevices, const std::vector& bytes); - void hostPreallocate(int numberOfDevices, const std::vector& bytes); - size_t maxAllocation; notcub::CachingDeviceAllocator deviceAllocator; notcub::CachingHostAllocator hostAllocator; diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp index 95768bdbd4b58..5e1bc65645841 100644 --- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp +++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp @@ -14,6 +14,7 @@ #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" #include "FWCore/Utilities/interface/Exception.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h" namespace { CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) { @@ -29,13 +30,10 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { // Test setup: check if a simple CUDA runtime API call fails: // if so, skip the test with the CUDAService enabled - int deviceCount = 0; - auto ret = cudaGetDeviceCount( &deviceCount ); + int deviceCount = supportedCUDADevices().size(); - if( ret != cudaSuccess ) { - WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" - << ret << ") " << cudaGetErrorString( ret ) - << ". Running only tests not requiring devices."); + if (deviceCount == 0) { + WARN("No supported CUDA devices available. 
Running only tests not requiring devices."); } SECTION("CUDAService enabled") { @@ -58,6 +56,7 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { } auto cs = makeCUDAService(ps, ar); + cudaError_t ret; SECTION("CUDA Queries") { int driverVersion = 0, runtimeVersion = 0; } diff --git a/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h b/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h new file mode 100644 index 0000000000000..53d984dd2beaa --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h @@ -0,0 +1,8 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h +#define HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h + +#include <vector> + +std::vector<int> supportedCUDADevices(); + +#endif // HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h diff --git a/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc b/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc index 2d166e5c62840..c20f43c5ec794 100644 --- a/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc +++ b/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc @@ -4,6 +4,7 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h" void exitSansCUDADevices() { int devices = 0; @@ -16,4 +17,9 @@ void exitSansCUDADevices() { std::cerr << "No CUDA devices available, the test will be skipped." << "\n"; exit(EXIT_SUCCESS); } + int supported = supportedCUDADevices().size(); + if (supported == 0) { + std::cerr << "No supported CUDA devices available, the test will be skipped."
<< "\n"; + exit(EXIT_SUCCESS); + } } diff --git a/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu b/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu new file mode 100644 index 0000000000000..9d629d2fc7554 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu @@ -0,0 +1,42 @@ +#include <vector> + +#include <cuda_runtime.h> + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h" + +__global__ +void isSupported(bool * result) { + * result = true; +} + +std::vector<int> supportedCUDADevices() { + int devices = 0; + auto status = cudaGetDeviceCount(&devices); + if (status != cudaSuccess or devices == 0) { + return {}; + } + + std::vector<int> supportedDevices; + supportedDevices.reserve(devices); + + for (int i = 0; i < devices; ++i) { + cudaCheck(cudaSetDevice(i)); + bool supported = false; + bool * supported_d; + cudaCheck(cudaMalloc(&supported_d, sizeof(bool))); + cudaCheck(cudaMemset(supported_d, 0x00, sizeof(bool))); + isSupported<<<1,1>>>(supported_d); + // swallow any eventual error from launching the kernel on an unsupported device + cudaGetLastError(); + cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaMemcpy(& supported, supported_d, sizeof(bool), cudaMemcpyDeviceToHost)); + cudaCheck(cudaFree(supported_d)); + if (supported) { + supportedDevices.push_back(i); + } + cudaCheck(cudaDeviceReset()); + } + + return supportedDevices; +}