Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DO NOT MERGE] Use only CUDA devices with a supported architecture #299

Open
wants to merge 1 commit into
base: CMSSW_11_1_X_Patatrack
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 3 additions & 8 deletions HeterogeneousCore/CUDACore/src/GPUCuda.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
#include "chooseCUDADevice.h"

#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
Expand Down Expand Up @@ -34,20 +35,14 @@ namespace heterogeneous {
return;
}

// For starters we "statically" assign the device based on
// edm::Stream number. This is suboptimal if the number of
// edm::Streams is not a multiple of the number of CUDA devices
// (and even then there is no load balancing).
//
// TODO: improve. Possible ideas include
// TODO: possible ideas to improve the "assignment" logic include
// - allocate M (< N(edm::Streams)) buffers per device per module, choose dynamically which (buffer, device) to use
// * the first module of a chain dictates the device for the rest of the chain
// - our own CUDA memory allocator
// * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
// * would probably still need some buffer space/device to hold e.g. conditions data
// - for conditions, how to handle multiple lumis per job?
deviceId_ = id % cudaService->numberOfDevices();

deviceId_ = cudacore::chooseCUDADevice(id);
cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);

// Create the CUDA stream for this module-edm::Stream pair
Expand Down
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ namespace cudacore {
// (and even then there is no load balancing).
//
// TODO: improve the "assignment" logic
return id % cudaService->numberOfDevices();
return cudaService->devices()[id % cudaService->numberOfDevices()];
}
}
1 change: 1 addition & 0 deletions HeterogeneousCore/CUDAServices/bin/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@

<bin name="cudaIsEnabled" file="cudaIsEnabled.cpp">
<use name="cuda"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
</bin>
28 changes: 2 additions & 26 deletions HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
Original file line number Diff line number Diff line change
@@ -1,31 +1,7 @@
#include <algorithm>
#include <array>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

int main() {
int devices = 0;
auto status = cudaGetDeviceCount(& devices);
if (status != cudaSuccess) {
return EXIT_FAILURE;
}

int minimumMajor = 6; // min minor is implicitly 0

// This approach (requiring all devices are supported) is rather
// conservative. In principle we could consider just dropping the
// unsupported devices. Currently that would be easiest to achieve
// in CUDAService though.
for (int i = 0; i < devices; ++i) {
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, i);

if(properties.major < minimumMajor) {
return EXIT_FAILURE;
}
}

return EXIT_SUCCESS;
return supportedCUDADevices().empty() ? EXIT_FAILURE : EXIT_SUCCESS;
}
4 changes: 4 additions & 0 deletions HeterogeneousCore/CUDAServices/interface/CUDAService.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ class CUDAService {

int numberOfDevices() const { return numberOfDevices_; }

// devices supported by the CUDA configuration and compilation flags
std::vector<int> const& devices() const { return supportedDevices_; }

// major, minor
std::pair<int, int> computeCapability(int device) { return computeCapabilities_.at(device); }

Expand Down Expand Up @@ -152,6 +155,7 @@ class CUDAService {
std::unique_ptr<CUDAEventCache> cudaEventCache_;

int numberOfDevices_ = 0;
std::vector<int> supportedDevices_;
std::vector<std::pair<int, int>> computeCapabilities_;
bool enabled_ = false;
};
Expand Down
14 changes: 7 additions & 7 deletions HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ class CUDAMonitoringService {
void postEvent(edm::StreamContext const& sc);

private:
int numberOfDevices_ = 0;
std::vector<int> devices_;
};

CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
// make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
edm::Service<CUDAService> cudaService;
if(!cudaService->enabled())
return;
numberOfDevices_ = cudaService->numberOfDevices();
devices_ = cudaService->devices();

if(config.getUntrackedParameter<bool>("memoryConstruction")) {
registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
Expand Down Expand Up @@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de
// activity handlers
namespace {
template <typename T>
void dumpUsedMemory(T& log, int num) {
void dumpUsedMemory(T& log, std::vector<int> const& devices) {
int old = 0;
cudaCheck(cudaGetDevice(&old));
for(int i = 0; i < num; ++i) {
for(int i: devices) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
Expand All @@ -82,19 +82,19 @@ namespace {
void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after event";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

DEFINE_FWK_SERVICE(CUDAMonitoringService);
36 changes: 18 additions & 18 deletions HeterogeneousCore/CUDAServices/src/CUDAService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

#include "CachingDeviceAllocator.h"
#include "CachingHostAllocator.h"
Expand Down Expand Up @@ -94,10 +95,10 @@ namespace {
}
}

void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
void devicePreallocate(CUDAService& cs, const std::vector<unsigned int>& bufferSizes) {
int device;
cudaCheck(cudaGetDevice(&device));
for(int i=0; i<numberOfDevices; ++i) {
for (int i : cs.devices()) {
cudaCheck(cudaSetDevice(i));
preallocate<cudautils::device::unique_ptr>([&](size_t size, cuda::stream_t<>& stream) {
return cs.make_device_unique<char[]>(size, stream);
Expand All @@ -121,14 +122,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
return;
}

auto status = cudaGetDeviceCount(&numberOfDevices_);
if (cudaSuccess != status) {
supportedDevices_ = supportedCUDADevices();
numberOfDevices_ = supportedDevices_.size();
if (numberOfDevices_ == 0) {
edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService.";
return;
}
edm::LogInfo log("CUDAService");
computeCapabilities_.reserve(numberOfDevices_);
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n";

auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
Expand All @@ -137,18 +138,20 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

for (int i = 0; i < numberOfDevices_; ++i) {
int lastDevice = supportedDevices_.back();
computeCapabilities_.resize(lastDevice + 1, std::make_pair(0, 0));
for (int i: supportedDevices_) {
// read information about the compute device.
// see the documentation of cudaGetDeviceProperties() for more information.
cudaDeviceProp properties;
cudaCheck(cudaGetDeviceProperties(&properties, i));
log << "CUDA device " << i << ": " << properties.name << '\n';

// compute capabilities
computeCapabilities_[i] = std::make_pair(properties.major, properties.minor);
log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n";
computeCapabilities_.emplace_back(properties.major, properties.minor);
log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";

// compute mode
Expand Down Expand Up @@ -291,7 +294,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
size_t minCachedBytes = std::numeric_limits<size_t>::max();
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));
for (int i = 0; i < numberOfDevices_; ++i) {
for (int i: supportedDevices_) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
Expand Down Expand Up @@ -331,16 +334,16 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
log << "cub::CachingDeviceAllocator disabled\n";
}

cudaStreamCache_ = std::make_unique<CUDAStreamCache>(numberOfDevices_);
cudaEventCache_ = std::make_unique<CUDAEventCache>(numberOfDevices_);
cudaStreamCache_ = std::make_unique<CUDAStreamCache>(lastDevice+1);
cudaEventCache_ = std::make_unique<CUDAEventCache>(lastDevice+1);

log << "\n";

log << "CUDAService fully initialized";
enabled_ = true;

// Preallocate buffers if asked to
devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
devicePreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
hostPreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
}

Expand All @@ -353,7 +356,7 @@ CUDAService::~CUDAService() {
cudaEventCache_.reset();
cudaStreamCache_.reset();

for (int i = 0; i < numberOfDevices_; ++i) {
for (int i: supportedDevices_) {
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaDeviceSynchronize());
// Explicitly destroys and cleans up all resources associated with the current device in the
Expand Down Expand Up @@ -398,7 +401,7 @@ int CUDAService::deviceWithMostFreeMemory() const {

size_t maxFreeMemory = 0;
int device = -1;
for(int i = 0; i < numberOfDevices_; ++i) {
for (int i: supportedDevices_) {
/*
// TODO: understand why the api-wrappers version gives same value for all devices
auto device = cuda::device::get(i);
Expand Down Expand Up @@ -432,9 +435,6 @@ struct CUDAService::Allocator {
template <typename ...Args>
Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward<Args>(args)...) {}

void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);
void hostPreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);

size_t maxAllocation;
notcub::CachingDeviceAllocator deviceAllocator;
notcub::CachingHostAllocator hostAllocator;
Expand Down
11 changes: 5 additions & 6 deletions HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
#include "FWCore/Utilities/interface/Exception.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

namespace {
CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) {
Expand All @@ -29,13 +30,10 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {

// Test setup: check if a simple CUDA runtime API call fails:
// if so, skip the test with the CUDAService enabled
int deviceCount = 0;
auto ret = cudaGetDeviceCount( &deviceCount );
int deviceCount = supportedCUDADevices().size();

if( ret != cudaSuccess ) {
WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
<< ret << ") " << cudaGetErrorString( ret )
<< ". Running only tests not requiring devices.");
if (deviceCount == 0) {
WARN("No supported CUDA devices available. Running only tests not requiring devices.");
}

SECTION("CUDAService enabled") {
Expand All @@ -58,6 +56,7 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
}

auto cs = makeCUDAService(ps, ar);
cudaError_t ret;

SECTION("CUDA Queries") {
int driverVersion = 0, runtimeVersion = 0;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
#define HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h

#include <vector>

// Returns the ids of the visible CUDA devices on which a trivial test kernel
// can be launched and run successfully, i.e. the devices whose architecture is
// supported by the current CUDA configuration and compilation flags.
// Returns an empty vector if the CUDA runtime cannot be initialised or if no
// device is present.
std::vector<int> supportedCUDADevices();

#endif // HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
6 changes: 6 additions & 0 deletions HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

void exitSansCUDADevices() {
int devices = 0;
Expand All @@ -16,4 +17,9 @@ void exitSansCUDADevices() {
std::cerr << "No CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
int supported = supportedCUDADevices().size();
if (supported == 0) {
std::cerr << "No supported CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
}
42 changes: 42 additions & 0 deletions HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include <vector>

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

// Probe kernel: if it can be launched and executed on a device, that device's
// architecture is supported by the binaries produced by the current build.
__global__
void isSupported(bool* result) {
  result[0] = true;
}

// Probe every visible CUDA device by launching a trivial kernel on it, and
// return the ids of the devices where the kernel ran successfully.
// An empty vector means either that the CUDA runtime could not be initialised
// or that no visible device has an architecture supported by the current
// compilation flags.
std::vector<int> supportedCUDADevices() {
  int devices = 0;
  auto status = cudaGetDeviceCount(&devices);
  if (status != cudaSuccess or devices == 0) {
    return {};
  }

  std::vector<int> supportedDevices;
  supportedDevices.reserve(devices);

  for (int i = 0; i < devices; ++i) {
    cudaCheck(cudaSetDevice(i));
    bool supported = false;
    bool * supported_d;
    cudaCheck(cudaMalloc(&supported_d, sizeof(bool)));
    cudaCheck(cudaMemset(supported_d, 0x00, sizeof(bool)));
    isSupported<<<1,1>>>(supported_d);
    // swallow any eventual error from launching the kernel on an unsupported device
    cudaGetLastError();
    // On an unsupported device the failed launch may be reported (again) by
    // cudaDeviceSynchronize(); do not cudaCheck() it, treat any failure here
    // simply as "device not supported" instead of aborting.
    if (cudaDeviceSynchronize() == cudaSuccess) {
      cudaCheck(cudaMemcpy(& supported, supported_d, sizeof(bool), cudaMemcpyDeviceToHost));
    }
    cudaCheck(cudaFree(supported_d));
    if (supported) {
      supportedDevices.push_back(i);
    }
    // release the per-device state created by the probe
    cudaCheck(cudaDeviceReset());
  }

  return supportedDevices;
}