Skip to content

Commit

Permalink
Merge pull request #40551 from valsdav/tensorflow_gpu
Browse files Browse the repository at this point in the history
Add GPU backend option for TensorFlow session
  • Loading branch information
cmsbuild authored Feb 13, 2023
2 parents 6f5b833 + c85d7d8 commit 39853ed
Show file tree
Hide file tree
Showing 24 changed files with 1,114 additions and 42 deletions.
1 change: 1 addition & 0 deletions PhysicsTools/TensorFlow/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
<use name="FWCore/Framework"/>
<use name="FWCore/MessageLogger"/>
<use name="FWCore/Utilities"/>
<use name="FWCore/ServiceRegistry"/>
<export>
<lib name="1"/>
</export>
19 changes: 15 additions & 4 deletions PhysicsTools/TensorFlow/interface/TensorFlow.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@

namespace tensorflow {

enum class Backend { cpu, cuda, rocm, intel, best };

typedef std::pair<std::string, Tensor> NamedTensor;
typedef std::vector<NamedTensor> NamedTensorList;

Expand All @@ -39,6 +41,10 @@ namespace tensorflow {
// since the threading configuration is done per run() call as of 2.1
void setThreading(SessionOptions& sessionOptions, int nThreads, const std::string& singleThreadPool);

// Set the backend option cpu/cuda
// The gpu memory is set to "allow_growth" to avoid TF getting all the CUDA memory at once.
void setBackend(SessionOptions& sessionOptions, Backend backend = Backend::cpu);

// loads a meta graph definition saved at exportDir using the SavedModel interface for a tag and
// predefined sessionOptions
// transfers ownership
Expand All @@ -52,11 +58,13 @@ namespace tensorflow {
// transfers ownership
MetaGraphDef* loadMetaGraphDef(const std::string& exportDir,
const std::string& tag = kSavedModelTagServe,
Backend backend = Backend::cpu,
int nThreads = 1);

// deprecated in favor of loadMetaGraphDef
MetaGraphDef* loadMetaGraph(const std::string& exportDir,
const std::string& tag = kSavedModelTagServe,
Backend backend = Backend::cpu,
int nThreads = 1);

// loads a graph definition saved as a protobuf file at pbFile
Expand All @@ -67,9 +75,9 @@ namespace tensorflow {
// transfers ownership
Session* createSession(SessionOptions& sessionOptions);

// return a new, empty session with nThreads
// return a new, empty session with nThreads and selected backend
// transfers ownership
Session* createSession(int nThreads = 1);
Session* createSession(Backend backend = Backend::cpu, int nThreads = 1);

// return a new session that will contain an already loaded meta graph whose exportDir must be
// given in order to load and initialize the variables, sessionOptions are predefined
Expand All @@ -83,7 +91,10 @@ namespace tensorflow {
// in order to load and initialize the variables, threading options are inferred from nThreads
// an error is thrown when metaGraphDef is a nullptr or when the graph has no nodes
// transfers ownership
Session* createSession(const MetaGraphDef* metaGraphDef, const std::string& exportDir, int nThreads = 1);
Session* createSession(const MetaGraphDef* metaGraphDef,
const std::string& exportDir,
Backend backend = Backend::cpu,
int nThreads = 1);

// return a new session that will contain an already loaded graph def, sessionOptions are predefined
// an error is thrown when graphDef is a nullptr or when the graph has no nodes
Expand All @@ -94,7 +105,7 @@ namespace tensorflow {
// inferred from nThreads
// an error is thrown when graphDef is a nullptr or when the graph has no nodes
// transfers ownership
Session* createSession(const GraphDef* graphDef, int nThreads = 1);
Session* createSession(const GraphDef* graphDef, Backend backend = Backend::cpu, int nThreads = 1);

// closes a session, calls its destructor, resets the pointer, and returns true on success
bool closeSession(Session*& session);
Expand Down
2 changes: 1 addition & 1 deletion PhysicsTools/TensorFlow/plugins/TfGraphDefProducer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ TfGraphDefProducer::TfGraphDefProducer(const edm::ParameterSet& iConfig)
// ------------ method called to produce the data ------------
TfGraphDefProducer::ReturnType TfGraphDefProducer::produce(const TfGraphRecord& iRecord) {
auto* graph = tensorflow::loadGraphDef(filename_);
return std::make_unique<TfGraphDefWrapper>(tensorflow::createSession(graph, 1), graph);
return std::make_unique<TfGraphDefWrapper>(tensorflow::createSession(graph), graph);
}

void TfGraphDefProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
Expand Down
78 changes: 71 additions & 7 deletions PhysicsTools/TensorFlow/src/TensorFlow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
*/

#include "PhysicsTools/TensorFlow/interface/TensorFlow.h"

#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "FWCore/Utilities/interface/ResourceInformation.h"

namespace tensorflow {

Expand All @@ -25,6 +26,65 @@ namespace tensorflow {
setThreading(sessionOptions, nThreads);
}

void setBackend(SessionOptions& sessionOptions, Backend backend) {
  /*
   * The TensorFlow backend configures the available devices using options provided in the sessionOptions proto.
   * Options from https://github.com/tensorflow/tensorflow/blob/c53dab9fbc9de4ea8b1df59041a5ffd3987328c3/tensorflow/core/protobuf/config.proto
   *
   * If device_count["GPU"] = 0, GPUs are not used.
   * The visible_device_list configuration is used to map the `visible` devices (from CUDA_VISIBLE_DEVICES) to `virtual` devices.
   * If Backend::cpu is requested, the GPU device is disallowed by the device_count configuration.
   * If Backend::cuda is requested:
   *   - if ResourceInformation shows an available Nvidia GPU device:
   *     the device is used with memory_growth configuration (not allocating all cuda memory at once).
   *   - if no device is present: an exception is raised.
   * If Backend::best is requested, the Nvidia GPU is used when available, otherwise the CPU.
   */

  // Enable a single virtual GPU (the first device of the CUDA_VISIBLE_DEVICES list)
  // with on-demand memory growth, so TF does not allocate all the CUDA memory at once.
  const auto useSingleGpu = [&sessionOptions]() {
    (*sessionOptions.config.mutable_device_count())["GPU"] = 1;
    sessionOptions.config.mutable_gpu_options()->set_visible_device_list("0");
    sessionOptions.config.mutable_gpu_options()->set_allow_growth(true);
  };
  // Disallow any GPU usage; the session runs on CPU only.
  const auto useCpuOnly = [&sessionOptions]() {
    (*sessionOptions.config.mutable_device_count())["GPU"] = 0;
    sessionOptions.config.mutable_gpu_options()->set_visible_device_list("");
  };

  edm::Service<edm::ResourceInformation> ri;
  if (backend == Backend::cpu) {
    useCpuOnly();
  }
  // NVidia GPU
  else if (backend == Backend::cuda) {
    if (not ri->nvidiaDriverVersion().empty()) {
      useSingleGpu();
    } else {
      edm::Exception ex(edm::errors::UnavailableAccelerator);
      ex << "Cuda backend requested, but no NVIDIA GPU available in the job";
      ex.addContext("Calling tensorflow::setBackend()");
      throw ex;
    }
  }
  // ROCm and Intel GPU are still not supported
  else if ((backend == Backend::rocm) || (backend == Backend::intel)) {
    edm::Exception ex(edm::errors::UnavailableAccelerator);
    ex << "ROCm/Intel GPU backend requested, but TF is not compiled yet for this platform";
    ex.addContext("Calling tensorflow::setBackend()");
    throw ex;
  }
  // Get NVidia GPU if possible or fall back to CPU
  else if (backend == Backend::best) {
    if (not ri->nvidiaDriverVersion().empty()) {
      useSingleGpu();
    } else {
      useCpuOnly();
    }
  }
}

MetaGraphDef* loadMetaGraphDef(const std::string& exportDir, const std::string& tag, SessionOptions& sessionOptions) {
// objects to load the graph
Status status;
Expand All @@ -49,19 +109,20 @@ namespace tensorflow {
return loadMetaGraphDef(exportDir, tag, sessionOptions);
}

MetaGraphDef* loadMetaGraphDef(const std::string& exportDir, const std::string& tag, int nThreads) {
MetaGraphDef* loadMetaGraphDef(const std::string& exportDir, const std::string& tag, Backend backend, int nThreads) {
  // Convenience overload: build SessionOptions carrying the requested threading
  // and backend configuration, then defer to the SessionOptions-based variant.
  SessionOptions opts;
  setThreading(opts, nThreads);
  setBackend(opts, backend);
  return loadMetaGraphDef(exportDir, tag, opts);
}

MetaGraphDef* loadMetaGraph(const std::string& exportDir, const std::string& tag, int nThreads) {
MetaGraphDef* loadMetaGraph(const std::string& exportDir, const std::string& tag, Backend backend, int nThreads) {
edm::LogInfo("PhysicsTools/TensorFlow")
<< "tensorflow::loadMetaGraph() is deprecated, use tensorflow::loadMetaGraphDef() instead";

return loadMetaGraphDef(exportDir, tag, nThreads);
return loadMetaGraphDef(exportDir, tag, backend, nThreads);
}

GraphDef* loadGraphDef(const std::string& pbFile) {
Expand Down Expand Up @@ -95,10 +156,11 @@ namespace tensorflow {
return session;
}

Session* createSession(int nThreads) {
Session* createSession(Backend backend, int nThreads) {
  // Build fresh SessionOptions with the requested thread-pool and backend
  // settings and hand them to the SessionOptions-based overload.
  SessionOptions opts;
  setThreading(opts, nThreads);
  setBackend(opts, backend);
  return createSession(opts);
}
Expand Down Expand Up @@ -152,10 +214,11 @@ namespace tensorflow {
return session;
}

Session* createSession(const MetaGraphDef* metaGraphDef, const std::string& exportDir, int nThreads) {
Session* createSession(const MetaGraphDef* metaGraphDef, const std::string& exportDir, Backend backend, int nThreads) {
  // Assemble SessionOptions for the requested threading/backend configuration,
  // then delegate meta-graph loading to the SessionOptions-based overload.
  SessionOptions opts;
  setThreading(opts, nThreads);
  setBackend(opts, backend);
  return createSession(metaGraphDef, exportDir, opts);
}
Expand Down Expand Up @@ -186,10 +249,11 @@ namespace tensorflow {
return session;
}

Session* createSession(const GraphDef* graphDef, int nThreads) {
Session* createSession(const GraphDef* graphDef, Backend backend, int nThreads) {
  // Build SessionOptions with the requested threading/backend setup and
  // forward to the SessionOptions-based overload.
  SessionOptions opts;
  setThreading(opts, nThreads);
  setBackend(opts, backend);
  return createSession(graphDef, opts);
}
Expand Down
128 changes: 128 additions & 0 deletions PhysicsTools/TensorFlow/test/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,38 +2,166 @@
<use name="tensorflow-cc"/>
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>

<iftool name="cuda">
<bin name="testTFHelloWorldCUDA" file="testRunner.cpp,testHelloWorldCUDA.cc">
<use name="tensorflow-cc"/>
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>


<bin name="testTFMetaGraphLoading" file="testRunner.cpp,testMetaGraphLoading.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>

<iftool name="cuda">
<bin name="testTFMetaGraphLoadingCUDA" file="testRunner.cpp,testMetaGraphLoadingCUDA.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>

<bin name="testTFGraphLoading" file="testRunner.cpp,testGraphLoading.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>

<iftool name="cuda">
<bin name="testTFGraphLoadingCUDA" file="testRunner.cpp,testGraphLoadingCUDA.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>

<bin name="testTFConstSession" file="testRunner.cpp,testConstSession.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>

<iftool name="cuda">
<bin name="testTFConstSessionCUDA" file="testRunner.cpp,testConstSessionCUDA.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>

<bin name="testTFSessionCache" file="testRunner.cpp,testSessionCache.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>

<iftool name="cuda">
<bin name="testTFSessionCacheCUDA" file="testRunner.cpp,testSessionCacheCUDA.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>

<bin name="testTFThreadPools" file="testRunner.cpp,testThreadPools.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>

<iftool name="cuda">
<bin name="testTFThreadPoolsCUDA" file="testRunner.cpp,testThreadPoolsCUDA.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>

<bin name="testTFVisibleDevices" file="testRunner.cpp,testVisibleDevices.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>


<iftool name="cuda">
<bin name="testTFVisibleDevicesCUDA" file="testRunner.cpp,testVisibleDevicesCUDA.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>

<!-- <ifarchitecture name="!_ppc64le_">
<bin name="testTFAOT" file="testRunner.cpp,testAOT.cc">
<flags DNN_NAME="testAOT_add" />
Expand Down
Loading

0 comments on commit 39853ed

Please sign in to comment.