Skip to content

Commit

Permalink
[NVIDIA] Support ov::internal::exclusive_async_requests property (#691)
Browse files Browse the repository at this point in the history
  • Loading branch information
nkogteva authored Jul 26, 2023
1 parent 8e60111 commit 34f5c54
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 7 deletions.
15 changes: 10 additions & 5 deletions modules/nvidia_plugin/src/cuda_compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,12 @@
#include "transformer/cuda_graph_transformer.hpp"

#include "openvino/runtime/exec_model_info.hpp"
#include "openvino/runtime/internal_properties.hpp"
#include "openvino/runtime/iplugin.hpp"
#include "openvino/pass/serialize.hpp"

namespace {
// Keys used to register and look up executors in the plugin-wide ExecutorManager
// cache (see CompiledModel::init_executor / ~CompiledModel).
// NOTE: the old nv_task_executor_name was removed — its value duplicated the
// callback executor's name and it is no longer referenced after the
// stream/callback executor split.
static constexpr const char* nv_stream_executor_name = "NvidiaStreamExecutor";
// Single shared executor used when ov::internal::exclusive_async_requests is enabled,
// so all infer requests across models serialize on one executor.
static constexpr const char* nv_exclusive_executor = "NvidiaExecutor";
static constexpr const char* nv_callback_executor_name = "NvidiaCallbackExecutor";
} // namespace

Expand Down Expand Up @@ -73,17 +74,21 @@ void CompiledModel::init_executor() {
// real hardware cores and NUMA nodes.
config_.streams_executor_config_.set_property({ ov::num_streams(ov::streams::Num(memory_pool_->Size())) });
auto streams_executor_config = ov::threading::IStreamsExecutor::Config::make_default_multi_threaded(config_.streams_executor_config_);
streams_executor_config._name = nv_task_executor_name;
streams_executor_config._name = nv_stream_executor_name;
// As OpenVINO CPU Streams Executor creates some additional threads
// it is better to avoid thread recreation, as some OS memory allocators cannot handle such usage patterns
// and memory consumption can be larger than expected.
// So OpenVINO provides executors cache.
set_task_executor(get_plugin()->get_executor_manager()->get_idle_cpu_streams_executor(streams_executor_config));
if (config_.is_exclusive_async_requests()) {
set_task_executor(get_plugin()->get_executor_manager()->get_executor(nv_exclusive_executor));
} else {
set_task_executor(get_plugin()->get_executor_manager()->get_idle_cpu_streams_executor(streams_executor_config));
}
set_callback_executor(get_plugin()->get_executor_manager()->get_idle_cpu_streams_executor({nv_callback_executor_name}));
}

CompiledModel::~CompiledModel() {
    // Drop this model's executors from the ExecutorManager cache so their worker
    // threads can be released once no other holder remains. The stale pre-change
    // clear(nv_task_executor_name) line was removed — that name no longer exists.
    // NOTE(review): nv_exclusive_executor is intentionally not cleared here, since
    // it is shared across compiled models — confirm this matches the manager's
    // lifetime expectations.
    get_plugin()->get_executor_manager()->clear(nv_stream_executor_name);
    get_plugin()->get_executor_manager()->clear(nv_callback_executor_name);
}

Expand Down
15 changes: 13 additions & 2 deletions modules/nvidia_plugin/src/cuda_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ std::vector<ov::PropertyName> Configuration::get_supported_properties() {
std::vector<ov::PropertyName> Configuration::get_supported_internal_properties() {
    // Internal (non-public) properties this plugin reports to OpenVINO core.
    // Built once on first call and reused; the leftover pre-change line ending the
    // initializer at config_device_id was removed — it made the braced init invalid
    // when kept beside its replacement.
    static const std::vector<ov::PropertyName> supported_internal_properties = {
        ov::PropertyName{ov::internal::caching_properties.name(), ov::PropertyMutability::RO},
        ov::PropertyName{ov::internal::config_device_id.name(), ov::PropertyMutability::WO},
        ov::PropertyName{ov::internal::exclusive_async_requests.name(), ov::PropertyMutability::RW}};
    return supported_internal_properties;
}

Expand Down Expand Up @@ -111,6 +112,8 @@ ov::element::Type Configuration::get_inference_precision() const noexcept {
}

bool Configuration::auto_streams_detection_required() const noexcept {
    // Exclusive async requests route everything through a single shared executor,
    // so automatic stream-count detection never applies in that mode.
    if (exclusive_async_requests) {
        return false;
    }
    // THROUGHPUT mode with no explicit (positive) stream count also triggers
    // auto-detection, as does an explicit ov::streams::AUTO request.
    const bool throughput_wants_auto =
        (performance_mode == ov::hint::PerformanceMode::THROUGHPUT) && (num_streams <= 0);
    return throughput_wants_auto || (num_streams == ov::streams::AUTO);
}
Expand All @@ -124,7 +127,7 @@ uint32_t Configuration::get_optimal_number_of_streams() const noexcept {
optimal_number_of_streams = (hint_num_requests > 0) ?
std::min(hint_num_requests, reasonable_limit_of_streams)
: reasonable_limit_of_streams;
} else if (num_streams > 0) {
} else if (num_streams > 0 && !exclusive_async_requests) {
optimal_number_of_streams = num_streams;
}
return optimal_number_of_streams;
Expand All @@ -137,6 +140,10 @@ bool Configuration::is_stream_executor_property(const std::string& name) const {
std::find(std::begin(stream_executor_properties), std::end(stream_executor_properties), name));
}

// Reports whether ov::internal::exclusive_async_requests was enabled for this configuration.
bool Configuration::is_exclusive_async_requests() const noexcept { return exclusive_async_requests; }

Configuration::Configuration(const ov::AnyMap& config, const Configuration& defaultCfg, bool throwOnUnsupported) {
*this = defaultCfg;
// Update device id first
Expand Down Expand Up @@ -186,6 +193,8 @@ Configuration::Configuration(const ov::AnyMap& config, const Configuration& defa
performance_mode = value.as<ov::hint::PerformanceMode>();
} else if (ov::hint::execution_mode == key) {
execution_mode = value.as<ov::hint::ExecutionMode>();
} else if (ov::internal::exclusive_async_requests == key) {
exclusive_async_requests = value.as<bool>();
} else if (throwOnUnsupported) {
throw_ov_exception(key);
}
Expand Down Expand Up @@ -219,6 +228,8 @@ ov::Any Configuration::get(const std::string& name) const {
return performance_mode;
} else if (name == ov::hint::execution_mode) {
return execution_mode;
} else if (name == ov::internal::exclusive_async_requests) {
return exclusive_async_requests;
} else {
OPENVINO_THROW("Property was not found: ", name);
}
Expand Down
2 changes: 2 additions & 0 deletions modules/nvidia_plugin/src/cuda_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ struct Configuration {
ov::element::Type get_inference_precision() const noexcept;
uint32_t get_optimal_number_of_streams() const noexcept;
bool auto_streams_detection_required() const noexcept;
bool is_exclusive_async_requests() const noexcept;

// Plugin configuration parameters
static constexpr uint32_t reasonable_limit_of_streams = 10;
Expand All @@ -52,6 +53,7 @@ struct Configuration {
bool is_profiling_enabled = false;
bool operation_benchmark = false;
bool use_cuda_graph = true;
bool exclusive_async_requests = false;
uint32_t hint_num_requests = 0;
ov::streams::Num num_streams = 0;
ov::hint::PerformanceMode performance_mode = ov::hint::PerformanceMode::LATENCY;
Expand Down
49 changes: 49 additions & 0 deletions modules/nvidia_plugin/tests/unit/executable_network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <memory>
#include <typeinfo>

#include "openvino/runtime/internal_properties.hpp"

#include "cuda_compiled_model.hpp"
#include "cuda_operation_registry.hpp"
#include "cuda_plugin.hpp"
Expand Down Expand Up @@ -133,6 +135,8 @@ TEST_P(NumStreams1ExecNetworkTest, LoadExecNetwork_OptimalNumberInferRequests_1_
auto cuda_compiled_model = std::dynamic_pointer_cast<CompiledModel>(compiled_model);
auto& memoryManagerPool = GetMemoryManagerPool(cuda_compiled_model);
ASSERT_EQ(memoryManagerPool->Size(), total_streams);
ASSERT_EQ(cuda_compiled_model->get_property(ov::num_streams.name()), ov::streams::Num(total_streams));
ASSERT_EQ(cuda_compiled_model->get_property(ov::optimal_number_of_infer_requests.name()), uint32_t(total_streams));
}

INSTANTIATE_TEST_SUITE_P(ExecNetworkTest,
Expand Down Expand Up @@ -170,13 +174,58 @@ TEST_P(NumStreams8ExecNetworkTest, LoadExecNetwork_OptimalNumberInferRequests_8_
auto cuda_compiled_model = std::dynamic_pointer_cast<CompiledModel>(compiled_model);
auto& memoryManagerPool = GetMemoryManagerPool(cuda_compiled_model);
ASSERT_EQ(memoryManagerPool->Size(), total_streams);
ASSERT_EQ(cuda_compiled_model->get_property(ov::num_streams.name()), ov::streams::Num(total_streams));
ASSERT_EQ(cuda_compiled_model->get_property(ov::optimal_number_of_infer_requests.name()), uint32_t(total_streams));
}


// Instantiate the 8-stream test for each property-map variant in num_streams_8_properties.
INSTANTIATE_TEST_SUITE_P(ExecNetworkTest,
NumStreams8ExecNetworkTest,
::testing::ValuesIn(num_streams_8_properties),
ExecNetworkTest::getTestCaseName);

// Property sets that request 8 streams together with exclusive async requests
// enabled, covering both the legacy (CONFIG_KEY) and the ov:: 2.0 spellings.
// The exclusive-mode test below expects these to collapse to a single stream.
std::vector<PropertiesParams> num_streams_8_properties_exclusive = {
{
// Legacy API spelling of the same configuration.
{CONFIG_KEY(DEVICE_ID), "0"},
{NVIDIA_CONFIG_KEY(THROUGHPUT_STREAMS), "8"},
{CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), CONFIG_VALUE(YES)},
},
{
// ov:: 2.0 properties, no performance hint.
{ov::device::id.name(), "0"},
{ov::num_streams.name(), "8"},
{ov::internal::exclusive_async_requests.name(), true},
},
{
// ov:: 2.0 properties with an explicit THROUGHPUT hint.
{ov::device::id.name(), "0"},
{ov::hint::performance_mode.name(), ov::util::to_string(ov::hint::PerformanceMode::THROUGHPUT)},
{ov::num_streams.name(), "8"},
{ov::internal::exclusive_async_requests.name(), true},
},
{
// ov:: 2.0 properties with an explicit LATENCY hint.
{ov::device::id.name(), "0"},
{ov::hint::performance_mode.name(), ov::util::to_string(ov::hint::PerformanceMode::LATENCY)},
{ov::num_streams.name(), "8"},
{ov::internal::exclusive_async_requests.name(), true},
},
};

using NumStreams8ExclusiveExecNetworkTest = ExecNetworkTest;
// Even though 8 streams are requested in the parameters, enabling
// exclusive_async_requests must force the compiled model down to a single
// stream and a single optimal infer request.
// (Removed the unused `using namespace std::chrono_literals;` copied from a
// sibling test — nothing here uses chrono literals.)
TEST_P(NumStreams8ExclusiveExecNetworkTest, LoadExecNetwork_OptimalNumberInferRequests_8_Success) {
    auto plugin = std::make_shared<Plugin>();
    constexpr auto total_streams = 1;
    auto compiled_model = plugin->compile_model(function_, properties);
    auto cuda_compiled_model = std::dynamic_pointer_cast<CompiledModel>(compiled_model);
    auto& memoryManagerPool = GetMemoryManagerPool(cuda_compiled_model);
    ASSERT_EQ(memoryManagerPool->Size(), total_streams);
    ASSERT_EQ(cuda_compiled_model->get_property(ov::num_streams.name()), ov::streams::Num(total_streams));
    ASSERT_EQ(cuda_compiled_model->get_property(ov::optimal_number_of_infer_requests.name()), uint32_t(total_streams));
}

// Instantiate the exclusive-mode test for each property-map variant above.
INSTANTIATE_TEST_SUITE_P(ExecNetworkTest,
NumStreams8ExclusiveExecNetworkTest,
::testing::ValuesIn(num_streams_8_properties_exclusive),
ExecNetworkTest::getTestCaseName);

std::vector<PropertiesParams> num_streams_auto_properties = {
{
Expand Down

0 comments on commit 34f5c54

Please sign in to comment.