Skip to content

Commit

Permalink
[NVIDIA] Support ov::internal::exclusive_async_requests property (#691)
Browse files Browse the repository at this point in the history
  • Loading branch information
nkogteva authored Jul 26, 2023
1 parent 8e60111 commit 34f5c54
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 7 deletions.
15 changes: 10 additions & 5 deletions modules/nvidia_plugin/src/cuda_compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,12 @@
#include "transformer/cuda_graph_transformer.hpp"

#include "openvino/runtime/exec_model_info.hpp"
#include "openvino/runtime/internal_properties.hpp"
#include "openvino/runtime/iplugin.hpp"
#include "openvino/pass/serialize.hpp"

namespace {
// Keys used to register and look up executors in the plugin-wide ExecutorManager
// cache (see CompiledModel::init_executor / ~CompiledModel).
// NOTE: the old nv_task_executor_name was removed — its value duplicated the
// callback executor's name and it is no longer referenced after the
// stream/callback executor split.
static constexpr const char* nv_stream_executor_name = "NvidiaStreamExecutor";
// Single shared executor used when ov::internal::exclusive_async_requests is enabled,
// so all infer requests across models serialize on one executor.
static constexpr const char* nv_exclusive_executor = "NvidiaExecutor";
static constexpr const char* nv_callback_executor_name = "NvidiaCallbackExecutor";
} // namespace

Expand Down Expand Up @@ -73,17 +74,21 @@ void CompiledModel::init_executor() {
// real hardware cores and NUMA nodes.
config_.streams_executor_config_.set_property({ ov::num_streams(ov::streams::Num(memory_pool_->Size())) });
auto streams_executor_config = ov::threading::IStreamsExecutor::Config::make_default_multi_threaded(config_.streams_executor_config_);
streams_executor_config._name = nv_task_executor_name;
streams_executor_config._name = nv_stream_executor_name;
// As OpenVINO CPU Streams Executor creates some additional threads
// it is better to avoid thread recreation, as some OS memory allocators cannot handle such usage patterns
// and memory consumption can be larger than expected.
// So OpenVINO provides executors cache.
set_task_executor(get_plugin()->get_executor_manager()->get_idle_cpu_streams_executor(streams_executor_config));
if (config_.is_exclusive_async_requests()) {
set_task_executor(get_plugin()->get_executor_manager()->get_executor(nv_exclusive_executor));
} else {
set_task_executor(get_plugin()->get_executor_manager()->get_idle_cpu_streams_executor(streams_executor_config));
}
set_callback_executor(get_plugin()->get_executor_manager()->get_idle_cpu_streams_executor({nv_callback_executor_name}));
}

CompiledModel::~CompiledModel() {
    // Drop this model's executors from the ExecutorManager cache so their worker
    // threads can be released once no other holder remains. The stale pre-change
    // clear(nv_task_executor_name) line was removed — that name no longer exists.
    // NOTE(review): nv_exclusive_executor is intentionally not cleared here, since
    // it is shared across compiled models — confirm this matches the manager's
    // lifetime expectations.
    get_plugin()->get_executor_manager()->clear(nv_stream_executor_name);
    get_plugin()->get_executor_manager()->clear(nv_callback_executor_name);
}

Expand Down
15 changes: 13 additions & 2 deletions modules/nvidia_plugin/src/cuda_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ std::vector<ov::PropertyName> Configuration::get_supported_properties() {
std::vector<ov::PropertyName> Configuration::get_supported_internal_properties() {
    // Internal (non-public) properties this plugin reports to OpenVINO core.
    // Built once on first call and reused; the leftover pre-change line ending the
    // initializer at config_device_id was removed — it made the braced init invalid
    // when kept beside its replacement.
    static const std::vector<ov::PropertyName> supported_internal_properties = {
        ov::PropertyName{ov::internal::caching_properties.name(), ov::PropertyMutability::RO},
        ov::PropertyName{ov::internal::config_device_id.name(), ov::PropertyMutability::WO},
        ov::PropertyName{ov::internal::exclusive_async_requests.name(), ov::PropertyMutability::RW}};
    return supported_internal_properties;
}

Expand Down Expand Up @@ -111,6 +112,8 @@ ov::element::Type Configuration::get_inference_precision() const noexcept {
}

bool Configuration::auto_streams_detection_required() const noexcept {
    // Exclusive async requests route everything through a single shared executor,
    // so automatic stream-count detection never applies in that mode.
    if (exclusive_async_requests) {
        return false;
    }
    // THROUGHPUT mode with no explicit (positive) stream count also triggers
    // auto-detection, as does an explicit ov::streams::AUTO request.
    const bool throughput_wants_auto =
        (performance_mode == ov::hint::PerformanceMode::THROUGHPUT) && (num_streams <= 0);
    return throughput_wants_auto || (num_streams == ov::streams::AUTO);
}
Expand All @@ -124,7 +127,7 @@ uint32_t Configuration::get_optimal_number_of_streams() const noexcept {
optimal_number_of_streams = (hint_num_requests > 0) ?
std::min(hint_num_requests, reasonable_limit_of_streams)
: reasonable_limit_of_streams;
} else if (num_streams > 0) {
} else if (num_streams > 0 && !exclusive_async_requests) {
optimal_number_of_streams = num_streams;
}
return optimal_number_of_streams;
Expand All @@ -137,6 +140,10 @@ bool Configuration::is_stream_executor_property(const std::string& name) const {
std::find(std::begin(stream_executor_properties), std::end(stream_executor_properties), name));
}

// Reports whether ov::internal::exclusive_async_requests was enabled for this configuration.
bool Configuration::is_exclusive_async_requests() const noexcept { return exclusive_async_requests; }

Configuration::Configuration(const ov::AnyMap& config, const Configuration& defaultCfg, bool throwOnUnsupported) {
*this = defaultCfg;
// Update device id first
Expand Down Expand Up @@ -186,6 +193,8 @@ Configuration::Configuration(const ov::AnyMap& config, const Configuration& defa
performance_mode = value.as<ov::hint::PerformanceMode>();
} else if (ov::hint::execution_mode == key) {
execution_mode = value.as<ov::hint::ExecutionMode>();
} else if (ov::internal::exclusive_async_requests == key) {
exclusive_async_requests = value.as<bool>();
} else if (throwOnUnsupported) {
throw_ov_exception(key);
}
Expand Down Expand Up @@ -219,6 +228,8 @@ ov::Any Configuration::get(const std::string& name) const {
return performance_mode;
} else if (name == ov::hint::execution_mode) {
return execution_mode;
} else if (name == ov::internal::exclusive_async_requests) {
return exclusive_async_requests;
} else {
OPENVINO_THROW("Property was not found: ", name);
}
Expand Down
2 changes: 2 additions & 0 deletions modules/nvidia_plugin/src/cuda_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ struct Configuration {
ov::element::Type get_inference_precision() const noexcept;
uint32_t get_optimal_number_of_streams() const noexcept;
bool auto_streams_detection_required() const noexcept;
bool is_exclusive_async_requests() const noexcept;

// Plugin configuration parameters
static constexpr uint32_t reasonable_limit_of_streams = 10;
Expand All @@ -52,6 +53,7 @@ struct Configuration {
bool is_profiling_enabled = false;
bool operation_benchmark = false;
bool use_cuda_graph = true;
bool exclusive_async_requests = false;
uint32_t hint_num_requests = 0;
ov::streams::Num num_streams = 0;
ov::hint::PerformanceMode performance_mode = ov::hint::PerformanceMode::LATENCY;
Expand Down
49 changes: 49 additions & 0 deletions modules/nvidia_plugin/tests/unit/executable_network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <memory>
#include <typeinfo>

#include "openvino/runtime/internal_properties.hpp"

#include "cuda_compiled_model.hpp"
#include "cuda_operation_registry.hpp"
#include "cuda_plugin.hpp"
Expand Down Expand Up @@ -133,6 +135,8 @@ TEST_P(NumStreams1ExecNetworkTest, LoadExecNetwork_OptimalNumberInferRequests_1_
auto cuda_compiled_model = std::dynamic_pointer_cast<CompiledModel>(compiled_model);
auto& memoryManagerPool = GetMemoryManagerPool(cuda_compiled_model);
ASSERT_EQ(memoryManagerPool->Size(), total_streams);
ASSERT_EQ(cuda_compiled_model->get_property(ov::num_streams.name()), ov::streams::Num(total_streams));
ASSERT_EQ(cuda_compiled_model->get_property(ov::optimal_number_of_infer_requests.name()), uint32_t(total_streams));
}

INSTANTIATE_TEST_SUITE_P(ExecNetworkTest,
Expand Down Expand Up @@ -170,13 +174,58 @@ TEST_P(NumStreams8ExecNetworkTest, LoadExecNetwork_OptimalNumberInferRequests_8_
auto cuda_compiled_model = std::dynamic_pointer_cast<CompiledModel>(compiled_model);
auto& memoryManagerPool = GetMemoryManagerPool(cuda_compiled_model);
ASSERT_EQ(memoryManagerPool->Size(), total_streams);
ASSERT_EQ(cuda_compiled_model->get_property(ov::num_streams.name()), ov::streams::Num(total_streams));
ASSERT_EQ(cuda_compiled_model->get_property(ov::optimal_number_of_infer_requests.name()), uint32_t(total_streams));
}


// Instantiate the 8-stream test for each property-map variant in num_streams_8_properties.
INSTANTIATE_TEST_SUITE_P(ExecNetworkTest,
NumStreams8ExecNetworkTest,
::testing::ValuesIn(num_streams_8_properties),
ExecNetworkTest::getTestCaseName);

// Property sets that request 8 streams together with exclusive async requests
// enabled, covering both the legacy (CONFIG_KEY) and the ov:: 2.0 spellings.
// The exclusive-mode test below expects these to collapse to a single stream.
std::vector<PropertiesParams> num_streams_8_properties_exclusive = {
{
// Legacy API spelling of the same configuration.
{CONFIG_KEY(DEVICE_ID), "0"},
{NVIDIA_CONFIG_KEY(THROUGHPUT_STREAMS), "8"},
{CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), CONFIG_VALUE(YES)},
},
{
// ov:: 2.0 properties, no performance hint.
{ov::device::id.name(), "0"},
{ov::num_streams.name(), "8"},
{ov::internal::exclusive_async_requests.name(), true},
},
{
// ov:: 2.0 properties with an explicit THROUGHPUT hint.
{ov::device::id.name(), "0"},
{ov::hint::performance_mode.name(), ov::util::to_string(ov::hint::PerformanceMode::THROUGHPUT)},
{ov::num_streams.name(), "8"},
{ov::internal::exclusive_async_requests.name(), true},
},
{
// ov:: 2.0 properties with an explicit LATENCY hint.
{ov::device::id.name(), "0"},
{ov::hint::performance_mode.name(), ov::util::to_string(ov::hint::PerformanceMode::LATENCY)},
{ov::num_streams.name(), "8"},
{ov::internal::exclusive_async_requests.name(), true},
},
};

using NumStreams8ExclusiveExecNetworkTest = ExecNetworkTest;
// Even though 8 streams are requested in the parameters, enabling
// exclusive_async_requests must force the compiled model down to a single
// stream and a single optimal infer request.
// (Removed the unused `using namespace std::chrono_literals;` copied from a
// sibling test — nothing here uses chrono literals.)
TEST_P(NumStreams8ExclusiveExecNetworkTest, LoadExecNetwork_OptimalNumberInferRequests_8_Success) {
    auto plugin = std::make_shared<Plugin>();
    constexpr auto total_streams = 1;
    auto compiled_model = plugin->compile_model(function_, properties);
    auto cuda_compiled_model = std::dynamic_pointer_cast<CompiledModel>(compiled_model);
    auto& memoryManagerPool = GetMemoryManagerPool(cuda_compiled_model);
    ASSERT_EQ(memoryManagerPool->Size(), total_streams);
    ASSERT_EQ(cuda_compiled_model->get_property(ov::num_streams.name()), ov::streams::Num(total_streams));
    ASSERT_EQ(cuda_compiled_model->get_property(ov::optimal_number_of_infer_requests.name()), uint32_t(total_streams));
}

// Instantiate the exclusive-mode test for each property-map variant above.
INSTANTIATE_TEST_SUITE_P(ExecNetworkTest,
NumStreams8ExclusiveExecNetworkTest,
::testing::ValuesIn(num_streams_8_properties_exclusive),
ExecNetworkTest::getTestCaseName);

std::vector<PropertiesParams> num_streams_auto_properties = {
{
Expand Down

0 comments on commit 34f5c54

Please sign in to comment.