Ovep develop lnl 1.2 (#22424)
### Description
- Support OpenVINO 2024.4
- Refactor the tensor-initialization check for external weights
- Support loading an OpenVINO config
- Fix tensor caching and accuracy issues in OVEP
- Refactor the device-memory implementation to make it more generic

### Motivation and Context
These changes are required to fix accuracy issues, support loading an OpenVINO config, and support OpenVINO 2024.4.
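
The OpenVINO config referred to above is a per-device property map (`std::map<std::string, ov::AnyMap>`, keyed by device name) that the backend applies via `set_property`, as the `basic_backend.cc` changes below show. A minimal sketch of such a map; the device names and property keys are illustrative standard OpenVINO properties, not requirements of this change:

```cpp
#include <map>
#include <string>

#include <openvino/openvino.hpp>

// Illustrative only: per-device OpenVINO properties keyed by device name, in the
// std::map<std::string, ov::AnyMap> shape used by the new load_config member.
// Keys that are unsupported or immutable for a device are skipped with a warning.
const std::map<std::string, ov::AnyMap> example_load_config = {
    {"CPU", {{"PERFORMANCE_HINT", "LATENCY"}}},
    {"GPU", {{"PERFORMANCE_HINT", "THROUGHPUT"}, {"CACHE_DIR", "ov_cache"}}},
};
```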

---------

Co-authored-by: Eric Crawford <[email protected]>
Co-authored-by: saurabhkale17 <[email protected]>
Co-authored-by: Javier E. Martinez <[email protected]>
Co-authored-by: sfatimar <[email protected]>
Co-authored-by: ankitm3k <[email protected]>
Co-authored-by: Preetha Veeramalai <[email protected]>
Co-authored-by: n1harika <[email protected]>
Co-authored-by: jatinwadhwa921 <[email protected]>
9 people authored Oct 14, 2024
1 parent 9b1b4e5 commit 35adba2
Showing 23 changed files with 379 additions and 281 deletions.
1 change: 1 addition & 0 deletions cmake/CMakeLists.txt
@@ -1352,6 +1352,7 @@ if (onnxruntime_USE_OPENVINO)
add_definitions(-DUSE_OPENVINO=1)

if(onnxruntime_NPU_NO_FALLBACK)
add_definitions(-DOPENVINO_CONFIG_NPU=1)
add_definitions(-DOPENVINO_DISABLE_NPU_FALLBACK=1)
endif()

2 changes: 1 addition & 1 deletion cmake/onnxruntime_providers_openvino.cmake
@@ -37,7 +37,7 @@

source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_openvino_cc_srcs})
onnxruntime_add_shared_library_module(onnxruntime_providers_openvino ${onnxruntime_providers_openvino_cc_srcs} "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc")
onnxruntime_add_include_to_target(onnxruntime_providers_openvino onnxruntime_common onnx)
onnxruntime_add_include_to_target(onnxruntime_providers_openvino onnxruntime_common onnx nlohmann_json::nlohmann_json)
install(FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/openvino/openvino_provider_factory.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/)
set_target_properties(onnxruntime_providers_openvino PROPERTIES CXX_STANDARD 20)
2 changes: 1 addition & 1 deletion include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -645,7 +645,7 @@ typedef struct OrtOpenVINOProviderOptions {
* Valid settings are one of: "CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU_FP16"
*/
const char* device_type;
unsigned char enable_npu_fast_compile; ///< 0 = disabled, nonzero = enabled
unsigned char enable_npu_fast_compile;
const char* device_id;
size_t num_of_threads; ///< 0 = Use default number of threads
const char* cache_dir; // path is set to empty by default
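For orientation, a minimal sketch of filling this legacy provider-options struct from application code; the field values are illustrative and follow the comments in the header above:

```cpp
#include "onnxruntime_c_api.h"

// Illustrative only: populating the legacy OrtOpenVINOProviderOptions struct shown above.
// enable_npu_fast_compile is left at its zero default; this change drops its internal use.
OrtOpenVINOProviderOptions MakeOpenVINOOptions() {
  OrtOpenVINOProviderOptions options{};
  options.device_type = "GPU_FP16";  // one of the documented device_type strings
  options.num_of_threads = 0;        // 0 = use the default number of threads
  options.cache_dir = "";            // empty by default, see the comment above
  return options;
}
```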
82 changes: 76 additions & 6 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -83,7 +83,8 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
subgraph_context_.subgraph_name);
ie_cnn_network_ = exe_network_.Get().get_runtime_model();
} else if (global_context_.export_ep_ctx_blob &&
hw_target.find("NPU") != std::string::npos) {
hw_target.find("NPU") != std::string::npos &&
!global_context_.has_external_weights) {
std::shared_ptr<ov::Model> ov_model;
{
const std::string model = model_proto->SerializeAsString();
@@ -93,7 +94,8 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
ov_model = global_context_.ie_core.Get().read_model(model, ov::Tensor());
}
exe_network_ = OVExeNetwork(global_context_.ie_core.Get().compile_model(ov_model, hw_target, device_config));
} else if ((!subgraph_context_.has_dynamic_input_shape) &&
} else if (!global_context_.has_external_weights &&
(!subgraph_context_.has_dynamic_input_shape) &&
((hw_target.find("AUTO") == std::string::npos) ||
(global_context_.OpenVINO_Version.at(0) >= 2024 && global_context_.OpenVINO_Version.at(1) > 2))) {
// Optimized OV compile_model API is supported with AUTO from version 2024.3 and above
@@ -178,6 +180,74 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
}
#endif
}

if (!global_context_.load_config.empty()) {
const std::map<std::string, ov::AnyMap>& target_config = global_context_.load_config;

// Parse device types like "AUTO:CPU,GPU" and extract individual devices
auto parse_individual_devices = [&](const std::string& device_type) -> std::vector<std::string> {
std::vector<std::string> devices;
auto delimiter_pos = device_type.find(':');
if (delimiter_pos != std::string::npos) {
std::stringstream str_stream(device_type.substr(delimiter_pos + 1));
std::string device;
while (std::getline(str_stream, device, ',')) {
devices.emplace_back(device);
}
} else {
devices.emplace_back(device_type);
}
return devices;
};

// Check if a property is supported and mutable
auto is_supported_and_mutable = [&](const std::string& key,
const std::vector<ov::PropertyName>& supported_config) -> bool {
auto it = std::find_if(supported_config.begin(), supported_config.end(), [&](const ov::PropertyName& property) {
return property == key && property.is_mutable();
});
return it != supported_config.end();
};

// Set properties that are valid; log a warning and skip any property that is unsupported or immutable for the target device
auto set_target_properties = [&](const std::string& device, const ov::AnyMap& config_options,
const std::vector<ov::PropertyName>& supported_properties) {
for (const auto& [key, value] : config_options) {
if (is_supported_and_mutable(key, supported_properties)) {
global_context_.ie_core.Get().set_property(device, ov::AnyMap{{key, value}});
} else {
LOGS_DEFAULT(WARNING) << "WARNING: Property \"" << key
<< "\" is either unsupported in current OpenVINO version"
<< " or property is immutable for target device \""
<< device << "\". Skipping setting this property.";
}
}
};

// Check if the device type is AUTO, HETERO, or MULTI
if (global_context_.device_type.find("AUTO") == 0 ||
global_context_.device_type.find("HETERO") == 0 ||
global_context_.device_type.find("MULTI") == 0) {
// Parse individual devices (e.g., "AUTO:CPU,GPU" -> ["CPU", "GPU"])
auto individual_devices = parse_individual_devices(global_context_.device_type);
// Set properties only for individual devices (e.g., "CPU", "GPU")
for (const std::string& device : individual_devices) {
if (target_config.count(device)) {
// Get supported properties for each individual device
auto device_properties = global_context_.ie_core.Get().get_property(device, ov::supported_properties);
// Set properties for the device
set_target_properties(device, target_config.at(device), device_properties);
}
}
} else {
if (target_config.count(global_context_.device_type)) {
auto supported_properties = global_context_.ie_core.Get().get_property(global_context_.device_type,
ov::supported_properties);
set_target_properties(global_context_.device_type,
target_config.at(global_context_.device_type), supported_properties);
}
}
}
}

void BasicBackend::EnableCaching(ov::AnyMap& device_config) {
@@ -275,7 +345,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
input_tensor_shape[tensor_iter] = *i;
tensor_iter += 1;
}
auto input = graph_input_info.at(input_idx);
const auto& input = graph_input_info.at(input_idx);
OVTensorPtr tensor_ptr;
// avoid input copies on the CPU device
if (global_context_.device_type.find("CPU") != std::string::npos) {
@@ -316,7 +386,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
ort_ov_tensor_map[ort_tensor_key] = ov_tensor_data;

try {
infer_request->SetTensor(input_name, ov_tensor_data.tensor_ptr);
infer_request->SetTensor(std::move(input_name), ov_tensor_data.tensor_ptr);
} catch (const char* msg) {
ORT_THROW(msg);
}
@@ -354,14 +424,14 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
if ((it == ort_ov_tensor_map.end()) ||
(it != ort_ov_tensor_map.end() && (it->second.ort_ptr != tensor.GetTensorRawData()))) {
ov_tensor_data_t ov_tensor_data;
auto output = graph_output_info.at(output_idx);
const auto& output = graph_output_info.at(output_idx);
ov_tensor_data.ort_ptr = tensor.GetTensorRawData();
ov_tensor_data.tensor_ptr = std::make_shared<ov::Tensor>(output.get_element_type(), output.get_shape(),
const_cast<void*>(tensor.GetTensorRawData()));
ort_ov_tensor_map[ort_tensor_key] = ov_tensor_data;

try {
infer_request->SetTensor(output_name, ov_tensor_data.tensor_ptr);
infer_request->SetTensor(std::move(output_name), ov_tensor_data.tensor_ptr);
} catch (const char* msg) {
ORT_THROW(msg);
}
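To make the new load-config handling above easier to follow, here is a small standalone sketch of the same device-string parsing, showing how a composite target such as "AUTO:CPU,GPU" is split into the individual devices whose properties are then applied:

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Mirrors the parse_individual_devices lambda above: "AUTO:CPU,GPU" -> {"CPU", "GPU"},
// while a plain device string such as "NPU" is returned unchanged.
std::vector<std::string> ParseIndividualDevices(const std::string& device_type) {
  std::vector<std::string> devices;
  auto delimiter_pos = device_type.find(':');
  if (delimiter_pos != std::string::npos) {
    std::stringstream str_stream(device_type.substr(delimiter_pos + 1));
    std::string device;
    while (std::getline(str_stream, device, ',')) {
      devices.emplace_back(device);
    }
  } else {
    devices.emplace_back(device_type);
  }
  return devices;
}

int main() {
  for (const auto& device : ParseIndividualDevices("AUTO:CPU,GPU")) {
    std::cout << device << "\n";  // prints "CPU" then "GPU"
  }
}
```

Note that only the devices listed after the ':' receive properties; the composite AUTO/HETERO/MULTI target itself is not configured directly.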
4 changes: 3 additions & 1 deletion onnxruntime/core/providers/openvino/contexts.h
@@ -4,6 +4,7 @@
#pragma once

#include <vector>
#include <map>
#include <unordered_map>
#include <string>
#include "core/providers/openvino/ov_interface.h"
@@ -15,18 +16,19 @@ namespace openvino_ep {
struct GlobalContext {
OVCore ie_core;
bool is_wholly_supported_graph = false;
bool enable_npu_fast_compile = false;
bool enable_opencl_throttling = false;
bool disable_dynamic_shapes = false;
bool ep_context_embed_mode = true;
bool export_ep_ctx_blob = false;
bool enable_qdq_optimizer = false;
bool disable_cpu_fallback = false;
bool has_external_weights = false;
size_t num_of_threads;
std::string device_type;
std::string precision_str;
std::string model_precision;
std::string cache_dir;
std::map<std::string, ov::AnyMap> load_config;
std::string model_priority = "DEFAULT";
int num_streams;
std::vector<bool> deviceAvailableList = {true, true, true, true, true, true, true, true};
onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -25,8 +25,8 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv
global_context_ = std::make_unique<openvino_ep::GlobalContext>();
global_context_->device_type = info.device_type_;
global_context_->precision_str = info.precision_;
global_context_->enable_npu_fast_compile = info.enable_npu_fast_compile_;
global_context_->cache_dir = info.cache_dir_;
global_context_->load_config = info.load_config_;
global_context_->model_priority = info.model_priority_;
global_context_->num_streams = info.num_streams_;
global_context_->context = info.context_;
@@ -124,6 +124,7 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer,
result = obj.Execute();

global_context_->is_wholly_supported_graph = obj.IsWhollySupportedGraph();
global_context_->has_external_weights = obj.HasExternalWeights();

return result;
}
onnxruntime/core/providers/openvino/openvino_execution_provider.h
@@ -79,8 +79,8 @@ static std::vector<std::string> parseDevices(const std::string& device_string,
struct OpenVINOExecutionProviderInfo {
std::string device_type_{""};
std::string precision_{""};
bool enable_npu_fast_compile_{false};
size_t num_of_threads_{0};
std::map<std::string, ov::AnyMap> load_config_{};
std::string cache_dir_{""};
std::string model_priority_{""};
int num_streams_{1};
@@ -94,16 +94,18 @@

OpenVINOExecutionProviderInfo() = delete;

explicit OpenVINOExecutionProviderInfo(const std::string& dev_type, const std::string& precision,
bool enable_npu_fast_compile, size_t num_of_threads,
const std::string& cache_dir, const std::string& model_priority,
int num_streams, void* context, bool enable_opencl_throttling,
explicit OpenVINOExecutionProviderInfo(std::string dev_type, const std::string& precision,
size_t num_of_threads,
const std::map<std::string, ov::AnyMap>& load_config,
const std::string& cache_dir,
const std::string& model_priority, int num_streams,
void* context, bool enable_opencl_throttling,
bool disable_dynamic_shapes, bool export_ep_ctx_blob,
bool enable_qdq_optimizer, bool disable_cpu_fallback,
bool so_epctx_embed_mode)
: precision_(std::move(precision)),
enable_npu_fast_compile_(enable_npu_fast_compile),
num_of_threads_(num_of_threads),
load_config_(std::move(load_config)),
cache_dir_(std::move(cache_dir)),
model_priority_(std::move(model_priority)),
num_streams_(num_streams),
