microsoft · jslhcl · Jul 17, 2024 · Jul 23, 2024 · Jul 23, 2024 · Jul 26, 2024
diff --git a/include/onnxruntime/core/framework/ort_type_constraints.h b/include/onnxruntime/core/framework/ort_type_constraints.h
@@ -0,0 +1,15 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "core/session/onnxruntime_c_api.h"
+#include <unordered_map>
+#include <string>
+#include <set>
+
+struct OrtTypeConstraints {  // maps a type symbol to the set of tensor element types recorded for it
+    bool AddTypeConstraint(const char* type_symbol, ONNXTensorElementDataType type);  // returns true when the (symbol, type) pair is newly recorded
+    const std::unordered_map<std::string, std::set<ONNXTensorElementDataType>>& GetTypeConstraints() const { return type_constraints_; }  // read-only view of everything recorded so far
+private:
+    std::unordered_map<std::string, std::set<ONNXTensorElementDataType>> type_constraints_;
+};
diff --git a/include/onnxruntime/core/session/environment.h b/include/onnxruntime/core/session/environment.h
@@ -88,6 +88,10 @@
    */
   Status CreateAndRegisterAllocatorV2(const std::string& provider_type, const OrtMemoryInfo& mem_info, const std::unordered_map<std::string, std::string>& options, const OrtArenaCfg* arena_cfg = nullptr);
 
+  // NOTE(review): definition not shown here; presumably stores `ep_factory` in custom_ep_factories_ under `ep_name` and takes ownership — confirm.
+  void InsertCustomEp(const char* ep_name, OrtExecutionProviderFactory* ep_factory);
+  // NOTE(review): presumably returns the factory stored under `ep_name`; the result for an unknown name is not visible here — confirm.
+  OrtExecutionProviderFactory* GetOrtExecutionProviderFactory(const std::string& ep_name);
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Environment);
   Status Initialize(std::unique_ptr<logging::LoggingManager> logging_manager,
@@ -99,5 +103,6 @@
   std::unique_ptr<onnxruntime::concurrency::ThreadPool> inter_op_thread_pool_;
   bool create_global_thread_pools_{false};
   std::vector<AllocatorPtr> shared_allocators_;
+  std::unordered_map<std::string, std::unique_ptr<OrtExecutionProviderFactory>> custom_ep_factories_;  // owned via unique_ptr; presumably keyed by EP name (see InsertCustomEp) — confirm
 };
 }  // namespace onnxruntime
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -304,6 +304,13 @@
 ORT_RUNTIME_CLASS(OpAttr);
 ORT_RUNTIME_CLASS(Logger);
 ORT_RUNTIME_CLASS(ShapeInferContext);
+ORT_RUNTIME_CLASS(ExecutionProvider);         // defined further down in this header as a struct of callbacks
+ORT_RUNTIME_CLASS(ExecutionProviderFactory);  // defined further down in this header
+ORT_RUNTIME_CLASS(Node);                      // opaque here; ExecutionProviderAdapter casts onnxruntime::Node* to OrtNode*
+ORT_RUNTIME_CLASS(GraphViewer);               // opaque here; ExecutionProviderAdapter casts onnxruntime::GraphViewer* to OrtGraphViewer*
+ORT_RUNTIME_CLASS(KernelRegistry);            // opaque here; ExecutionProviderAdapter casts onnxruntime::KernelRegistry* to OrtKernelRegistry*
+ORT_RUNTIME_CLASS(TypeConstraints);           // defined in core/framework/ort_type_constraints.h
+ORT_RUNTIME_CLASS(NodeUnit);                  // defined further down in this header
 
 #ifdef _WIN32
 typedef _Return_type_success_(return == 0) OrtStatus* OrtStatusPtr;
@@ -689,6 +696,67 @@
  */
 ORT_EXPORT const OrtApiBase* ORT_API_CALL OrtGetApiBase(void) NO_EXCEPTION;
 
+typedef struct OrtMetaDef {  // C-side description of a fused subgraph; ExecutionProviderAdapter::GetCapability copies it into IndexedSubGraph::MetaDef
+  const char* name;    // may be null (the adapter copies it as "")
+  const char* domain;  // may be null (the adapter copies it as "")
+  int since_version;
+  // Each array below is read by the adapter as `<matching *_len field>` C strings.
+  const char** inputs;
+  size_t input_len;
+  const char** outputs;
+  size_t output_len;
+  const char** constant_initializers;
+  size_t initializer_len;
+  // May be null (the adapter copies it as "").
+  const char* doc_string;
+} OrtMetaDef;
+
+typedef struct OrtIndexedSubGraph {  // one group of nodes reported through OrtExecutionProvider::GetCapability
+  OrtMetaDef* meta_def; // TODO(leca): how to define a nested structure pointer? May be null: the adapter then sets no MetaDef on the subgraph.
+  size_t* node_index;  // array of node_index_len node indices
+  size_t node_index_len;
+} OrtIndexedSubGraph;
+
+typedef struct OrtComputeContext {  // NOTE(review): ExecutionProviderAdapter reinterpret_casts onnxruntime's ComputeContext* to this type, so the layouts presumably must match — confirm
+  void*(ORT_API_CALL* AllocateFunc)(void*, size_t, size_t);  // presumably (allocator_handle, alignment, size) — TODO confirm argument order
+  void(ORT_API_CALL* DestroyFunc)(void*, void*);  // presumably (allocator_handle, pointer to free) — TODO confirm
+  void* allocator_handle;
+  const char* node_name;
+} OrtComputeContext;
+
+typedef struct OrtNodeComputeInfo {  // per-fused-node callbacks; ExecutionProviderAdapter::Compile passes an array of these to OrtExecutionProvider::Compile and wraps the result
+  int(ORT_API_CALL* CreateFunctionStateFunc)(OrtComputeContext*, void**);  // optional; when null the adapter returns 0 and leaves the state pointer untouched
+  OrtStatusPtr(ORT_API_CALL* ComputeFunc)(void*, const OrtApi*, OrtKernelContext*);  // expected to be non-null; the first argument is the state pointer
+  void(ORT_API_CALL* DestroyFunctionStateFunc)(void*);  // optional; receives the state pointer
+} OrtNodeComputeInfo;
+
+typedef struct OrtExecutionProvider {  // callback table implemented by a plugin execution provider; onnxruntime::ExecutionProviderAdapter wraps it
+#ifdef __cplusplus
+  OrtExecutionProvider() : GetCapability{nullptr}, Compile{nullptr}, RegisterKernels{nullptr}, type{nullptr} {}  // null-initialize every member, including `type`, so an unset field is never indeterminate
+#endif
+  void(ORT_API_CALL* GetCapability)(const OrtExecutionProvider* this_, const OrtGraphViewer* graph, size_t* cnt, OrtIndexedSubGraph***);  // optional (the adapter skips it when null); reports claimed subgraphs through the last two out-parameters
+  void(ORT_API_CALL* Compile)(OrtExecutionProvider* this_, const OrtGraphViewer** graph, const OrtNode** node, size_t cnt, OrtNodeComputeInfo** node_compute_info);  // expected to be set; receives `cnt` graph/node pairs and fills the OrtNodeComputeInfo array reached through the last argument
+  void(ORT_API_CALL* RegisterKernels)(OrtKernelRegistry* kernel_registry);  // optional; when set, the adapter calls it once at construction with a fresh registry
+  const char* type;  // passed to the IExecutionProvider constructor by the adapter; must be set before the EP is wrapped
+} OrtExecutionProvider;
+
+typedef struct OrtExecutionProviderFactory {  // plugin-side factory; ExecutionProviderFactoryAdapter::CreateProvider calls the callback below
+  OrtExecutionProvider*(ORT_API_CALL* CreateExecutionProvider)(OrtExecutionProviderFactory* this_, const char* const* ep_option_keys, const char* const* ep_option_values, size_t option_size);  // keys/values are parallel arrays of option_size C strings
+} OrtExecutionProviderFactory;
+
+typedef struct OrtNodeUnit {  // NOTE(review): not referenced elsewhere in this change; the field notes below are inferred from names only — confirm
+  enum Type {
+    SingleNode,
+    QDQGroup,
+  } type;
+  OrtNode** dq_nodes;  // presumably an array of dq_nodes_len node pointers
+  size_t dq_nodes_len;
+  OrtNode** q_nodes;  // presumably an array of q_nodes_len node pointers
+  size_t q_nodes_len;
+  OrtNode* target_node;
+  size_t input_edge_count;
+} OrtNodeUnit;
+
 /** \brief Thread work loop function
  *
  * Onnxruntime will provide the working loop on custom thread creation
@@ -4665,7 +4733,36 @@
                   _In_reads_(num_external_initializer_files) char* const* external_initializer_file_buffer_array,
                   _In_reads_(num_external_initializer_files) const size_t* external_initializer_file_lengths,
                   size_t num_external_initializer_files);
-};
+  /// Presumably loads the execution-provider library at `lib_path` and registers it with `env` under `ep_name` (implementation not shown here — confirm).
+  ORT_API2_STATUS(RegisterOrtExecutionProviderLibrary, _In_ const ORTCHAR_T* lib_path, _In_ OrtEnv* env, _In_ const char* ep_name);
+  /// Presumably appends the EP registered in `env` as `ep_name` to `options`, forwarding the `num_keys` key/value option pairs — confirm.
+  ORT_API2_STATUS(SessionOptionsAppendOrtExecutionProvider, _In_ OrtSessionOptions* options, _In_ const char* ep_name, _In_ OrtEnv* env,
+                   _In_reads_(num_keys) const char* const* provider_options_keys, _In_reads_(num_keys) const char* const* provider_options_values, _In_ size_t num_keys);
+  /// Writes its answer to `ret`; presumably whether `name` is a constant initializer of `graph`, optionally consulting outer scopes — confirm.
+  ORT_API2_STATUS(OrtGraph_IsConstantInitializer, const OrtGraphViewer* graph, const char* name, bool check_outer_scope, _Out_ bool* ret);
+  /// Outputs a node-index array and its length through the two out-parameters; who owns the array is not visible here — confirm.
+  ORT_API2_STATUS(OrtGraph_GetNodesIndexInTopologicalOrder, const OrtGraphViewer* graph, _Out_ size_t* len, _Out_ const size_t** nodes_index_in_topological_order);
+  /// Outputs through `node` the node at `node_index` of `graph` (presumably; handling of an invalid index is not visible here).
+  ORT_API2_STATUS(OrtGraph_GetOrtNode, const OrtGraphViewer* graph, size_t node_index, _Outptr_ const OrtNode** node);
+  /// Outputs the node's op type string through `op_type`; the string's lifetime is not visible here (presumably tied to the node).
+  ORT_API2_STATUS(OrtNode_GetOpType, const OrtNode* node, _Out_ const char** op_type);
+  /// Outputs through `input_size` the number of inputs of `node` (presumably).
+  ORT_API2_STATUS(OrtNode_GetInputSize, const OrtNode* node, _Out_ size_t* input_size);
+  /// Outputs through `ith_input_name` the name of input `i` (presumably; bounds handling is not visible here).
+  ORT_API2_STATUS(OrtNode_GetIthInputName, const OrtNode* node, size_t i, _Out_ const char** ith_input_name);
+  /// Outputs through `output_size` the number of outputs of `node` (presumably).
+  ORT_API2_STATUS(OrtNode_GetOutputSize, const OrtNode* node, _Out_ size_t* output_size);
+  /// Outputs through `ith_output_name` the name of output `i` (presumably; bounds handling is not visible here).
+  ORT_API2_STATUS(OrtNode_GetIthOutputName, const OrtNode* node, size_t i, _Out_ const char** ith_output_name);
+  /// Presumably registers a kernel for `custom_op` in `kernel_registry` using `type_constraints` (cf. CreateKernelCreateInfo2 in custom_ops.cc) — confirm.
+  ORT_API2_STATUS(OrtKernelRegistry_RegisterKernel, OrtKernelRegistry* kernel_registry, OrtCustomOp* custom_op, OrtTypeConstraints* type_constraints);
+  /// Presumably allocates an empty OrtTypeConstraints and returns it through `type_constraints`; pair with ReleaseOrtTypeConstraints — confirm.
+  ORT_API2_STATUS(CreateOrtTypeConstraints, _Outptr_ OrtTypeConstraints** type_constraints);
+  /// Presumably forwards to OrtTypeConstraints::AddTypeConstraint(type_symbol, type) — confirm.
+  ORT_API2_STATUS(AddTypeConstraint, _In_ OrtTypeConstraints* type_constraints, _In_ const char* type_symbol, ONNXTensorElementDataType type);
+  /// Presumably destroys an object obtained from CreateOrtTypeConstraints — confirm.
+  ORT_API2_STATUS(ReleaseOrtTypeConstraints, _In_ OrtTypeConstraints* type_constraints);
+};  // struct OrtApi
 
 /*
  * Steps to use a custom op:

diff --git a/onnxruntime/core/framework/ort_type_constraints.cc b/onnxruntime/core/framework/ort_type_constraints.cc
@@ -0,0 +1,14 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/framework/ort_type_constraints.h"
+
+// Records `type` as an allowed element type for `type_symbol`.
+// Returns true if the pair was newly added; false if it was already present or `type_symbol` is null.
+bool OrtTypeConstraints::AddTypeConstraint(const char* type_symbol, ONNXTensorElementDataType type) {
+    // Constructing a std::string from a null pointer is undefined behavior, so reject null up front.
+    if (type_symbol == nullptr) return false;
+
+    // operator[] creates the (empty) set on first use; insert().second tells whether `type` was new.
+    return type_constraints_[type_symbol].insert(type).second;
+}
diff --git a/onnxruntime/core/framework/provider_adapter.h b/onnxruntime/core/framework/provider_adapter.h
@@ -0,0 +1,93 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "core/session/onnxruntime_c_api.h"
+#include "core/framework/compute_capability.h"
+
+namespace onnxruntime {
+// Adapts a plugin-supplied OrtExecutionProvider (a C struct of function pointers) to the internal
+// IExecutionProvider interface. The adapter keeps `ep` as a raw pointer and never frees it.
+class ExecutionProviderAdapter : public IExecutionProvider {
+public:
+  // `ep` and `ep->type` must be non-null. An EP-provided RegisterKernels is run once to fill a registry owned here.
+  explicit ExecutionProviderAdapter(OrtExecutionProvider* ep) : IExecutionProvider(ep->type), ep_impl_(ep) {
+    if (ep_impl_->RegisterKernels) {
+      kernel_registry_ = std::make_shared<KernelRegistry>();
+      ep_impl_->RegisterKernels(reinterpret_cast<OrtKernelRegistry*>(kernel_registry_.get()));
+    }
+  }
+
+  // Converts each OrtIndexedSubGraph claimed by the EP into a ComputeCapability. Falls back to the base
+  // implementation when the EP has no GetCapability callback, claims nothing, or returns a null array.
+  // NOTE(review): the array handed back by the EP is never released here; its ownership is still undefined.
+  std::vector<std::unique_ptr<ComputeCapability>> GetCapability(const GraphViewer& graph_viewer, const IKernelLookup& kernel_lookup) const override {
+    size_t cnt = 0;
+    OrtIndexedSubGraph** indexed_subgraph = nullptr;
+    if (ep_impl_->GetCapability) ep_impl_->GetCapability(ep_impl_, reinterpret_cast<const OrtGraphViewer*>(&graph_viewer), &cnt, &indexed_subgraph);
+    if (cnt == 0 || indexed_subgraph == nullptr) return IExecutionProvider::GetCapability(graph_viewer, kernel_lookup);
+    std::vector<std::unique_ptr<ComputeCapability>> ret;
+    for (size_t i = 0; i < cnt; i++) {
+      const OrtIndexedSubGraph* src = indexed_subgraph[i];
+      if (src == nullptr) continue;  // tolerate holes rather than dereferencing null
+      auto sb = std::make_unique<IndexedSubGraph>();
+      sb->nodes.reserve(src->node_index_len);
+      for (size_t j = 0; j < src->node_index_len; j++) sb->nodes.push_back(src->node_index[j]);
+      if (const OrtMetaDef* md = src->meta_def) {
+        auto meta_def = std::make_unique<IndexedSubGraph::MetaDef>();
+        meta_def->name = md->name ? md->name : "";
+        meta_def->doc_string = md->doc_string ? md->doc_string : "";
+        meta_def->domain = md->domain ? md->domain : "";
+        meta_def->since_version = md->since_version;
+        meta_def->inputs.reserve(md->input_len);
+        for (size_t j = 0; j < md->input_len; j++) meta_def->inputs.push_back(md->inputs[j]);
+        meta_def->outputs.reserve(md->output_len);
+        for (size_t j = 0; j < md->output_len; j++) meta_def->outputs.push_back(md->outputs[j]);
+        meta_def->constant_initializers.reserve(md->initializer_len);
+        for (size_t j = 0; j < md->initializer_len; j++) meta_def->constant_initializers.push_back(md->constant_initializers[j]);
+        sb->SetMetaDef(std::move(meta_def));
+      }
+      ret.push_back(std::make_unique<ComputeCapability>(std::move(sb)));
+    }
+    return ret;
+  }
+
+  // Passes every fused node/graph pair to the EP's Compile callback and wraps the C callbacks it fills in
+  // as NodeComputeInfo closures. Returns a failure Status if Compile or any ComputeFunc is missing.
+  common::Status Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs, std::vector<NodeComputeInfo>& node_compute_funcs) override {
+    if (ep_impl_->Compile == nullptr) return common::Status(common::ONNXRUNTIME, common::FAIL, "OrtExecutionProvider::Compile is not set");
+    std::vector<const OrtGraphViewer*> ort_graphs;
+    std::vector<const OrtNode*> ort_nodes;
+    for (const auto& fused_node_graph : fused_nodes_and_graphs) {
+      const GraphViewer& graph_viewer = fused_node_graph.filtered_graph;
+      const Node& fused_node = fused_node_graph.fused_node;
+      ort_graphs.push_back(reinterpret_cast<const OrtGraphViewer*>(&graph_viewer));
+      ort_nodes.push_back(reinterpret_cast<const OrtNode*>(&fused_node));
+    }
+    const size_t count = fused_nodes_and_graphs.size();
+    std::vector<OrtNodeComputeInfo> cache(count);  // value-initialized, so callbacks the EP leaves unset stay null
+    OrtNodeComputeInfo* cache_data = cache.data();
+    ep_impl_->Compile(ep_impl_, ort_graphs.data(), ort_nodes.data(), count, &cache_data);
+    node_compute_funcs.reserve(node_compute_funcs.size() + count);
+    for (const OrtNodeComputeInfo& info : cache) {
+      if (info.ComputeFunc == nullptr) return common::Status(common::ONNXRUNTIME, common::FAIL, "OrtNodeComputeInfo::ComputeFunc is not set");
+      // Each closure copies only its own callback table; the closures outlive `cache`, so nothing is captured by reference.
+      NodeComputeInfo compute_info;
+      compute_info.create_state_func = [info](ComputeContext* context, void** state) {
+        return info.CreateFunctionStateFunc ? info.CreateFunctionStateFunc(reinterpret_cast<OrtComputeContext*>(context), state) : 0;
+      };
+      compute_info.compute_func = [info](void* state, const OrtApi* api, OrtKernelContext* context) {
+        return ToStatus(info.ComputeFunc(state, api, context));
+      };
+      compute_info.release_state_func = [info](void* state) { if (info.DestroyFunctionStateFunc) info.DestroyFunctionStateFunc(state); };
+      node_compute_funcs.emplace_back(std::move(compute_info));
+    }
+    return Status::OK();
+  }
+
+  std::shared_ptr<KernelRegistry> GetKernelRegistry() const override { return kernel_registry_; }
+private:
+  OrtExecutionProvider* ep_impl_;  // plugin callback table; not owned
+  std::shared_ptr<KernelRegistry> kernel_registry_; // TODO(leca): should be static local
+};
+}
diff --git a/onnxruntime/core/framework/provider_factory_adapter.h b/onnxruntime/core/framework/provider_factory_adapter.h
@@ -0,0 +1,34 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "core/providers/providers.h"
+#include "provider_adapter.h"
+
+namespace onnxruntime {
+// IExecutionProviderFactory backed by a plugin-supplied OrtExecutionProviderFactory. Option strings are copied
+// at construction (a null key or value is stored as ""), so the caller's arrays need not outlive this object.
+struct ExecutionProviderFactoryAdapter : IExecutionProviderFactory {
+  ExecutionProviderFactoryAdapter(OrtExecutionProviderFactory* ep_factory, const char* const* provider_option_keys, const char* const* provider_option_values, size_t provider_option_length)
+      : ep_factory_(ep_factory), provider_option_length_(provider_option_length) {
+    provider_option_keys_.reserve(provider_option_length);
+    provider_option_values_.reserve(provider_option_length);
+    for (size_t i = 0; i < provider_option_length; i++) {
+      provider_option_keys_.emplace_back(provider_option_keys[i] ? provider_option_keys[i] : "");
+      provider_option_values_.emplace_back(provider_option_values[i] ? provider_option_values[i] : "");
+    }
+    // Collect c_str() pointers only once both vectors are complete, so later growth cannot invalidate them.
+    for (const std::string& key : provider_option_keys_) keys_.push_back(key.c_str());
+    for (const std::string& value : provider_option_values_) values_.push_back(value.c_str());
+  }
+  // Asks the plugin factory for a provider and wraps it; yields null when the factory is missing or returns null.
+  std::unique_ptr<IExecutionProvider> CreateProvider() override {
+    OrtExecutionProvider* ep = ep_factory_ ? ep_factory_->CreateExecutionProvider(ep_factory_, keys_.data(), values_.data(), provider_option_length_) : nullptr;
+    return ep ? std::make_unique<ExecutionProviderAdapter>(ep) : nullptr;
+  }
+  OrtExecutionProviderFactory* ep_factory_;  // raw pointer; never deleted by this adapter
+  std::vector<std::string> provider_option_keys_, provider_option_values_;
+  std::vector<const char*> keys_, values_;  // c_str() views into the two vectors above, passed to the plugin
+  size_t provider_option_length_;
+};
+}
diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc
@@ -25,6 +25,7 @@
 #include "core/session/inference_session.h"
 #include "core/session/ort_apis.h"
 #include "core/platform/threadpool.h"
+#include "core/framework/ort_type_constraints.h"
 
 // NOTE: OrtKernelContext is used by both custom ops and compiled kernels.
 // In a minimal build, ORT_EXTENDED_MINIMAL_BUILD is used to enable EPs like CoreML/NNAPI which use compiled kernels,
@@ -49,9 +50,9 @@ static constexpr uint32_t min_ort_version_with_shape_inference = 17;
 #endif
 
 #if !defined(DISABLE_FLOAT8_TYPES)
-#define SUPPORTED_TENSOR_TYPES DataTypeImpl::AllTensorTypesIRv9()
+#define SUPPORTED_TENSOR_TYPES onnxruntime::DataTypeImpl::AllTensorTypesIRv9()
 #else
-#define SUPPORTED_TENSOR_TYPES DataTypeImpl::AllTensorTypesIRv4()
+#define SUPPORTED_TENSOR_TYPES onnxruntime::DataTypeImpl::AllTensorTypesIRv4()
 #endif
 
 #if defined(ORT_MINIMAL_BUILD)
@@ -1331,3 +1332,76 @@ common::Status CreateCustomRegistry(gsl::span<OrtCustomOpDomain* const> op_domai
 
 }  // namespace onnxruntime
 #endif  // ENABLE_CUSTOM_OP_API
+
+//namespace onnxruntime {
+class FuncManager;
+class OpKernelInfo;
+// Builds the KernelCreateInfo for custom op `op` in `domain`, taking its type constraints from
+// `type_constraints` (a null pointer is treated as "no constraints").
+// The returned creator captures `op` by pointer, so `op` must outlive every kernel created from it.
+onnxruntime::KernelCreateInfo CreateKernelCreateInfo2(const std::string& domain, const OrtCustomOp* op, OrtTypeConstraints* type_constraints) {
+  const size_t input_count = op->GetInputTypeCount(op);
+
+  onnxruntime::KernelDefBuilder def_builder;
+  def_builder.SetName(op->GetName(op))
+      .SetDomain(domain);
+
+  // Without a usable GetStartVersion (op too old, or callback unset) the kernel is registered since version 1.
+  if (op->version >= min_ort_version_with_custom_version && op->GetStartVersion) {
+    if (op->GetEndVersion) {
+      def_builder.SinceVersion(op->GetStartVersion(op), op->GetEndVersion(op));
+    } else {
+      def_builder.SinceVersion(op->GetStartVersion(op));
+    }
+  } else {
+    def_builder.SinceVersion(1);
+  }
+
+  // GetInputMemoryType was introduced in ver 13. This check allows custom ops compiled using older versions
+  // to work with newer versions (> 12) of the ORT binary.
+  if (op->version > 12) {
+    for (size_t i = 0; i < input_count; i++) {
+      def_builder.InputMemoryType(op->GetInputMemoryType(op, i), gsl::narrow_cast<int>(i));
+    }
+  }
+
+  if (type_constraints != nullptr) {
+    for (const auto& [type_symbol, types] : type_constraints->GetTypeConstraints()) {
+      for (const auto type : types) {
+        def_builder.TypeConstraint(type_symbol, onnxruntime::DataTypeImpl::TensorTypeFromONNXEnum(static_cast<int>(type))->AsTensorType());
+      }
+    }
+  }
+
+  if (const char* provider_type = op->GetExecutionProviderType(op)) {
+    def_builder.Provider(provider_type);
+  } else {
+    def_builder.Provider(onnxruntime::kCpuExecutionProvider);
+  }
+
+  // The in-place/alias callbacks are consulted only for op->version >= 18; the index arrays they return are handed back to the matching Release* callback when it is set.
+  if (op->version >= 18 && op->GetMayInplace != nullptr) {
+    int* input_index = nullptr;
+    int* output_index = nullptr;
+    const size_t len = op->GetMayInplace(&input_index, &output_index);
+    for (size_t i = 0; i < len; i++) def_builder.MayInplace(input_index[i], output_index[i]);
+    if (len > 0 && op->ReleaseMayInplace != nullptr) op->ReleaseMayInplace(input_index, output_index);
+  }
+
+  if (op->version >= 18 && op->GetAliasMap != nullptr) {
+    int* input_index = nullptr;
+    int* output_index = nullptr;
+    const size_t len = op->GetAliasMap(&input_index, &output_index);
+    for (size_t i = 0; i < len; i++) def_builder.Alias(input_index[i], output_index[i]);
+    if (len > 0 && op->ReleaseAliasMap != nullptr) op->ReleaseAliasMap(input_index, output_index);
+  }
+
+  onnxruntime::KernelCreateFn kernel_create_fn = [op](onnxruntime::FuncManager&, const onnxruntime::OpKernelInfo& info,
+                                                      std::unique_ptr<onnxruntime::OpKernel>& out) -> onnxruntime::common::Status {
+    out = std::make_unique<onnxruntime::CustomOpKernel>(info, *op);
+    return onnxruntime::common::Status::OK();
+  };
+
+  return onnxruntime::KernelCreateInfo(def_builder.Build(), kernel_create_fn);
+}
+//} // namespace onnxruntime