Add more F16 kernels for XNNPack (microsoft#22381)
### Description
1. Add Gemm, MatMul, Softmax, AveragePool, and Resize F16 kernels.

This PR includes all changes from microsoft#22378.


[AB#51066](https://aiinfra.visualstudio.com/6a833879-cd9b-44a4-a9de-adc2d818f13c/_workitems/edit/51066)

[AB#51026](https://aiinfra.visualstudio.com/6a833879-cd9b-44a4-a9de-adc2d818f13c/_workitems/edit/51026)

2. In XNNPack, matrix B must be constant and the rank (dim_size) of matrices A and B must not exceed 2, so I added two tests in matmul_test.cc to make sure the XNNPack kernel is actually exercised, i.e. that its Compute() is called (see the sketch below).
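
A minimal sketch of that test pattern (not the exact tests added in this PR; it assumes the usual `OpTester` helpers from `onnxruntime/test/providers` and uses hypothetical test names and values):

```cpp
// MatMul with a constant (initializer) 2-D B, so the XNNPack EP can claim the node
// and its Compute() runs instead of the node falling back to the CPU EP.
#include "gtest/gtest.h"
#include "test/providers/provider_test_utils.h"

namespace onnxruntime {
namespace test {

TEST(MathOpTest, MatMul_2D_ConstantB_Sketch) {
  OpTester test("MatMul", 13);
  // A: 2x3, B: 3x2 -> Y: 2x2
  test.AddInput<float>("A", {2, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
  // Marking B as an initializer makes it a constant weight, as XNNPack requires.
  test.AddInput<float>("B", {3, 2}, {1.f, 0.f, 0.f, 1.f, 1.f, 1.f}, /*is_initializer*/ true);
  test.AddOutput<float>("Y", {2, 2}, {4.f, 5.f, 10.f, 11.f});
  test.Run();
}

}  // namespace test
}  // namespace onnxruntime
```
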
### Motivation and Context
mszhanyi authored Oct 14, 2024
1 parent 72cc72c commit caa6743
Showing 19 changed files with 334 additions and 203 deletions.
10 changes: 7 additions & 3 deletions onnxruntime/core/framework/kernel_registry.cc
@@ -6,6 +6,7 @@
#include <algorithm>
#include <memory>
#include <numeric>
#include <string>
#include <unordered_map>

#include "core/framework/kernel_type_str_resolver.h"
@@ -310,9 +311,12 @@ Status KernelRegistry::Register(KernelCreateInfo&& create_info) {
for (auto i = range.first; i != range.second; ++i) {
if (i->second.kernel_def &&
i->second.kernel_def->IsConflict(*create_info.kernel_def)) {
return Status(common::ONNXRUNTIME, common::FAIL,
"Failed to add kernel for " + key +
": Conflicting with a registered kernel with op versions.");
int since_version = i->second.kernel_def->SinceVersion().first;
std::string since_version_str = std::to_string(since_version);
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
"Failed to add kernel for ", key,
": Conflicting with a registered kernel with op versions. The since version is: ",
since_version_str);
}
}

11 changes: 11 additions & 0 deletions onnxruntime/core/providers/xnnpack/detail/node_support_checker.cc
@@ -90,6 +90,17 @@ const NodeUnit* ClipReluChecker(const NodeUnit& node_unit,
} // namespace

bool NodeSupportChecker::IsNodeSupported(const NodeUnit& nodeunit) {
#ifndef XNNPACK_FP16_SUPPORTED
// Check whether the hardware supports XNNPack FP16.
// Note: in CI, the iOS pipeline on ADO doesn't support XNNPack FP16 because the ADO mac pool is still x64.
const auto& inputs = nodeunit.Inputs();
const auto& x_arg = inputs[0].node_arg;
const auto* x_type = x_arg.TypeAsProto();
if (x_type == nullptr || x_type->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
return false;
}
#endif

static std::unordered_map<std::string, CheckerFn> checkers{
{"Conv", Conv::IsOnnxNodeSupported},
{"ConvTranspose", ConvTranspose::IsOnnxNodeSupported},
115 changes: 78 additions & 37 deletions onnxruntime/core/providers/xnnpack/math/gemm.cc
@@ -4,6 +4,7 @@
#include "gemm.h"
#include "core/framework/transpose_helper.h"
#include "core/providers/utils.h"
#include "core/providers/xnnpack/xnnpack_init.h"

namespace onnxruntime {
namespace xnnpack {
@@ -37,7 +38,8 @@ bool Gemm::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& gra
const auto* A_type = A_arg->TypeAsProto();

if (A_type == nullptr ||
A_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
(A_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT &&
A_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16)) {
break;
}

@@ -74,19 +76,26 @@ bool Gemm::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& gra
supported = true;

} while (false);

return supported;
}

Gemm::Gemm(const OpKernelInfo& info) : GemmBase(info), XnnpackKernel(info, /*enable_caches*/ true) {
const auto& node{Node()};

info.GetAttrOrDefault<float>("alpha", &alpha_, 1.f);
info.GetAttrOrDefault<float>("beta", &beta_, 1.f);

const auto& node{Node()};
const auto& input_defs = node.InputDefs();
const auto* shapeA = input_defs[0]->Shape();
const auto* shapeB = input_defs[1]->Shape();

const NodeArg& X = *input_defs[0];
auto input_dtype = X.TypeAsProto()->tensor_type().elem_type();
if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
op_compute_type_ = OpComputeType::op_compute_type_fp32;
} else if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
op_compute_type_ = OpComputeType::op_compute_type_fp16;
}

const NodeArg* C_arg = input_defs.size() == 2 ? nullptr : input_defs[2];

C_matrix_exists_ = C_arg && C_arg->Exists();
@@ -127,32 +136,49 @@ Status Gemm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr,

// flags - 1 - for no transpose - 0 for transpose
uint32_t flags = trans_B_ == CblasTrans ? 0 : XNN_FLAG_TRANSPOSE_WEIGHTS;

float output_min = clip_min_max_ ? clip_min_max_->first : -INFINITY;
float output_max = clip_min_max_ ? clip_min_max_->second : INFINITY;

const float* bias_Data = nullptr;

if (C_matrix_exists_) {
bias_Data = tensor.Data<float>();
}

auto code_cache = GetCodeCache();
auto weights_cache = GetWeightsCache();
xnn_status status = xnn_status::xnn_status_uninitialized;
struct xnn_operator* p = nullptr;
status = xnn_create_fully_connected_nc_f32(
trans_B_ == CblasNoTrans ? B_->Shape()[0] : B_->Shape()[1], // size_t input_channels,
trans_B_ == CblasNoTrans ? B_->Shape()[1] : B_->Shape()[0], // size_t output_channels,
trans_B_ == CblasNoTrans ? B_->Shape()[0] : B_->Shape()[1], // size_t input_stride,
trans_B_ == CblasNoTrans ? B_->Shape()[1] : B_->Shape()[0], // size_t output_stride,
B_->Data<float>(), // const float* kernel,
bias_Data, // const float* bias,
output_min, output_max,
flags,
GetCodeCache(), GetWeightsCache(),
&p);
float foutput_min = clip_min_max_ ? clip_min_max_->first : -INFINITY;
float foutput_max = clip_min_max_ ? clip_min_max_->second : INFINITY;
if (op_compute_type_ == OpComputeType::op_compute_type_fp32) {
const float* bias_data = nullptr;
if (C_matrix_exists_) {
bias_data = tensor.Data<float>();
}
status = xnn_create_fully_connected_nc_f32(
trans_B_ == CblasNoTrans ? B_->Shape()[0] : B_->Shape()[1], // size_t input_channels,
trans_B_ == CblasNoTrans ? B_->Shape()[1] : B_->Shape()[0], // size_t output_channels,
trans_B_ == CblasNoTrans ? B_->Shape()[0] : B_->Shape()[1], // size_t input_stride,
trans_B_ == CblasNoTrans ? B_->Shape()[1] : B_->Shape()[0], // size_t output_stride,
B_->Data<float>(), // const float* kernel,
bias_data, // const float* bias,
foutput_min, foutput_max,
flags,
code_cache, weights_cache,
&p);
} else if (op_compute_type_ == OpComputeType::op_compute_type_fp16) {
const MLFloat16* bias_data = nullptr;
if (C_matrix_exists_) {
bias_data = tensor.Data<MLFloat16>();
}
status = xnn_create_fully_connected_nc_f16(
trans_B_ == CblasNoTrans ? B_->Shape()[0] : B_->Shape()[1], // size_t input_channels,
trans_B_ == CblasNoTrans ? B_->Shape()[1] : B_->Shape()[0], // size_t output_channels,
trans_B_ == CblasNoTrans ? B_->Shape()[0] : B_->Shape()[1], // size_t input_stride,
trans_B_ == CblasNoTrans ? B_->Shape()[1] : B_->Shape()[0], // size_t output_stride,
B_->Data<MLFloat16>(), // const MLFloat16* kernel,
bias_data, // const MLFloat16* bias,
foutput_min, foutput_max,
flags,
code_cache, weights_cache,
&p);
}

if (status != xnn_status_success) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_create_fully_connected_nc_f32 returned ", status);
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_create_fully_connected_nc_",
OpTypeToString(op_compute_type_), " returned ", status);
}
op0_.reset(p);

@@ -169,19 +195,30 @@ Status Gemm::Compute(OpKernelContext* context) const {
return Status::OK();
}

xnn_status status = xnn_reshape_fully_connected_nc_f32(op0_.get(),
// Number of rows to multiply
trans_A_ == CblasNoTrans ? M_ : K_,
threadpool);
auto reshape_func = xnn_reshape_fully_connected_nc_f32;
if (op_compute_type_ == OpComputeType::op_compute_type_fp16) {
reshape_func = xnn_reshape_fully_connected_nc_f16;
}
xnn_status status = reshape_func(op0_.get(),
// Number of rows to multiply
trans_A_ == CblasNoTrans ? M_ : K_,
threadpool);

if (status != xnn_status_success) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_fully_connected_nc_f32 returned ", status);
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_fully_connected_nc_",
OpTypeToString(op_compute_type_), " returned ", status);
}

status = xnn_setup_fully_connected_nc_f32(op0_.get(), A->Data<float>(), Y->MutableData<float>());
status = xnn_status_invalid_state;
if (op_compute_type_ == op_compute_type_fp32) {
status = xnn_setup_fully_connected_nc_f32(op0_.get(), A->Data<float>(), Y->MutableData<float>());
} else if (op_compute_type_ == OpComputeType::op_compute_type_fp16) {
status = xnn_setup_fully_connected_nc_f16(op0_.get(), A->Data<MLFloat16>(), Y->MutableData<MLFloat16>());
}

if (status != xnn_status_success) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_fully_connected_nc_f32 returned ", status);
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_fully_connected_nc_",
OpTypeToString(op_compute_type_), " returned ", status);
}

status = xnn_run_operator(op0_.get(), nullptr);
@@ -193,19 +230,23 @@ Status Gemm::Compute(OpKernelContext* context) const {
}

ONNX_OPERATOR_VERSIONED_KERNEL_EX(Gemm, kOnnxDomain, 7, 8, kXnnpackExecutionProvider,
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<MLFloat16>()}),
Gemm);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(Gemm, kOnnxDomain, 9, 10, kXnnpackExecutionProvider,
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<MLFloat16>()}),
Gemm);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(Gemm, kOnnxDomain, 11, 12, kXnnpackExecutionProvider,
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<MLFloat16>()}),
Gemm);

ONNX_OPERATOR_KERNEL_EX(Gemm, kOnnxDomain, 13, kXnnpackExecutionProvider,
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<MLFloat16>()}),
Gemm);

} // namespace xnnpack
2 changes: 2 additions & 0 deletions onnxruntime/core/providers/xnnpack/math/gemm.h
@@ -41,6 +41,8 @@ class Gemm : protected GemmBase, public XnnpackKernel {

float alpha_;
float beta_;

OpComputeType op_compute_type_ = OpComputeType::op_compute_type_invalid;
};

} // namespace xnnpack