[CoreML MLProgram] Support Float16 (1/N) (#22068)

### Description Support Float16 for CoreML MLProgram EP. Operations: "Add", "Mul", "Sub", "Div", "Pow", "Sqrt", "Reciprocal", "Sigmoid", "Tanh", "Relu", "LeakyRelu", "Concat", "GridSample", "GlobalAveragePool", "Clip", "DepthToSpace", "Resize", "Slice", "Conv", "ConvTranspose", "GlobalMaxPool", "Gemm", "MatMul", "AveragePool", "MaxPool", "Reshape", "Split", "Transpose" ### Motivation and Context  --------- Co-authored-by: Scott McKay <[email protected]>
microsoft · Sep 30, 2024 · 2cfe1f0 · 2cfe1f0
1 parent 434f0fa
commit 2cfe1f0
Show file tree

Hide file tree

Showing 35 changed files with 1,427 additions and 849 deletions.
diff --git a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc
@@ -104,7 +104,13 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
     if (add_alpha) {
       NodeAttrHelper helper(node);
       const auto alpha = helper.Get("alpha", 0.01f);
-      AddOperationInput(*op, "alpha", model_builder.AddScalarConstant(op->type(), "alpha", alpha));
+
+      auto input_dtype = node.InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
+      if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+        AddOperationInput(*op, "alpha", model_builder.AddScalarConstant(op->type(), "alpha", alpha));
+      } else {
+        AddOperationInput(*op, "alpha", model_builder.AddScalarConstant(op->type(), "alpha", MLFloat16(alpha)));
+      }
     }
 
     AddOperationOutput(*op, *node.OutputDefs()[0]);

diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include <set>
 #include "core/providers/common.h"
 #include "core/providers/coreml/builders/helper.h"
 #include "core/providers/coreml/builders/impl/base_op_builder.h"
@@ -12,6 +13,15 @@ using namespace CoreML::Specification;
 namespace onnxruntime {
 namespace coreml {
 
+// Once all ops are supportted FP16, we can remove it. Before that, we keep a set of ops to
+// filter suppported ones.
+static std::set<std::string> Float16Ops = {
+    "Add", "Mul", "Sub", "Div", "Pow", "Sqrt", "Reciprocal",
+    "Sigmoid", "Tanh", "Relu", "LeakyRelu", "Concat", "GridSample", "GlobalAveragePool",
+    "Clip", "DepthToSpace", "Resize", "Slice", "Conv",
+    "ConvTranspose", "GlobalMaxPool", "Gemm", "MatMul",
+    "AveragePool", "MaxPool", "Reshape", "Split", "Transpose"};
+
 namespace {
 // TODO, move this to shared_library
 bool HasExternalInitializer(const InitializedTensorSet& initializers, const Node& node,
@@ -83,8 +93,9 @@ bool BaseOpBuilder::HasSupportedInputs(const Node& node, const OpBuilderInputPar
 }
 
 /* static */
-bool BaseOpBuilder::IsInputFloat(const Node& node, size_t idx, const OpBuilderInputParams& /*input_params*/,
-                                 const logging::Logger& logger) {
+bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx,
+                                        [[maybe_unused]] const OpBuilderInputParams& input_params,
+                                        const logging::Logger& logger) {
   if (idx >= node.InputDefs().size()) {
     LOGS(logger, VERBOSE) << "Input index [" << idx << "] is out of range";
     return false;
@@ -94,20 +105,33 @@ bool BaseOpBuilder::IsInputFloat(const Node& node, size_t idx, const OpBuilderIn
 
   int32_t input_type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED;
 
-  // currently only float is supported
-  if (!GetType(input, input_type, logger) || input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
-    LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not currently supported";
+  if (!GetType(input, input_type, logger)) {
+    LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Get Input type failed";
     return false;
   }
 
-  return true;
+  // float is supported
+  if (input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+    return true;
+  }
+
+// only support MLProgram for FP16
+#if defined(COREML_ENABLE_MLPROGRAM)
+  if (input_params.create_mlprogram && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 &&
+      Float16Ops.count(node.OpType())) {
+    return true;
+  }
+#endif
+
+  LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not currently supported";
+  return false;
 }
 
 bool BaseOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params,
                                            const logging::Logger& logger) const {
   // We only check the type of input 0 by default
   // specific op builder can override this
-  return IsInputFloat(node, 0, input_params, logger);
+  return IsInputDtypeSupport(node, 0, input_params, logger);
 }
 
 bool BaseOpBuilder::HasSupportedOpSet(const Node& node, const logging::Logger& logger) const {

diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
@@ -32,9 +32,9 @@ class BaseOpBuilder : public IOpBuilder {
       : allow_empty_tensor_as_input_(allow_empty_tensor_as_input) {
   }
 
-  // currently we only support float
-  static bool IsInputFloat(const Node& node, size_t idx, const OpBuilderInputParams& input_params,
-                           const logging::Logger& logger);
+  // currently we support float/float16
+  static bool IsInputDtypeSupport(const Node& node, size_t idx, const OpBuilderInputParams& input_params,
+                                  const logging::Logger& logger);
 
  private:
   virtual bool IsOpSupportedImpl(const Node& /*node*/, const OpBuilderInputParams& /*input_params*/,

diff --git a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
@@ -73,7 +73,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
     } else if (op_type == "Sub") {
       coreml_op_type = "sub";
     } else if (op_type == "Div") {
-      // we only support fp32 currently. when we add support for integers we need to check the type and use
+      // we support fp32/fp16 currently. when we add support for integers we need to check the type and use
       // "floor_div" or "real_div" accordingly
       coreml_op_type = "real_div";
     } else if (op_type == "Pow") {
@@ -138,9 +138,22 @@ bool BinaryOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderIn
                                              const logging::Logger& logger) const {
   // Add/Sub/Mul/Div spec says inputs must be of the same type.
   // Pow spec says inputs can be different types.
-  // We only support float for all of these inputs.
-  if (!IsInputFloat(node, 0, input_params, logger) ||
-      ((node.OpType() == "Pow") && !IsInputFloat(node, 1, input_params, logger))) {
+  // We support float/float16 for all of these inputs.
+
+  if (node.OpType() == "Pow") {
+    const auto& input0 = *node.InputDefs()[0];
+    const auto& input1 = *node.InputDefs()[1];
+    int32_t input_type0 = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED;
+    int32_t input_type1 = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED;
+    if (!GetType(input0, input_type0, logger)) {
+      return false;
+    }
+    if (!GetType(input1, input_type1, logger) || input_type1 != input_type0) {
+      return false;
+    }
+  }
+
+  if (!IsInputDtypeSupport(node, 0, input_params, logger)) {
     return false;
   }
 

diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
@@ -96,6 +96,9 @@ Status CreateCoreMLWeight(CoreML::Specification::WeightParams& weight,
     case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:
       CreateCoreMLWeight(weight, unpacked_tensor.DataAsSpan<float>());
       break;
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16:
+      CreateCoreMLWeight(weight, unpacked_tensor.DataAsSpan<MLFloat16>());
+      break;
     case ONNX_NAMESPACE::TensorProto_DataType_INT32:
       CreateCoreMLWeight(weight, unpacked_tensor.DataAsSpan<int32_t>());
       break;
@@ -114,6 +117,11 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<c
   weight.mutable_floatvalue()->Assign(data.begin(), data.end());
 }
 
+void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const MLFloat16> data) {
+  const char* data_byte_ptr = reinterpret_cast<const char*>(data.data());
+  weight.mutable_float16value()->assign(data_byte_ptr, data_byte_ptr + data.size_bytes());
+}
+
 namespace {
 template <typename T>
 void CreateCoreMLWeightConvertingDataToFloats(CoreML::Specification::WeightParams& weight, gsl::span<const T> data) {
@@ -123,6 +131,15 @@ void CreateCoreMLWeightConvertingDataToFloats(CoreML::Specification::WeightParam
                  [](T v) { return narrow<float>(v); });
   *weight.mutable_floatvalue() = std::move(weight_floats);
 }
+
+template <typename T>
+void CreateCoreMLWeightConvertingDataToFloat16s(CoreML::Specification::WeightParams& weight, gsl::span<const T> data) {
+  std::vector<MLFloat16> weight_float16s{};
+  weight_float16s.reserve(data.size());
+  std::transform(data.begin(), data.end(), std::back_inserter(weight_float16s),
+                 [](T v) { return MLFloat16(float(v)); });
+  CreateCoreMLWeight(weight, weight_float16s);
+}
 }  // namespace
 
 void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const int32_t> data) {
@@ -195,6 +212,13 @@ void CopyDataToTensorValue<float>(MILSpec::TensorValue& tensor_value, gsl::span<
   tensor_value.mutable_floats()->mutable_values()->Add(data.begin(), data.end());
 }
 
+template <>
+void CopyDataToTensorValue<MLFloat16>(MILSpec::TensorValue& tensor_value, gsl::span<const MLFloat16> data) {
+  const char* begin = reinterpret_cast<const char*>(data.data());
+  const char* end = begin + (data.size() * sizeof(MLFloat16));
+  tensor_value.mutable_bytes()->mutable_values()->assign(begin, end);
+}
+
 template <>
 void CopyDataToTensorValue<int32_t>(MILSpec::TensorValue& tensor_value, gsl::span<const int32_t> data) {
   tensor_value.mutable_ints()->mutable_values()->Add(data.begin(), data.end());
@@ -290,6 +314,14 @@ MILSpec::Value CreateScalarTensorValue(const T& data) {
 // explicit specializations for types we handle so the implementation can be in the .cc file
 template MILSpec::Value CreateTensorValue<int64_t, int32_t>(gsl::span<const int64_t> data,
                                                             std::optional<gsl::span<const int64_t>> shape);
+template MILSpec::Value CreateTensorValue<float, float>(gsl::span<const float> data,
+                                                        std::optional<gsl::span<const int64_t>> shape);
+template MILSpec::Value CreateTensorValue<MLFloat16, MLFloat16>(gsl::span<const MLFloat16> data,
+                                                                std::optional<gsl::span<const int64_t>> shape);
+template MILSpec::Value CreateTensorValue<bool, bool>(gsl::span<const bool> data,
+                                                      std::optional<gsl::span<const int64_t>> shape);
+template MILSpec::Value CreateTensorValue<std::string, std::string>(gsl::span<const std::string> data,
+                                                                    std::optional<gsl::span<const int64_t>> shape);
 
 template MILSpec::Value CreateScalarTensorValue(const float& data);
 template MILSpec::Value CreateScalarTensorValue(const int32_t& data);

diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h
@@ -41,6 +41,9 @@ Status CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, const ONN
 // Copy the float array to a coreml weight
 void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const float> data);
 
+// Copy the MLFloat16 array to a coreml weight
+void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const MLFloat16> data);
+
 // Copy the int32_t array to a coreml weight
 void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const int32_t> data);
 

diff --git a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc
@@ -92,16 +92,30 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
         Operation& clip_op = *op;
         AddOperationInput(clip_op, "x", input_name);
 
+        // we already checked it and dtype must be existed.
+        auto input_dtype = node.InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
         // if min and max were attributes we need to add initializers. otherwise we use the existing inputs
         const bool min_max_attribs = node.SinceVersion() < 11;
-        std::string_view min_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "min", min)
-                                                    : node.InputDefs()[1]->Name();
+        std::string_view min_name;
+        if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+          min_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "min", min)
+                                     : node.InputDefs()[1]->Name();
+        } else {
+          min_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "min", MLFloat16(min))
+                                     : node.InputDefs()[1]->Name();
+        }
 
         AddOperationInput(clip_op, "alpha", min_name);
 
         if (has_max) {
-          std::string_view max_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "max", max)
-                                                      : node.InputDefs()[2]->Name();
+          std::string_view max_name;
+          if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+            max_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "max", max)
+                                       : node.InputDefs()[2]->Name();
+          } else {
+            max_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "max", MLFloat16(max))
+                                       : node.InputDefs()[2]->Name();
+          }
           AddOperationInput(clip_op, "beta", max_name);
         }
       }

diff --git a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc
@@ -67,7 +67,9 @@ Status DepthToSpaceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
       // we checked shape was static in IsOpSupportedImpl so this should never fail
       std::vector<int64_t> input_shape;
       ORT_RETURN_IF_NOT(GetStaticShape(*input_defs[0], input_shape, logger), "Failed to get input shape");
-      const int32_t elem_type = static_cast<int32_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+      auto input_dtype = input_defs[0]->TypeAsProto()->tensor_type().elem_type();
+
+      const int32_t elem_type = static_cast<int32_t>(input_dtype);
 
       // reshape to [b * c // (blocksize ** 2), blocksize, blocksize, h, w]
       auto reshape1 = model_builder.CreateOperation(node, "reshape", "pre");

diff --git a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
@@ -70,16 +70,17 @@ void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod
   }
 }
 
-// This is an internal function, requires input tensor to be 2d float tensor
-// TODO, add support of other data types
-static Status GetTensorFloatDataTransposed(const ONNX_NAMESPACE::TensorProto& tensor,
-                                           std::vector<float>& transposed_data) {
+// This is an internal function, requires input tensor to be 2d float/float16 tensor
+template <typename T>
+static Status GetTensorDataTransposed(const ONNX_NAMESPACE::TensorProto& tensor,
+                                      std::vector<T>& transposed_data) {
   Initializer unpacked_tensor(tensor);
-  auto src_data = unpacked_tensor.DataAsSpan<float>();
+  const auto src_data = unpacked_tensor.DataAsSpan<T>();
   const auto& tensor_shape = tensor.dims();
   auto x_t = SafeInt<size_t>(tensor_shape[0]);
   auto y_t = SafeInt<size_t>(tensor_shape[1]);
   transposed_data.resize(x_t * y_t);
+
   for (size_t x = 0; x < x_t; x++) {
     for (size_t y = 0; y < y_t; y++) {
       transposed_data[y * x_t + x] = src_data[x * y_t + y];
@@ -121,8 +122,9 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
   // B is {K, N} in ONNX spec by default, or {N, K} in Gemm if transB is true
   const auto K = transB ? b1 : b0;
   const auto N = transB ? b0 : b1;
-
+  // we already checked it and dtype must be existed.
 #if defined(COREML_ENABLE_MLPROGRAM)
+  auto input_dtype = a.TypeAsProto()->tensor_type().elem_type();
   if (model_builder.CreateMLProgram()) {
     using namespace CoreML::Specification::MILSpec;
 
@@ -136,13 +138,19 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
       if (transB) {
         AddOperationInput(*gemm_op, "weight", b.Name());
       } else {
-        // transpose from {K, N} to {N, K}
-        std::vector<float> weight_nk;
         std::vector<int64_t> weight_nk_shape = {N, K};
-        ORT_RETURN_IF_ERROR(GetTensorFloatDataTransposed(*b_initializer, weight_nk));
-
-        AddOperationInput(*gemm_op, "weight",
-                          model_builder.AddConstant(gemm_op->type(), b.Name() + "_t", weight_nk, weight_nk_shape));
+        // transpose from {K, N} to {N, K}
+        if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+          std::vector<float> weight_nk;
+          ORT_RETURN_IF_ERROR(GetTensorDataTransposed(*b_initializer, weight_nk));
+          AddOperationInput(*gemm_op, "weight",
+                            model_builder.AddConstant(gemm_op->type(), b.Name() + "_t", weight_nk, weight_nk_shape));
+        } else {  // TensorProto_DataType_FLOAT16
+          std::vector<MLFloat16> weight_nk;
+          ORT_RETURN_IF_ERROR(GetTensorDataTransposed(*b_initializer, weight_nk));
+          AddOperationInput(*gemm_op, "weight",
+                            model_builder.AddConstant(gemm_op->type(), b.Name() + "_t", weight_nk, weight_nk_shape));
+        }
       }
 
       if (input_defs.size() == 3) {
@@ -155,15 +163,28 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
           AddOperationInput(*gemm_op, "bias", bias_arg.Name());
         } else {
           Initializer unpacked_tensor(bias);
-          auto bias_data = unpacked_tensor.DataAsSpan<float>();
           std::string_view bias_data_name;
-          if (bias_data.size() == 1) {
-            // expand scalar to N
-            std::vector<float> expanded_bias_data(N, bias_data[0]);
-            bias_data_name = model_builder.AddConstant(gemm_op->type(), "bias", expanded_bias_data);
-          } else {
-            // can use data as-is but need to adjust shape (inferred by AddConstant as {bias_data.size()})
-            bias_data_name = model_builder.AddConstant(gemm_op->type(), "bias", bias_data);
+
+          if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+            auto bias_data = unpacked_tensor.DataAsSpan<float>();
+            if (bias_data.size() == 1) {
+              // expand scalar to N
+              std::vector<float> expanded_bias_data(N, bias_data[0]);
+              bias_data_name = model_builder.AddConstant(gemm_op->type(), "bias", expanded_bias_data);
+            } else {
+              // can use data as-is but need to adjust shape (inferred by AddConstant as {bias_data.size()})
+              bias_data_name = model_builder.AddConstant(gemm_op->type(), "bias", bias_data);
+            }
+          } else {  // TensorProto_DataType_FLOAT16
+            auto bias_data = unpacked_tensor.DataAsSpan<MLFloat16>();
+            if (bias_data.size() == 1) {
+              // expand scalar to N
+              std::vector<MLFloat16> expanded_bias_data(N, bias_data[0]);
+              bias_data_name = model_builder.AddConstant(gemm_op->type(), "bias", expanded_bias_data);
+            } else {
+              // can use data as-is but need to adjust shape (inferred by AddConstant as {bias_data.size()})
+              bias_data_name = model_builder.AddConstant(gemm_op->type(), "bias", bias_data);
+            }
           }
 
           AddOperationInput(*gemm_op, "bias", bias_data_name);
@@ -202,7 +223,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
       ORT_RETURN_IF_ERROR(CreateCoreMLWeight(*coreml_inner_product->mutable_weights(), *b_initializer));
     } else {
       std::vector<float> b_transposed;
-      ORT_RETURN_IF_ERROR(GetTensorFloatDataTransposed(*b_initializer, b_transposed));
+      ORT_RETURN_IF_ERROR(GetTensorDataTransposed(*b_initializer, b_transposed));
       CreateCoreMLWeight(*coreml_inner_product->mutable_weights(), b_transposed);
     }