[DML EP] Prefer MatMulInteger over MatMulIntegerToFloat in case of (#22469)

### Description
Skip the `MatMulIntegerToFloat` fusion on the DML EP when the model quantizes its input with `DynamicQuantizeLinear` before `MatMulInteger` and the tensors involved are large. Keeping the unfused path is more resource efficient in that case, and DML has better `MatMulInteger` metacommand coverage, which computes in the integer data type.
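A minimal sketch of the size rule this change applies (illustrative only; `should_skip_fusion`, `a_shape`, `b_shape`, and `out_elem_bytes` are names chosen here, not identifiers from the change itself): the fusion is skipped when the combined element count of `MatMulInteger`'s A and B inputs, measured in bytes of the `Cast` output type, exceeds `UINT32_MAX`.

```python
import math

UINT32_MAX = 2**32 - 1


def should_skip_fusion(a_shape, b_shape, out_elem_bytes):
    """Sketch of the CheckMatMulLargeTensors size rule (illustrative only).

    a_shape, b_shape: static shapes of MatMulInteger's A and B inputs.
    out_elem_bytes: 2 if the Cast output is float16, otherwise 4 (float32).
    """
    a_elems = math.prod(a_shape)
    b_elems = math.prod(b_shape)
    # The lower-rank operand is broadcast over the leading dimensions of the
    # higher-rank one, so scale its element count accordingly.
    if len(a_shape) < len(b_shape):
        a_elems *= math.prod(b_shape[: len(b_shape) - len(a_shape)])
    elif len(b_shape) < len(a_shape):
        b_elems *= math.prod(a_shape[: len(a_shape) - len(b_shape)])
    return (a_elems + b_elems) * out_elem_bytes > UINT32_MAX
```

For example, the test model added in this PR (A of shape [16, 32, 1280, 1280], B of shape [1280, 1280], `Cast` to float32) lands well above the threshold, so the `DynamicQuantizeLinear` -> `MatMulInteger` path is kept.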



### Motivation and Context
raoanag authored Nov 7, 2024
1 parent a436b3a commit f16036b
Showing 4 changed files with 160 additions and 0 deletions.
54 changes: 54 additions & 0 deletions onnxruntime/core/optimizer/matmul_integer_to_float.cc
@@ -49,6 +49,49 @@ bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) {
return data_type == actual_data_type;
}

// Returns the total number of elements in the given tensor shape.
static uint64_t NumElements(const TensorShapeProto* tensor_shape) {
if (nullptr == tensor_shape || tensor_shape->dim_size() < 1) {
return 0;
}
uint64_t num_elements = 1;

for (int i = 0; i < tensor_shape->dim_size(); i++) {
num_elements *= tensor_shape->dim(i).dim_value();
}
return num_elements;
}

bool CheckMatMulLargeTensors(const Node& matmulinteger_node, const Node& cast_node) {
const auto a_def = matmulinteger_node.InputDefs()[0];
const auto b_def = matmulinteger_node.InputDefs()[1];
const int a_dim_size = a_def->Shape()->dim_size();
const int b_dim_size = b_def->Shape()->dim_size();
uint64_t a_num_elements = NumElements(a_def->Shape());
uint64_t b_num_elements = NumElements(b_def->Shape());

// If the ranks differ, the lower-rank operand is broadcast over the leading
// dimensions of the higher-rank one; account for that in its element count.
if (a_dim_size != b_dim_size) {
bool a_is_broadcasted = a_dim_size < b_dim_size;
if (a_is_broadcasted) {
for (int i = 0; i < b_dim_size - a_dim_size; i++) {
a_num_elements *= b_def->Shape()->dim(i).dim_value();
}
} else {
for (int i = 0; i < a_dim_size - b_dim_size; i++) {
b_num_elements *= a_def->Shape()->dim(i).dim_value();
}
}
}

// Element size in bytes of the Cast output: 2 for float16, 4 for float32.
int output_data_type = HasElementDataType(*cast_node.OutputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) ? 2 : 4;
uint64_t total_bytes = (a_num_elements + b_num_elements) * output_data_type;

return total_bytes > UINT32_MAX;
}

/**
MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat:
@@ -114,6 +157,17 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g
continue;
}

const Node* p_dynamicquantize_node = graph_utils::FirstParentByType(*p_matmulinteger_node, "DynamicQuantizeLinear");

// Check whether the MatMulInteger node's input comes from DynamicQuantizeLinear.
// For large tensors, the DynamicQuantizeLinear -> MatMulInteger path is kept to stay resource efficient,
// and DML has better MatMulInteger metacommand coverage, which computes in the integer data type.
if (is_dml_ep && p_dynamicquantize_node) {
if (CheckMatMulLargeTensors(matmulinteger_node, cast_node)) {
continue;
}
}

// Find bias node
Node* p_add_node = nullptr;
if (optimizer_utils::CheckOutputEdges(graph, mul_node, 1)) {
16 changes: 16 additions & 0 deletions onnxruntime/test/optimizer/graph_transform_test.cc
@@ -5859,6 +5859,22 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) {
std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
}

TEST_F(GraphTransformationTests, MatMulIntegerToFloatLargeTensorTest) {
constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float_large_tensor.onnx";
std::shared_ptr<Model> p_model;
ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_));
Graph& graph = p_model->MainGraph();

for (auto& node : graph.Nodes()) {
node.SetExecutionProviderType(kDmlExecutionProvider);
}
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique<MatMulIntegerToFloatFusion>(), TransformerLevel::Level2));
ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_));
std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
// The model's tensors exceed the large-tensor threshold, so the fusion is skipped on the DML EP.
EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 0);
}
#endif // USE_DML

#endif
@@ -0,0 +1,41 @@
(New binary ONNX test model: the fusion/matmul_integer_to_float_large_tensor.onnx graph loaded by MatMulIntegerToFloatLargeTensorTest above and generated by the Python script below; the serialized protobuf bytes are not readable here and are omitted.)
@@ -0,0 +1,49 @@
from enum import Enum # noqa: F401

import onnx
from onnx import TensorProto, helper


def GenerateModel(model_name): # noqa: N802
inputs = []
outputs = []
initializers = []
nodes = []

inputs.append(helper.make_tensor_value_info("inputA", TensorProto.FLOAT, [16, 32, 1280, 1280]))
inputs.append(helper.make_tensor_value_info("inputB", TensorProto.INT8, [1280, 1280]))
inputs.append(helper.make_tensor_value_info("inputBZP", TensorProto.INT8, [1]))
inputs.append(helper.make_tensor_value_info("inputBScale", TensorProto.FLOAT, [1]))

nodes = [ # construct graph
helper.make_node(
"DynamicQuantizeLinear",
["inputA"],
["a_quantized", "a_scale", "a_zp"],
"DynamicQuantizeLinear",
),
helper.make_node(
"MatMulInteger",
["a_quantized", "inputB", "a_zp", "inputBZP"],
["matmulinteger_output"],
"MatMulInteger",
),
helper.make_node("Mul", ["a_scale", "inputBScale"], ["mul_1"], "mul_right"),
helper.make_node("Cast", ["matmulinteger_output"], ["cast_output"], "cast", to=1),
helper.make_node("Mul", ["mul_1", "cast_output"], ["output"], "mul_bottom"),
]

graph = helper.make_graph(
nodes,
"matmul_integer_to_float_large_tensor_fusion", # name
inputs,
outputs,
initializers,
)

model = helper.make_model(graph)
onnx.save(model, model_name)


if __name__ == "__main__":
GenerateModel("matmul_integer_to_float_large_tensor.onnx")
