[DML EP] Prefer MatMulInteger over MatMulIntegerToFloat in case of (#22469)

### Description
Skip the `MatMulIntegerToFloat` fusion on the DML EP when the model quantizes its input with `DynamicQuantizeLinear` before `MatMulInteger` and the tensors involved are large. Keeping the unfused path is more resource efficient in that case, and DML has better `MatMulInteger` metacommand coverage, which computes in the integer data type.
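A minimal sketch of the size rule this change applies (illustrative only; `should_skip_fusion`, `a_shape`, `b_shape`, and `out_elem_bytes` are names chosen here, not identifiers from the change itself): the fusion is skipped when the combined element count of `MatMulInteger`'s A and B inputs, measured in bytes of the `Cast` output type, exceeds `UINT32_MAX`.

```python
import math

UINT32_MAX = 2**32 - 1


def should_skip_fusion(a_shape, b_shape, out_elem_bytes):
    """Sketch of the CheckMatMulLargeTensors size rule (illustrative only).

    a_shape, b_shape: static shapes of MatMulInteger's A and B inputs.
    out_elem_bytes: 2 if the Cast output is float16, otherwise 4 (float32).
    """
    a_elems = math.prod(a_shape)
    b_elems = math.prod(b_shape)
    # The lower-rank operand is broadcast over the leading dimensions of the
    # higher-rank one, so scale its element count accordingly.
    if len(a_shape) < len(b_shape):
        a_elems *= math.prod(b_shape[: len(b_shape) - len(a_shape)])
    elif len(b_shape) < len(a_shape):
        b_elems *= math.prod(a_shape[: len(a_shape) - len(b_shape)])
    return (a_elems + b_elems) * out_elem_bytes > UINT32_MAX
```

For example, the test model added in this PR (A of shape [16, 32, 1280, 1280], B of shape [1280, 1280], `Cast` to float32) lands well above the threshold, so the `DynamicQuantizeLinear` -> `MatMulInteger` path is kept.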



### Motivation and Context
raoanag authored Nov 7, 2024
1 parent a436b3a commit f16036b
Showing 4 changed files with 160 additions and 0 deletions.
54 changes: 54 additions & 0 deletions onnxruntime/core/optimizer/matmul_integer_to_float.cc
@@ -49,6 +49,49 @@ bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) {
return data_type == actual_data_type;
}

// Returns the total number of elements in the given tensor shape.
static uint64_t NumElements(const TensorShapeProto* tensor_shape) {
if (nullptr == tensor_shape || tensor_shape->dim_size() < 1) {
return 0;
}
uint64_t num_elements = 1;

for (int i = 0; i < tensor_shape->dim_size(); i++) {
num_elements *= tensor_shape->dim(i).dim_value();
}
return num_elements;
}

bool CheckMatMulLargeTensors(const Node& matmulinteger_node, const Node& cast_node) {
const auto a_def = matmulinteger_node.InputDefs()[0];
const auto b_def = matmulinteger_node.InputDefs()[1];
const int a_dim_size = a_def->Shape()->dim_size();
const int b_dim_size = b_def->Shape()->dim_size();
uint64_t a_num_elements = NumElements(a_def->Shape());
uint64_t b_num_elements = NumElements(b_def->Shape());

// If the ranks differ, the lower-rank operand is broadcast over the leading
// dimensions of the higher-rank one; account for that in its element count.
if (a_dim_size != b_dim_size) {
bool a_is_broadcasted = a_dim_size < b_dim_size;
if (a_is_broadcasted) {
for (int i = 0; i < b_dim_size - a_dim_size; i++) {
a_num_elements *= b_def->Shape()->dim(i).dim_value();
}
} else {
for (int i = 0; i < a_dim_size - b_dim_size; i++) {
b_num_elements *= a_def->Shape()->dim(i).dim_value();
}
}
}

// Element size in bytes of the Cast output: 2 for float16, 4 for float32.
int output_data_type = HasElementDataType(*cast_node.OutputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) ? 2 : 4;
uint64_t total_bytes = (a_num_elements + b_num_elements) * output_data_type;

return total_bytes > UINT32_MAX;
}

/**
MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat:
@@ -114,6 +157,17 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g
continue;
}

const Node* p_dynamicquantize_node = graph_utils::FirstParentByType(*p_matmulinteger_node, "DynamicQuantizeLinear");

// Check whether the MatMulInteger node's input comes from DynamicQuantizeLinear.
// For large tensors, the DynamicQuantizeLinear -> MatMulInteger path is kept to stay resource efficient,
// and DML has better MatMulInteger metacommand coverage, which computes in the integer data type.
if (is_dml_ep && p_dynamicquantize_node) {
if (CheckMatMulLargeTensors(matmulinteger_node, cast_node)) {
continue;
}
}

// Find bias node
Node* p_add_node = nullptr;
if (optimizer_utils::CheckOutputEdges(graph, mul_node, 1)) {
16 changes: 16 additions & 0 deletions onnxruntime/test/optimizer/graph_transform_test.cc
@@ -5859,6 +5859,22 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) {
std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
}

TEST_F(GraphTransformationTests, MatMulIntegerToFloatLargeTensorTest) {
constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float_large_tensor.onnx";
std::shared_ptr<Model> p_model;
ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_));
Graph& graph = p_model->MainGraph();

for (auto& node : graph.Nodes()) {
node.SetExecutionProviderType(kDmlExecutionProvider);
}
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique<MatMulIntegerToFloatFusion>(), TransformerLevel::Level2));
ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_));
std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
// The model's tensors exceed the large-tensor threshold, so the fusion is skipped on the DML EP.
EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 0);
}
#endif // USE_DML

#endif
@@ -0,0 +1,41 @@
(New binary ONNX test model: the fusion/matmul_integer_to_float_large_tensor.onnx graph loaded by MatMulIntegerToFloatLargeTensorTest above and generated by the Python script below; the serialized protobuf bytes are not readable here and are omitted.)
@@ -0,0 +1,49 @@
from enum import Enum # noqa: F401

import onnx
from onnx import TensorProto, helper


def GenerateModel(model_name): # noqa: N802
inputs = []
outputs = []
initializers = []
nodes = []

inputs.append(helper.make_tensor_value_info("inputA", TensorProto.FLOAT, [16, 32, 1280, 1280]))
inputs.append(helper.make_tensor_value_info("inputB", TensorProto.INT8, [1280, 1280]))
inputs.append(helper.make_tensor_value_info("inputBZP", TensorProto.INT8, [1]))
inputs.append(helper.make_tensor_value_info("inputBScale", TensorProto.FLOAT, [1]))

nodes = [ # construct graph
helper.make_node(
"DynamicQuantizeLinear",
["inputA"],
["a_quantized", "a_scale", "a_zp"],
"DynamicQuantizeLinear",
),
helper.make_node(
"MatMulInteger",
["a_quantized", "inputB", "a_zp", "inputBZP"],
["matmulinteger_output"],
"MatMulInteger",
),
helper.make_node("Mul", ["a_scale", "inputBScale"], ["mul_1"], "mul_right"),
helper.make_node("Cast", ["matmulinteger_output"], ["cast_output"], "cast", to=1),
helper.make_node("Mul", ["mul_1", "cast_output"], ["output"], "mul_bottom"),
]

graph = helper.make_graph(
nodes,
"matmul_integer_to_float_large_tensor_fusion", # name
inputs,
outputs,
initializers,
)

model = helper.make_model(graph)
onnx.save(model, model_name)


if __name__ == "__main__":
GenerateModel("matmul_integer_to_float_large_tensor.onnx")
