From aa0cf1c5e1ef16d0e05d1e269c7088400d490dcb Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Wed, 6 Nov 2024 14:06:29 -0800 Subject: [PATCH] [Quant Tool] Update QDQ Pad, Slice, Softmax (#22676) ### Description Updates python quantization tool: - Ensures QDQ Pad has equal quantization parameters across input and output for certain Pad configurations. - Ensures QDQ Slice always has equal quantization parameters across input and output. - Fixes bug when Softmax is _excluded_ from quantization. ### Motivation and Context QDQ Pad and Slice have lower latency on QNN EP when their quantization parameters are equal. --- .../tools/quantization/base_quantizer.py | 2 + .../tools/quantization/operators/pad.py | 72 ++++++++ .../python/tools/quantization/registry.py | 4 +- .../test/python/quantization/op_test_utils.py | 33 ++++ .../test/python/quantization/test_op_pad.py | 165 +++++++++++++++++- .../test/python/quantization/test_op_slice.py | 153 ++++++++++++++++ .../python/quantization/test_op_softmax.py | 34 ++++ 7 files changed, 461 insertions(+), 2 deletions(-) create mode 100644 onnxruntime/test/python/quantization/test_op_slice.py diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index b12465ffa7926..f07fb30f10f82 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -554,4 +554,6 @@ def adjust_tensor_ranges(self): self.tensors_range[node.input[0]] = td # Adjust Softmax to range from 0.0 to 1.0 elif node.op_type == "Softmax": + if not self.should_quantize_node(node): + continue self.tensors_range[node.output[0]] = TensorData(lowest=np.float32(0.0), highest=np.float32(1.0)) diff --git a/onnxruntime/python/tools/quantization/operators/pad.py b/onnxruntime/python/tools/quantization/operators/pad.py index 5f3c1231e62d6..b3e9ddb5e6278 100644 --- a/onnxruntime/python/tools/quantization/operators/pad.py +++ b/onnxruntime/python/tools/quantization/operators/pad.py @@ -1,3 +1,12 @@ +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from __future__ import annotations + +from typing import Any + +import numpy as np import onnx from ..quant_utils import ( @@ -8,6 +17,7 @@ quantize_nparray, ) from .base_operator import QuantOperatorBase +from .qdq_base_operator import QDQOperatorBase class QPad(QuantOperatorBase): @@ -98,3 +108,65 @@ def quantize(self): node.input[0] = quantized_input_value.q_name node.output[0] = quantized_output_value.q_name self.quantizer.new_nodes += [node] + + +class QDQPad(QDQOperatorBase): + def __init__(self, onnx_quantizer, onnx_node): + super().__init__(onnx_quantizer, onnx_node) + + def _get_pad_const_val(self, attrs_dict: dict[str, Any]) -> np.ndarray | None: + """ + Returns the Pad's constant padding value. Returns `None` if the padding value is + not constant (i.e., comes from a dynamic input). 
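+
+        A minimal illustration (hypothetical node names, opset >= 11): for a Pad
+        with inputs ["x", "pads", "cval"], this returns the initializer value of
+        "cval" when it is a graph constant, None when "cval" is produced
+        dynamically by another node, and 0 when the input is omitted. For
+        opset < 11, the value comes from the "value" attribute (default 0).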
+ """ + const_val = None + onnx_tensor_type = self.quantizer.model.get_tensor_type(self.node.input[0]) + if onnx_tensor_type is None: + return None + + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(onnx_tensor_type.elem_type) + if self.quantizer.opset_version < 11: + const_val = np.array(attrs_dict.get("value", 0), dtype=np_dtype) + elif len(self.node.input) >= 3 and self.node.input[2]: + const_val = self.quantizer.model.get_constant_value(self.node.input[2]) + else: + const_val = np.array(0, dtype=np_dtype) + + return const_val + + def _should_quantize_output_same_as_input(self) -> bool: + """ + Returns true if Pad's output should use the same quantization parameters as input[0] + """ + attrs_dict = {} + for attribute in self.node.attribute: + kv = attribute_to_kwarg(attribute) + attrs_dict.update(kv) + + pad_mode = attrs_dict.get("mode", b"constant") + if pad_mode in (b"reflect", b"edge", b"wrap"): + # These modes pad the output with a value that already exists in the input. + # So, we can quantize the output the same as the input. + return True + + # For 'constant' mode, if padding with 0, we can also quantize the output the same as the input + # because our quantization floating-point range always includes 0. + if pad_mode == b"constant": + pad_val = self._get_pad_const_val(attrs_dict) + if pad_val is not None and pad_val.dtype in (np.float32, np.float16): + return float(pad_val.item()) == 0 + + return False + + def quantize(self): + assert self.node.op_type == "Pad" + + for input_name in self.node.input: + if input_name: + self.quantizer.quantize_activation_tensor(input_name) + + if not self.disable_qdq_for_node_output: + if self._should_quantize_output_same_as_input(): + self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name) + else: + self.quantizer.quantize_activation_tensor(self.node.output[0]) diff --git a/onnxruntime/python/tools/quantization/registry.py b/onnxruntime/python/tools/quantization/registry.py index 160b056e1de17..fbeae39c39d21 100644 --- a/onnxruntime/python/tools/quantization/registry.py +++ b/onnxruntime/python/tools/quantization/registry.py @@ -14,7 +14,7 @@ from .operators.matmul import MatMulInteger, QDQMatMul, QLinearMatMul from .operators.maxpool import QDQMaxPool, QMaxPool from .operators.norm import QDQNormalization -from .operators.pad import QPad +from .operators.pad import QDQPad, QPad from .operators.pooling import QLinearPool from .operators.qdq_base_operator import QDQOperatorBase from .operators.resize import QDQResize, QResize @@ -76,6 +76,8 @@ "Resize": QDQResize, "MaxPool": QDQMaxPool, "AveragePool": QDQDirect8BitOp, + "Slice": QDQDirect8BitOp, + "Pad": QDQPad, "MatMul": QDQMatMul, "Split": QDQSplit, "Gather": QDQGather, diff --git a/onnxruntime/test/python/quantization/op_test_utils.py b/onnxruntime/test/python/quantization/op_test_utils.py index cf7fc292ea86b..82193d08684c6 100644 --- a/onnxruntime/test/python/quantization/op_test_utils.py +++ b/onnxruntime/test/python/quantization/op_test_utils.py @@ -1,3 +1,10 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
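+#
+# Shared helpers for the quantization op tests: data readers, model/op-count
+# checkers, and (added below) a map from tensor names to consumer/producer nodes.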
+# -------------------------------------------------------------------------- +from __future__ import annotations + import uuid from pathlib import Path @@ -661,3 +668,29 @@ def generate_random_initializer(initializer_name, tensor_shape, tensor_dtype, me tensor = np.random.normal(mean, dev, tensor_shape).astype(tensor_dtype) init = onnx.numpy_helper.from_array(tensor, initializer_name) return init + + +def get_tensor_consumers_and_producers( + model: onnx.ModelProto, +) -> tuple[dict[str, list[onnx.NodeProto]], dict[str, onnx.NodeProto]]: + """ + Returns a tuple containing the following python dictionaries: + - consumers: maps a tensor name to the list of nodes that have that tensor as an input. + - producers: maps a tensor name to the node that generates this tensor as an output. + """ + consumers: dict[str, list[onnx.NodeProto]] = {} + producers: dict[str, onnx.NodeProto] = {} + for node in model.graph.node: + # Iterate through node's inputs to build the consumers dictionary. + for input_name in node.input: + if input_name: + if input_name not in consumers: + consumers[input_name] = [] + + consumers[input_name].append(node) + + # Iterate through node's outputs to build the producers dictionary. + for output_name in node.output: + producers[output_name] = node + + return (consumers, producers) diff --git a/onnxruntime/test/python/quantization/test_op_pad.py b/onnxruntime/test/python/quantization/test_op_pad.py index 291bf42405d58..05736019cd7c8 100644 --- a/onnxruntime/test/python/quantization/test_op_pad.py +++ b/onnxruntime/test/python/quantization/test_op_pad.py @@ -4,14 +4,23 @@ # Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- +from __future__ import annotations import itertools +import os +import tempfile import unittest import numpy as np import onnx from onnx import TensorProto, helper -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type +from op_test_utils import ( + TestDataFeeds, + check_model_correctness, + check_op_type_count, + check_qtype_by_node_type, + get_tensor_consumers_and_producers, +) from onnxruntime.quantization import QuantFormat, QuantType, quantize_dynamic, quantize_static @@ -519,5 +528,159 @@ def test_pad_with_empty_string_input_name(self): self.assertNotEqual(name, "_quantized") +class TestQDQPad(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.qdq.pad_") + + # Note: swap with the commented line if you want to see the models in local test dir. + cls._tmp_dir_path = cls._tmp_model_dir.name + # cls._tmp_dir_path = "." + + @classmethod + def tearDownClass(cls): + cls._tmp_model_dir.cleanup() + + def build_pad_model( + self, + mode: str, + constant_value: float | None = None, + opset: int = 21, + float_type: onnx.TensorProto.DataType = onnx.TensorProto.FLOAT, + ) -> onnx.ModelProto: + input_0 = onnx.helper.make_tensor_value_info("input_0", float_type, (3, 2)) + output_0 = onnx.helper.make_tensor_value_info("output_0", float_type, (3, 4)) + + initializers = [] + pad_input_names = ["input_0"] + attrs = {"mode": mode} + + pads_data = np.array([0, 2, 0, 0], dtype=np.int64) # Pad two vals at beginning of axis 1. 
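+        # With these pads, an input of shape (3, 2) becomes (3, 4): each row gains
+        # two leading values (the constant, or a reflected/edge/wrapped element,
+        # depending on "mode"). Pad-11 and later take "pads"/"constant_value" as
+        # inputs, while older opsets use node attributes, hence the branches below.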
+ if opset >= 11: + initializers.append(onnx.numpy_helper.from_array(pads_data, "pads")) + pad_input_names.append("pads") + else: + attrs["pads"] = pads_data.tolist() + + if mode == "constant" and constant_value is not None: + if opset >= 11: + initializers.append(onnx.helper.make_tensor("constant_value", float_type, [], [constant_value])) + pad_input_names.append("constant_value") + else: + attrs["value"] = float(constant_value) + + pad_node = onnx.helper.make_node("Pad", pad_input_names, ["output_0"], name="Pad0", **attrs) + + graph = onnx.helper.make_graph( + [pad_node], + "PadFloat", + [input_0], + [output_0], + initializer=initializers, + ) + opset_imports = [onnx.helper.make_opsetid("", opset)] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + model = onnx.shape_inference.infer_shapes(model) + onnx.checker.check_model(model, True) + return model + + def test_qdq_pad_qparams(self): + """ + Test that QDQ Pad has equal scale/zero-point for its input and output for certain configurations. + """ + test_configs = [ + # Opset 21 + ("constant", None, 21, onnx.TensorProto.FLOAT), + ("constant", None, 21, onnx.TensorProto.FLOAT16), + ("constant", 0, 21, onnx.TensorProto.FLOAT), + ("constant", 0, 21, onnx.TensorProto.FLOAT16), + ("constant", 10.0, 21, onnx.TensorProto.FLOAT), + ("constant", 10.0, 21, onnx.TensorProto.FLOAT16), + ("reflect", None, 21, onnx.TensorProto.FLOAT), + ("reflect", None, 21, onnx.TensorProto.FLOAT16), + ("edge", None, 21, onnx.TensorProto.FLOAT), + ("edge", None, 21, onnx.TensorProto.FLOAT16), + ("wrap", None, 21, onnx.TensorProto.FLOAT), + ("wrap", None, 21, onnx.TensorProto.FLOAT16), + # Model with opset 10 will use pad of opset 2, which uses attributes instead of inputs. + # Opset 10 Q/DQ ops don't support float16. + ("constant", None, 10, onnx.TensorProto.FLOAT), + ("constant", 0, 10, onnx.TensorProto.FLOAT), + ("constant", 10.0, 10, onnx.TensorProto.FLOAT), + ("reflect", None, 10, onnx.TensorProto.FLOAT), + ("edge", None, 10, onnx.TensorProto.FLOAT), + ] + + for pad_mode, constant_value, opset, float_type in test_configs: + with self.subTest(pad_mode=pad_mode, constant_value=constant_value, opset=opset, float_type=float_type): + label = f"_{pad_mode}_{constant_value}_opset{opset}_{onnx.TensorProto.DataType.Name(float_type)}" + float_model_path = os.path.join(self._tmp_dir_path, f"pad{label}.float.onnx") + qdq_model_path = os.path.join(self._tmp_dir_path, f"pad{label}.qdq.onnx") + + float_model = self.build_pad_model(pad_mode, constant_value, opset=opset, float_type=float_type) + onnx.save_model(float_model, float_model_path) + + # Create a data reader + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(float_type) + input_data_list = [ + {"input_0": np.array([[1.0, 1.2], [2.3, 3.4], [4.5, 5.7]], dtype=np_dtype)}, + {"input_0": np.array([[2.3, 3.4], [4.5, 5.7], [1.0, 1.2]], dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # quantize model to QDQ + quantize_static( + float_model_path, + qdq_model_path, + data_reader, + quant_format=QuantFormat.QDQ, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + ) + + expected_op_counts = {"DequantizeLinear": 2, "QuantizeLinear": 2, "Pad": 1} + if constant_value is not None and opset >= 11: + expected_op_counts["DequantizeLinear"] += 1 # The constant padding value is quantized. 
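+                # Expected QDQ layout (sketch; tensor names illustrative):
+                #   input_0 -> Q -> DQ -> Pad -> Q -> DQ -> output_0
+                # plus one extra DQ feeding Pad's constant_value when that value
+                # is quantized.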
+ check_op_type_count(self, qdq_model_path, **expected_op_counts) + + if pad_mode != "reflect": + # Do not check model correctness for 'reflect' mode because ONNX Runtime implementation does + # not match the ONNX reference implementation. See the following issue: + # https://github.com/microsoft/onnxruntime/issues/20801 + data_reader.rewind() + check_model_correctness(self, float_model_path, qdq_model_path, data_reader.get_next()) + + qdq_model = onnx.load_model(qdq_model_path) + quant_output_same_as_input = False + + if pad_mode in ("reflect", "edge", "wrap"): + quant_output_same_as_input = True + + if pad_mode == "constant" and constant_value in (None, 0): + quant_output_same_as_input = True + + pad_node = next((node for node in qdq_model.graph.node if node.op_type == "Pad"), None) + self.assertNotEqual(pad_node, None) + self.assertEqual(pad_node.op_type, "Pad") + + # Get the parent and child nodes of the Pad and check that they are DQ/Q. + consumers, producers = get_tensor_consumers_and_producers(qdq_model) + input_dq_node = producers.get(pad_node.input[0], None) + self.assertNotEqual(input_dq_node, None) + self.assertEqual(input_dq_node.op_type, "DequantizeLinear") + + output_q_node = consumers.get(pad_node.output[0], [None])[0] + self.assertNotEqual(output_q_node, None) + self.assertEqual(output_q_node.op_type, "QuantizeLinear") + + # Check that the Pad's input DQ uses the same scale/zp as the Pad's output Q. + if quant_output_same_as_input: + self.assertEqual(input_dq_node.input[1], output_q_node.input[1]) # Same scale + self.assertEqual(input_dq_node.input[2], output_q_node.input[2]) # Same zero-point + else: + self.assertNotEqual(input_dq_node.input[1], output_q_node.input[1]) + self.assertNotEqual(input_dq_node.input[2], output_q_node.input[2]) + + if __name__ == "__main__": unittest.main() diff --git a/onnxruntime/test/python/quantization/test_op_slice.py b/onnxruntime/test/python/quantization/test_op_slice.py new file mode 100644 index 0000000000000..bfb9fc6b46bbd --- /dev/null +++ b/onnxruntime/test/python/quantization/test_op_slice.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- +from __future__ import annotations + +import os +import tempfile +import unittest + +import numpy as np +import onnx +from op_test_utils import ( + TestDataFeeds, + check_model_correctness, + check_op_type_count, + get_tensor_consumers_and_producers, +) + +from onnxruntime.quantization import QuantFormat, QuantType, quantize_static + + +class TestQDQSlice(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.qdq.slice_") + + # Note: swap with the commented line if you want to see the models in local test dir. + cls._tmp_dir_path = cls._tmp_model_dir.name + # cls._tmp_dir_path = "." + + @classmethod + def tearDownClass(cls): + cls._tmp_model_dir.cleanup() + + def build_slice_model( + self, + input_shape: list[int], + input_tensor_type: onnx.TensorProto.DataType, + starts: list[int], + ends: list[int], + axes: list[int] | None = None, + steps: list[int] | None = None, + ) -> onnx.ModelProto: + """ + Returns an onnx.ModelProto with a single Slice operator. 
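+
+        Example (illustrative): starts=[1, 0], ends=[2, 3], steps=[1, 2] on an
+        input of shape [2, 4] selects rows 1:2 and columns 0:3:2, giving an
+        output of shape [1, 2]. When "steps" is given without "axes", an empty
+        string is passed for the omitted optional "axes" input.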
+ """ + input_0 = onnx.helper.make_tensor_value_info("input_0", input_tensor_type, input_shape) + output_0 = onnx.helper.make_tensor_value_info("output_0", input_tensor_type, None) + + initializers = [ + onnx.numpy_helper.from_array(np.array(starts, dtype=np.int64), "starts"), + onnx.numpy_helper.from_array(np.array(ends, dtype=np.int64), "ends"), + ] + slice_input_names = ["input_0", "starts", "ends"] + + if axes: + initializers.append(onnx.numpy_helper.from_array(np.array(axes, dtype=np.int64), "axes")) + slice_input_names.append("axes") + + if steps: + if not axes: + slice_input_names.append("") # Empty axes input. + initializers.append(onnx.numpy_helper.from_array(np.array(steps, dtype=np.int64), "steps")) + slice_input_names.append("steps") + + slice_node = onnx.helper.make_node("Slice", slice_input_names, ["output_0"], name="Slice0") + + graph = onnx.helper.make_graph( + [slice_node], + "SliceGraph", + [input_0], + [output_0], + initializer=initializers, + ) + opset_imports = [onnx.helper.make_opsetid("", 21)] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + model = onnx.shape_inference.infer_shapes(model) + onnx.checker.check_model(model, True) + return model + + def test_qdq_slice_qparams(self): + """ + Test that QDQ Slice has equal scale/zero-point for its input and output. + """ + test_configs = [onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16] + + for onnx_tensor_type in test_configs: + with self.subTest(onnx_tensor_type=onnx_tensor_type): + label = f"{onnx.TensorProto.DataType.Name(onnx_tensor_type)}" + float_model_path = os.path.join(self._tmp_dir_path, f"slice.{label}.onnx") + qdq_model_path = os.path.join(self._tmp_dir_path, f"slice.{label}.qdq.onnx") + + input_shape = [2, 4] + float_model = self.build_slice_model( + input_shape=input_shape, + input_tensor_type=onnx_tensor_type, + starts=[1, 0], + ends=[2, 3], + axes=None, + steps=[1, 2], + ) + onnx.save_model(float_model, float_model_path) + + # Create a data reader + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(onnx_tensor_type) + input_data_list = [ + {"input_0": np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]], dtype=np_dtype)}, + {"input_0": np.array([[-1.0, -2.0, -3.0, -4.0], [-5.0, -6.0, -7.0, -8.0]], dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # quantize model to QDQ + quantize_static( + float_model_path, + qdq_model_path, + data_reader, + quant_format=QuantFormat.QDQ, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + extra_options={"ForceQuantizeNoInputCheck": True}, + ) + expected_op_counts = {"DequantizeLinear": 2, "QuantizeLinear": 2, "Slice": 1} + check_op_type_count(self, qdq_model_path, **expected_op_counts) + + data_reader.rewind() + check_model_correctness(self, float_model_path, qdq_model_path, data_reader.get_next()) + + qdq_model = onnx.load_model(qdq_model_path) + + slice_node = next((node for node in qdq_model.graph.node if node.op_type == "Slice"), None) + self.assertNotEqual(slice_node, None) + self.assertEqual(slice_node.op_type, "Slice") + + # Get the parent and child nodes of the Slice and check that they are DQ/Q. 
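+                # Expected layout (sketch): input_0 -> Q -> DQ -> Slice -> Q -> DQ -> output_0,
+                # with the input DQ and the output Q sharing scale/zero-point initializers.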
+ consumers, producers = get_tensor_consumers_and_producers(qdq_model) + input_dq_node = producers.get(slice_node.input[0], None) + self.assertNotEqual(input_dq_node, None) + self.assertEqual(input_dq_node.op_type, "DequantizeLinear") + + output_q_node = consumers.get(slice_node.output[0], [None])[0] + self.assertNotEqual(output_q_node, None) + self.assertEqual(output_q_node.op_type, "QuantizeLinear") + + # Check that the Slice's input DQ uses the same scale/zp as the Slice's output Q. + self.assertEqual(input_dq_node.input[1], output_q_node.input[1]) + self.assertEqual(input_dq_node.input[2], output_q_node.input[2]) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/quantization/test_op_softmax.py b/onnxruntime/test/python/quantization/test_op_softmax.py index 3416198450137..e5bc6288c91e2 100644 --- a/onnxruntime/test/python/quantization/test_op_softmax.py +++ b/onnxruntime/test/python/quantization/test_op_softmax.py @@ -213,6 +213,40 @@ def test_quantize_softmax(self): self.quantize_softmax_test_qop(QuantType.QUInt8, QuantType.QUInt8) self.quantize_softmax_test_qdq(QuantType.QUInt8, QuantType.QUInt8) + def test_bug_fix_exclude_softmax(self): + """ + Test fix to bug that happens when softmax is excluded from quantization, but + the quantization tool still tries to assign it a tensor range of [0.0, 1.0]. + """ + np.random.seed(1) + model_fp32_path = "softmax_fp32.onnx" + model_qdq_path = "softmax_bug_exclude_softmax.qdq.onnx" + self.construct_model_conv_softmax( + model_fp32_path, + [1, 2, 26, 42], + [3, 2, 3, 3], + [1, 3, 24, 40], + {"axis": -2}, + [1, 3, 24, 40], + add_ms_domain_opset=False, + ) + data_reader = self.input_feeds(1, {"input": [1, 2, 26, 42]}) + data_reader.rewind() + + # Bug would cause an exception during quantization. + quantize_static( + model_fp32_path, + model_qdq_path, + data_reader, + quant_format=QuantFormat.QDQ, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + nodes_to_exclude=["Softmax"], + ) + + qdq_model = onnx.load(Path(model_qdq_path)) + self.assertIn("Softmax", {node.op_type for node in qdq_model.graph.node}) + def test_quantize_softmax_s8s8(self): self.quantize_softmax_test_qop( QuantType.QInt8,