From 52cfc4a2ac4c9feb729ad7acd2adbfb0e1a41207 Mon Sep 17 00:00:00 2001
From: Christoph Berganski <christoph.berganski@gmail.com>
Date: Wed, 13 Mar 2024 10:17:08 +0100
Subject: [PATCH 01/23] Fix clipping range issue in RoundAndClipThresholds
 transformation

---
 src/finn/transformation/streamline/round_thresholds.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py
index 5ba5ee0ff5..2bf3630cff 100644
--- a/src/finn/transformation/streamline/round_thresholds.py
+++ b/src/finn/transformation/streamline/round_thresholds.py
@@ -57,10 +57,10 @@ def apply(self, model):
                     model.set_tensor_datatype(n.input[1], idtype)
                     graph_modified = True
                 if idtype.is_integer() and (
-                    (Tnew < (idtype.min() - 1)).any() or (Tnew > (idtype.max() + 1)).any()
+                    (Tnew < (idtype.min())).any() or (Tnew > (idtype.max())).any()
                 ):
                     # clip any large thresholds to input range + 1
-                    Tnew = np.clip(Tnew, idtype.min() - 1, idtype.max() + 1)
+                    Tnew = np.clip(Tnew, idtype.min(), idtype.max())
                     model.set_initializer(n.input[1], Tnew)
                     # use same datatype as inputs for thresholds
                     model.set_tensor_datatype(n.input[1], idtype)

From c8292e2a27bebb2254f278e409b00f448c35e600 Mon Sep 17 00:00:00 2001
From: Christoph Berganski <christoph.berganski@gmail.com>
Date: Sat, 6 Apr 2024 17:06:03 +0200
Subject: [PATCH 02/23] Rework RoundAndClipThresholds to avoid range and type
 promotion issues

See https://github.com/Xilinx/finn/issues/978
---
 .../streamline/round_thresholds.py            | 105 +++++++++++++-----
 1 file changed, 76 insertions(+), 29 deletions(-)

diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py
index 2bf3630cff..2666242730 100644
--- a/src/finn/transformation/streamline/round_thresholds.py
+++ b/src/finn/transformation/streamline/round_thresholds.py
@@ -26,43 +26,90 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+# Need numpy for modifying the onnx graph tensors, which are numpy style arrays
 import numpy as np
+
+# QONNX wrapper of ONNX model graphs
+from qonnx.core.modelwrapper import ModelWrapper
+
+# QONNX graph transformation base class
 from qonnx.transformation.base import Transformation
 
+# Transformation running qonnx datatype inference
+from qonnx.transformation.infer_datatypes import InferDataTypes
+
 
+# Rounds and clips thresholds to integer values if the node inputs are integer,
+# respecting range, representability and data type (promotion) of the container
+# data type
 class RoundAndClipThresholds(Transformation):
     """For MultiThreshold nodes operating on integer inputs, round up
     thresholds values to the nearest integer. Additionally, if the input
-    is unsigned, sets negative thresholds to zero."""
+    is unsigned, sets negative thresholds to zero. Type-casts thresholds (back)
+    to the float32 container type (this is separate from the quantization
+    annotation). Runs InferDataTypes() afterward to propagate any changes to the
+    quantization data types."""
 
-    def apply(self, model):
+    # Applies the transform to a whole model graph
+    def apply(self, model: ModelWrapper):  # noqa
+        # Get the model graph out of the model wrapper object
         graph = model.graph
+        # Keep track of whether the graph has been modified
         graph_modified = False
-        for n in graph.node:
-            if n.op_type == "MultiThreshold":
-                idtype = model.get_tensor_datatype(n.input[0])
-                T = model.get_initializer(n.input[1])
-                Tnew = np.ceil(T)
-                if idtype.is_integer() and (T != Tnew).any():
-                    # round up the thresholds to nearest integer
-                    model.set_initializer(n.input[1], Tnew)
-                    # use same datatype as inputs for thresholds
-                    model.set_tensor_datatype(n.input[1], idtype)
-                    graph_modified = True
-                if idtype.is_integer() and not idtype.signed() and (Tnew < 0).any():
-                    # clip any negative thresholds if input is unsigned
-                    Tnew = np.clip(Tnew, 0, None)
-                    model.set_initializer(n.input[1], Tnew)
-                    # use same datatype as inputs for thresholds
-                    model.set_tensor_datatype(n.input[1], idtype)
-                    graph_modified = True
-                if idtype.is_integer() and (
-                    (Tnew < (idtype.min())).any() or (Tnew > (idtype.max())).any()
-                ):
-                    # clip any large thresholds to input range + 1
-                    Tnew = np.clip(Tnew, idtype.min(), idtype.max())
-                    model.set_initializer(n.input[1], Tnew)
-                    # use same datatype as inputs for thresholds
-                    model.set_tensor_datatype(n.input[1], idtype)
+        # Iterate all nodes in the graph keeping track of the index
+        for index, node in enumerate(graph.node):
+            # Applies to initializer tensors of MultiThreshold operations
+            if node.op_type == "MultiThreshold":
+                # Try to get the thresholds initializer tensor
+                thresholds = model.get_initializer(node.input[1])
+                # There might be no constant thresholds stored as initializer
+                # tensor inside the model
+                if thresholds is None:
+                    # Nothing we can do, skip to the next node
+                    continue
+                # Get the data type of the inputs to this operation
+                dtype = model.get_tensor_datatype(node.input[0])
+                # This transformation only applies to thresholding operations
+                # operating on integer inputs
+                if not dtype.is_integer():
+                    # Nothing we can do, skip to the next node
+                    continue
+                # Round thresholds up to nearest integer and clip thresholds
+                # outside the input range
+                #   Note: This might promote the thresholds to float64 and
+                #   introduce extra inaccuracies due to large integers not being
+                #   exactly representable in floating-point representation.
+                #   See for example: np.ceil(np.float32(16777217)) == 16777216
+                # fmt: off
+                new_thresholds = np.clip(
+                    np.ceil(thresholds), dtype.min(), dtype.max()
+                )
+                # fmt: on
+                # Convert back to the preferred float32 container type
+                #   Note: np.clip might have promoted the thresholds to float64
+                #   TODO: Maybe consider an int64 container type for thresholds
+                #    rounded to integer? Need to check all other transformations
+                #    and code generation through the whole FINN and QONNX stack
+                #    first, as these probably assume a float32 container type.
+                new_thresholds = new_thresholds.astype(np.float32)
+                # Insert the rounded and clipped thresholds back into the model
+                model.set_initializer(node.input[1], new_thresholds)
+                # The rounded and clipped thresholds now fit into the input data
+                # type
+                model.set_tensor_datatype(node.input[1], dtype)
+                # Test whether the new thresholds actually differ from the old
+                # ones
+                if np.any(new_thresholds != thresholds):
+                    # Track that the graph has been modified to inform the
+                    # transform container to exhaustively repeat this
+                    # transformation until no changes are possible
                     graph_modified = True
-        return (model, graph_modified)
+                    # Immediately exit here to propagate the data type changes
+                    # before considering the next node
+                    break
+        # Some data types might have changed, do one pass of data type inference
+        # to propagate these changes through the graph
+        model = model.transform(InferDataTypes())
+        # Return the transformed model and indicate whether the graph actually
+        # has been transformed to exhaustively apply this transformation again.
+        return model, graph_modified

From 3109645cb2a2bb764bd982948a36e2788756efc1 Mon Sep 17 00:00:00 2001
From: Christoph Berganski <christoph.berganski@gmail.com>
Date: Sat, 6 Apr 2024 17:10:36 +0200
Subject: [PATCH 03/23] [Tests] Rework test-cases for reworked
 RoundAndClipThresholds

See https://github.com/Xilinx/finn/issues/978
---
 .../streamline/test_round_thresholds.py       | 257 ++++++++++++++++--
 1 file changed, 227 insertions(+), 30 deletions(-)

diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py
index 85c60b37d5..63375598a0 100644
--- a/tests/transformation/streamline/test_round_thresholds.py
+++ b/tests/transformation/streamline/test_round_thresholds.py
@@ -26,45 +26,242 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+# fmt: off
+# Disable formatter. This is deliberately formatted to stay within 80 characters
+# per line. Black, however, formats some lines going beyond this.
+
+# Testing framework
 import pytest
 
+# Use numpy for python execution / computing the ground truth expected values
 import numpy as np
+
+# Utility types and function for creating onnx nodes and graphs
 from onnx import TensorProto, helper
+
+# QONNX data types like INT25
 from qonnx.core.datatype import DataType
+
+# QONNX wrapper of ONNX model graphs
 from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.util.basic import qonnx_make_model
 
+# Generate random tensors of QONNX/FINN data types for testing
+from qonnx.util.basic import gen_finn_dt_tensor
+
+# Execution of onnx graphs within FINN
 import finn.core.onnx_exec as oxe
+
+# The transformation to be tested
 from finn.transformation.streamline import RoundAndClipThresholds
 
 
-@pytest.mark.streamline
-def test_round_thresholds():
-    v = helper.make_tensor_value_info("v", TensorProto.FLOAT, [1, 4])
-    thresholds = helper.make_tensor_value_info("thresholds", TensorProto.FLOAT, [4, 1])
-    out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, 4])
-    node_def = helper.make_node(
-        "MultiThreshold", ["v", "thresholds"], ["out"], domain="qonnx.custom_op.general"
+# Tests the RoundAndClipThresholds transformation under various input, output
+# data type combinations with purely integer inputs. Without proper rounding,
+# this tests only the clipping, range and type-casting behavior of the
+# transformation.
+@pytest.mark.parametrize("i_dtype", [
+    # Explanation for selecting these test configurations:
+    # 1. Below 24-bit thresholds we will not observe any interesting rounding
+    #    behavior, as all integers < 2^24 can be exactly represented in 32-bit
+    #    floating-point. Thus, we test thresholds at 25-bit signed integers and
+    #    generate test inputs slightly above and below this.
+    # 2. We want to test out-of-range clipping of thresholds, in particular
+    #    clipping of the negative portion of signed thresholds. Thus, we only
+    #    generate signed thresholds, but test with signed and unsigned
+    #    inputs of smaller, larger and equal range.
+    # 3. Testing proper floating-point thresholds requires a separate test-case
+    "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26"
+])
+@pytest.mark.parametrize("o_dtype", [
+    # Explanation for selecting these test configurations:
+    # 1. Outputs of MultiThreshold are typically much smaller bit-width than the
+    #    inputs and thresholds.
+    # 2. However, with randomly sampled thresholds from a rather large range due
+    #    to the selected input bit-widths (see above), we risk not adequately
+    #    covering the input range if we sample too few thresholds. The number of
+    #    thresholds sampled depends on the bit-width of the output, thus we use
+    #    rather high bit-width for testing.
+    # 3. For a "real" model, the quantization procedure *should* take care of
+    #    adequately covering the true input range.
+    "INT8", "UINT8"
+])
+@pytest.mark.parametrize("n_elems", [
+    # Explanation for selecting these test configurations:
+    # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4
+    # 2. Large test case 256, hopefully amplifying any rarely occurring errors
+    1, 2, 3, 4, 256
+])
+def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems):
+    # Convert string representation of data type to QONNX DataType
+    i_dtype = DataType[i_dtype]
+    t_dtype = DataType["INT25"]  # Note: Matches configuration above
+    o_dtype = DataType[o_dtype]  # noqa: Duplicate model setup code
+    # Create a dummy MultiThreshold operation to be tested
+    node = helper.make_node(
+        # Op-Type of the node
+        "MultiThreshold",
+        # MultiThreshold is implemented under the qonnx domain
+        domain="qonnx.custom_op.general",
+        # List the names of the input tensors
+        inputs=["inp", "thresholds"],
+        # List the names of the output tensors
+        outputs=["out"],
+        # The CustomOp needs to know the data type of the output to be produced
+        out_dtype=str(o_dtype)
+    )
+    # Number of threshold values required to produce outputs of type o_dtype
+    n_thresholds = o_dtype.get_num_possible_values() - 1
+    # Create tensor value infos for all input/output tensors involved
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems])
+    out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems])
+    # Create a tensor value info for the thresholds parameter tensor
+    #   Note: Number of thresholds is determined by the output data type
+    thresholds = helper.make_tensor_value_info(
+        "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds]
+    )
+    # Combine node and tensor value infos into an onnx graph
+    graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out])
+    # Wrap the model graph in a ModelWrapper container
+    model = ModelWrapper(helper.make_model(graph))
+    # Sample random tensors of the configured input data type
+    inp = gen_finn_dt_tensor(i_dtype, [1, n_elems])
+    # Generate sorted thresholds for each of the input channels
+    thresholds = np.sort(gen_finn_dt_tensor(t_dtype, [n_elems, n_thresholds]))
+    # Set data type annotations for the input and thresholds tensor
+    model.set_tensor_datatype("inp", i_dtype)  # noqa: Duplicate model execution
+    model.set_tensor_datatype("thresholds", t_dtype)
+    model.set_tensor_datatype("out", o_dtype)
+    # Set the thresholds as initializer input to the model
+    model.set_initializer("thresholds", thresholds)
+    # Execute the model before running the RoundAndClipThresholds transformation
+    out_expected = oxe.execute_onnx(model, {"inp": inp})["out"]
+    # Before rounding the threshold data type must be as annotated
+    assert model.get_tensor_datatype("thresholds") == t_dtype
+    # Run the transformation to be tested
+    model = model.transform(RoundAndClipThresholds())
+    # After this transformation, the thresholds and output data type should be
+    # inferred correctly
+    assert model.get_tensor_datatype("thresholds") == i_dtype
+    assert model.get_tensor_datatype("out") == o_dtype
+    # After this transformation, the container type used to store the thresholds
+    # values must be float32. No other type-cast or type promotion may happen.
+    assert model.get_initializer("thresholds").dtype == np.float32
+    # After rounding, all thresholds must be integers represented as float32
+    assert all(
+        x.is_integer() for x in model.get_initializer("thresholds").flatten()
+    )
+    # Execute the model after running the RoundAndClipThresholds transformation
+    out_produced = oxe.execute_onnx(model, {"inp": inp})["out"]
+    # Compare the results before and after: This is the pure integer test-case
+    # and no actual rounding should happen, thus the rounded operation should
+    # produce outputs exactly equal.
+    assert np.all(out_produced == out_expected)
+
+
+# Tests the RoundAndClipThresholds transformation under various input, output
+# data type combinations with purely integer inputs. This test case tests actual
+# rounding of floating-point thresholds.
+@pytest.mark.parametrize("i_dtype", [
+    # Explanation for selecting these test configurations:
+    # 1. Below 24-bit thresholds we will not observe any interesting rounding
+    #    behavior, as all integers < 2^24 can be exactly represented in 32-bit
+    #    floating-point. Thus, we test thresholds at 25-bit signed integers and
+    #    generate test inputs slightly above and below this.
+    # 2. We want to test out-of-range clipping of thresholds, in particular
+    #    clipping of the negative portion of signed thresholds. Thus, we only
+    #    generate signed thresholds, but test with signed and unsigned
+    #    inputs of smaller, larger and equal range.
+    # 3. Testing proper floating-point thresholds requires a separate test-case
+    "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26"
+])
+@pytest.mark.parametrize("o_dtype", [
+    # Explanation for selecting these test configurations:
+    # 1. Outputs of MultiThreshold are typically much smaller bit-width than the
+    #    inputs and thresholds.
+    # 2. However, with randomly sampled thresholds from a rather large range due
+    #    to the selected input bit-widths (see above), we risk not adequately
+    #    covering the input range if we sample too few thresholds. The number of
+    #    thresholds sampled depends on the bit-width of the output, thus we use
+    #    rather high bit-width for testing.
+    # 3. For a "real" model, the quantization procedure *should* take care of
+    #    adequately covering the true input range.
+    "INT8", "UINT8"
+])
+@pytest.mark.parametrize("n_elems", [
+    # Explanation for selecting these test configurations:
+    # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4
+    # 2. Large test case 256, hopefully amplifying any rarely occurring errors
+    1, 2, 3, 4, 256
+])
+def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems):
+    # Convert string representation of data type to QONNX DataType
+    i_dtype = DataType[i_dtype]
+    t_dtype = DataType["FLOAT32"]
+    o_dtype = DataType[o_dtype]  # noqa: Duplicate model setup code
+    # Create a dummy MultiThreshold operation to be tested
+    node = helper.make_node(
+        # Op-Type of the node
+        "MultiThreshold",
+        # MultiThreshold is implemented under the qonnx domain
+        domain="qonnx.custom_op.general",
+        # List the names of the input tensors
+        inputs=["inp", "thresholds"],
+        # List the names of the output tensors
+        outputs=["out"],
+        # The CustomOp needs to know the data type of the output to be produced
+        out_dtype=str(o_dtype)
+    )
+    # Number of threshold values required to produce outputs of type o_dtype
+    n_thresholds = o_dtype.get_num_possible_values() - 1
+    # Create tensor value infos for all input/output tensors involved
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems])
+    out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems])
+    # Create a tensor value info for the thresholds parameter tensor
+    #   Note: Number of thresholds is determined by the output data type
+    thresholds = helper.make_tensor_value_info(
+        "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds]
+    )
+    # Combine node and tensor value infos into an onnx graph
+    graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out])
+    # Wrap the model graph in a ModelWrapper container
+    model = ModelWrapper(helper.make_model(graph))
+    # Sample random tensors of the configured input data type
+    inp = gen_finn_dt_tensor(i_dtype, [1, n_elems])
+    # Draw uniformly random prototype thresholds in [0,+1] range
+    thresholds = np.random.rand(n_elems, n_thresholds)
+    # Type alias to 25-bit signed integer type used to set the range of the
+    # thresholds
+    INT25 = DataType["INT25"]  # noqa: Variable name not lowercase
+    # Map the prototype thresholds into the test integer range and sort
+    thresholds = np.sort((INT25.max() - INT25.min()) * thresholds + INT25.min())
+    # Set data type annotations for the input and thresholds tensor
+    model.set_tensor_datatype("inp", i_dtype)  # noqa: Duplicate model execution
+    model.set_tensor_datatype("thresholds", t_dtype)
+    model.set_tensor_datatype("out", o_dtype)
+    # Set the thresholds as initializer input to the model
+    model.set_initializer("thresholds", thresholds)
+    # Execute the model before running the RoundAndClipThresholds transformation
+    out_expected = oxe.execute_onnx(model, {"inp": inp})["out"]
+    # Before rounding the threshold data type must be as annotated
+    assert model.get_tensor_datatype("thresholds") == t_dtype
+    # Run the transformation to be tested
+    model = model.transform(RoundAndClipThresholds())
+    # After this transformation, the thresholds and output data type should be
+    # inferred correctly
+    assert model.get_tensor_datatype("thresholds") == i_dtype
+    assert model.get_tensor_datatype("out") == o_dtype
+    # After this transformation, the container type used to store the thresholds
+    # values must be float32. No other type-cast or type promotion may happen.
+    assert model.get_initializer("thresholds").dtype == np.float32
+    # After rounding, all thresholds must be integers represented as float32
+    assert all(
+        x.is_integer() for x in model.get_initializer("thresholds").flatten()
     )
-    graph_def = helper.make_graph([node_def], "test_model", [v, thresholds], [out])
-    model_def = qonnx_make_model(graph_def)
-    model = ModelWrapper(model_def)
-    threshold_val = np.asarray([[-1.1], [0.7], [2.3], [5.1]], dtype=np.float32)
-    model.set_initializer("thresholds", threshold_val)
-    model.set_tensor_datatype("v", DataType["INT8"])
-    inp_dict_f = {"v": np.floor(threshold_val).T}
-    inp_dict_n = {"v": np.round(threshold_val).T}
-    inp_dict_c = {"v": np.ceil(threshold_val).T}
-    orig_f = oxe.execute_onnx(model, inp_dict_f)["out"]
-    orig_n = oxe.execute_onnx(model, inp_dict_n)["out"]
-    orig_c = oxe.execute_onnx(model, inp_dict_c)["out"]
-    assert model.get_tensor_datatype("thresholds") == DataType["FLOAT32"]
-    new_model = model.transform(RoundAndClipThresholds())
-    # rounded up thresholds should have same dtype as input
-    assert new_model.get_tensor_datatype("thresholds") == DataType["INT8"]
-    new_f = oxe.execute_onnx(new_model, inp_dict_f)["out"]
-    new_n = oxe.execute_onnx(new_model, inp_dict_n)["out"]
-    new_c = oxe.execute_onnx(new_model, inp_dict_c)["out"]
-    assert np.isclose(orig_f, new_f, atol=1e-3).all()
-    assert np.isclose(orig_n, new_n, atol=1e-3).all()
-    assert np.isclose(orig_c, new_c, atol=1e-3).all()
+    # Execute the model after running the RoundAndClipThresholds transformation
+    out_produced = oxe.execute_onnx(model, {"inp": inp})["out"]
+    # Compare the results before and after: This is the floating-point test with
+    # actual rounding, thus the transformed result may only be equal within some
+    # tolerance.
+    # Hm, never observed this to be relevant. For all test configurations, exact
+    # equality seems to hold, probably due to only integer inputs being tested.
+    assert np.allclose(out_produced, out_expected, atol=1.0e-3)

From e22201f800a573b88d55f9b0024454a8e10fa0d4 Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Fri, 16 Aug 2024 16:15:40 +0100
Subject: [PATCH 04/23] [HWop-MVAU] Ensure shape is compatible in execution
 function

---
 src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 1c86ae7b7a..8f0a987bce 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -130,6 +130,8 @@ def get_nodeattr_types(self):
     def execute_node(self, context, graph):
         node = self.onnx_node
         in_act = context[node.input[0]]
+        # ensure that shape is compatible
+        in_act = in_act.reshape(self.get_normal_input_shape())
         mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0]
         mvau_w = np_helper.to_array(mvau_w_init)
         # Matrix multiplication

From ec5613c68f209202cf7fefb21d383b0072a2441f Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Thu, 19 Sep 2024 10:15:08 +0100
Subject: [PATCH 05/23] [InsertFIFO] Preserve onnx tensor dtype when inserting
 FIFOs

---
 src/finn/transformation/fpgadataflow/insert_fifo.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index 9df193efcf..21fb843052 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -29,7 +29,6 @@
 
 import numpy as np
 import warnings
-from onnx import TensorProto
 from onnx import helper as oh
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.base import Transformation
@@ -114,6 +113,8 @@ def apply(self, model):
                         # determine fifo node attributes
                         fld_shape = n0.get_folded_output_shape()
                         dtype = n0.get_output_datatype()
+                        n0_otensor = model.get_tensor_valueinfo(output_name)
+                        n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type
 
                         # check if folded_shape of output of first node and
                         # input of the second node is equal
@@ -145,7 +146,7 @@ def apply(self, model):
                             # or unless create_shallow_fifos is specified
                             fifo_output_tensor = oh.make_tensor_value_info(
                                 model.make_new_valueinfo_name(),
-                                TensorProto.FLOAT,
+                                n0_tensor_dtype,
                                 n0.get_normal_output_shape(),
                             )
                             graph.value_info.append(fifo_output_tensor)
@@ -196,13 +197,15 @@ def apply(self, model):
                     fld_shape = n0.get_folded_input_shape(inp_ind)
                     n_shape = n0.get_normal_input_shape(inp_ind)
                     dtype = n0.get_input_datatype(inp_ind)
+                    n0_itensor = model.get_tensor_valueinfo(graph_in_name)
+                    n0_tensor_dtype = n0_itensor.type.tensor_type.elem_type
                     fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind]
 
                     if fifo_depth > 2 or self.create_shallow_fifos:
                         # create fifo node
                         fifo_output_tensor = oh.make_tensor_value_info(
                             model.make_new_valueinfo_name(),
-                            TensorProto.FLOAT,
+                            n0_tensor_dtype,
                             n0.get_normal_input_shape(inp_ind),
                         )
                         graph.value_info.append(fifo_output_tensor)
@@ -256,13 +259,15 @@ def apply(self, model):
                     fld_shape = n0.get_folded_output_shape(out_ind)
                     n_shape = n0.get_normal_output_shape(out_ind)
                     dtype = n0.get_output_datatype(out_ind)
+                    n0_otensor = model.get_tensor_valueinfo(graph_out_name)
+                    n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type
                     fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind]
 
                     if fifo_depth > 2 or self.create_shallow_fifos:
                         # create fifo node
                         fifo_input_tensor = oh.make_tensor_value_info(
                             model.make_new_valueinfo_name(),
-                            TensorProto.FLOAT,
+                            n0_tensor_dtype,
                             n0.get_normal_output_shape(),
                         )
                         graph.value_info.append(fifo_input_tensor)

From fb600553d5618d36be334f7dd6c99dea789b0c83 Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Thu, 19 Sep 2024 10:58:43 +0100
Subject: [PATCH 06/23] [InsertDWC] Preserve onnx tensor dtype when inserting
 DWCs

---
 src/finn/transformation/fpgadataflow/insert_dwc.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 33cc3e86d3..b56c8b74ea 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -26,7 +26,6 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from onnx import TensorProto
 from onnx import helper as oh
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.base import Transformation
@@ -110,12 +109,15 @@ def apply(self, model):
                             # determine shape for dwc
                             dwc_shape = n0.get_normal_output_shape()
 
-                            # determine dtype for dwc
+                            # determine FINN dtype for dwc
                             dtype = n0.get_output_datatype()
+                            # determine onnx tensor dtype for dwc
+                            n0_otensor = model.get_tensor_valueinfo(output_name)
+                            n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type
 
                             dwc_output_tensor = oh.make_tensor_value_info(
                                 model.make_new_valueinfo_name(),
-                                TensorProto.FLOAT,
+                                n0_tensor_dtype,
                                 dwc_shape,
                             )
                             graph.value_info.append(dwc_output_tensor)

From 03830929697464666b58be717ece8328bc6c6965 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 16 Sep 2024 13:28:15 +0000
Subject: [PATCH 07/23] [Fix] InferDuplicateStreamsLayer now properly handles
 forks of multiple-output nodes

---
 .../fpgadataflow/convert_to_hw_layers.py      | 96 +++++++++----------
 1 file changed, 48 insertions(+), 48 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index 25a2032aeb..b02bc89db8 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -585,63 +585,63 @@ def apply(self, model):
 
         for node in graph.node:
             node_ind += 1
-            successors = model.find_consumers(node.output[0])
-            if successors is not None and len(successors) >= 2:
-                output_tensor = node.output[0]
-                n_outputs = len(successors)
+            for output_tensor in node.output:
+                successors = model.find_consumers(output_tensor)
+                if successors is not None and len(successors) >= 2:
+                    n_outputs = len(successors)
 
-                dt = model.get_tensor_datatype(output_tensor)
+                    dt = model.get_tensor_datatype(output_tensor)
 
-                # skip conversion for layers with float input
-                if not dt.is_integer():
-                    continue
+                    # skip conversion for layers with float input
+                    if not dt.is_integer():
+                        continue
 
-                # create clone tensors
-                out_shape = model.get_tensor_shape(output_tensor)
-                out_tensor_clones = []
-                for i in range(n_outputs):
-                    clone = helper.make_tensor_value_info(
-                        model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
-                    )
-                    model.graph.value_info.append(clone)
-                    out_tensor_clones += [clone.name]
+                    # create clone tensors
+                    out_shape = model.get_tensor_shape(output_tensor)
+                    out_tensor_clones = []
+                    for i in range(n_outputs):
+                        clone = helper.make_tensor_value_info(
+                            model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
+                        )
+                        model.graph.value_info.append(clone)
+                        out_tensor_clones += [clone.name]
 
-                num_ch = int(out_shape[-1])
-                vecs = out_shape[:-1]
+                    num_ch = int(out_shape[-1])
+                    vecs = out_shape[:-1]
 
-                # create node with no parallelization first
-                pe = 1
+                    # create node with no parallelization first
+                    pe = 1
 
-                dup_node = helper.make_node(
-                    "DuplicateStreams",
-                    [output_tensor],
-                    out_tensor_clones,
-                    domain="finn.custom_op.fpgadataflow",
-                    backend="fpgadataflow",
-                    NumChannels=num_ch,
-                    PE=pe,
-                    inputDataType=dt.name,
-                    numInputVectors=vecs,
-                    NumOutputStreams=n_outputs,
-                    outFIFODepths=[2] * n_outputs,
-                    name="DuplicateStreams_" + node.name,
-                )
+                    dup_node = helper.make_node(
+                        "DuplicateStreams",
+                        [output_tensor],
+                        out_tensor_clones,
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        NumChannels=num_ch,
+                        PE=pe,
+                        inputDataType=dt.name,
+                        numInputVectors=vecs,
+                        NumOutputStreams=n_outputs,
+                        outFIFODepths=[2] * n_outputs,
+                        name="DuplicateStreams_" + node.name,
+                    )
 
-                graph.node.insert(node_ind, dup_node)
+                    graph.node.insert(node_ind, dup_node)
 
-                # connect successors to out tensor clone
-                clone_idx = 0
-                for successor in successors:
-                    for i, succ_input in enumerate(successor.input):
-                        if succ_input == output_tensor:
-                            successor.input[i] = out_tensor_clones[clone_idx]
-                            clone_idx += 1
-                            # if one node has multiple connections to the same output
-                            # find_direct_successors will return one node per input
-                            # so break the inner loop will result in correct behaviour
-                            break
+                    # connect successors to out tensor clone
+                    clone_idx = 0
+                    for successor in successors:
+                        for i, succ_input in enumerate(successor.input):
+                            if succ_input == output_tensor:
+                                successor.input[i] = out_tensor_clones[clone_idx]
+                                clone_idx += 1
+                                # if one node has multiple connections to the same output,
+                                # find_direct_successors will return one node per input,
+                                # so breaking out of the inner loop gives the correct behaviour
+                                break
 
-                graph_modified = True
+                    graph_modified = True
 
         if graph_modified:
             model = model.transform(SortGraph())

From d13aa7e7debb21bd1d75b6dbb6eddc959b4ae8c8 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 16 Sep 2024 13:48:43 +0000
Subject: [PATCH 08/23] [Fix] MoveScalarLinearPastInvariants, MakeMaxPoolNHWC,
 MakeScaleResizeNHWC transformations are checking whether the node to be moved
 is a fork node, in which case the MoveOpPastFork is called. MoveOpPastFork
 uses deepcopies of the original node.

---
 src/finn/transformation/streamline/reorder.py | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 8ac2d7dad6..9a7e9d0723 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -29,6 +29,7 @@
 import numpy as np
 import qonnx.core.data_layout as DataLayout
 import warnings
+from copy import deepcopy
 from onnx import TensorProto
 from onnx import helper as oh
 from qonnx.core.datatype import DataType
@@ -641,6 +642,10 @@ def apply(self, model):
                     # if initializer is not scalar, skip
                     if np.prod(init0.shape) != 1:
                         continue
+                    if model.is_fork_node(prod0):
+                        model = model.transform(MoveOpPastFork(prod0.op_type))
+                        # topology modified, "ask" ModelWrapper to apply this transform again
+                        return (model, True)
                     # Flatten input if required
                     if len(init0.shape) > 0:
                         init0 = init0.flatten()[0]
@@ -713,6 +718,12 @@ def apply(self, model):
                 elif producer is not None and producer.op_type == "Transpose":
                     perms = list(get_by_name(producer.attribute, "perm").ints)
                     if perms == [0, 3, 1, 2]:
+                        # check if the producer is a fork node
+                        # (need to move it past the fork before this transform)
+                        if model.is_fork_node(producer):
+                            model = model.transform(MoveTransposePastFork())
+                            # topology modified, "ask" ModelWrapper to apply this transform again
+                            return (model, True)
                         ceil_mode = get_by_name(n.attribute, "ceil_mode")
                         if ceil_mode is not None:
                             ceil_mode = ceil_mode.i
@@ -764,6 +775,12 @@ def apply(self, model):
                 if producer is not None and producer.op_type == "Transpose":
                     perms = list(get_by_name(producer.attribute, "perm").ints)
                     if perms == [0, 3, 1, 2]:
+                        # check if the producer is a fork node
+                        # (need to move it past the fork before this transform)
+                        if model.is_fork_node(producer):
+                            model = model.transform(MoveTransposePastFork())
+                            # topology modified, "ask" ModelWrapper to apply this transform again
+                            return (model, True)
                         old_value = model.get_initializer(n.input[scales_ind])
                         new_value = np.array(
                             [old_value[idx] for idx in (0, 2, 3, 1)],
@@ -813,10 +830,9 @@ class MoveOpPastFork(Transformation):
     can be merged with nodes in the branches
     """
 
-    def __init__(self, op_name_list, get_attrs_fxn=lambda x: {}):
+    def __init__(self, op_name_list):
         super().__init__()
         self.ops_to_move = op_name_list
-        self.get_attrs_fxn = get_attrs_fxn
 
     def apply(self, model):
         graph = model.graph
@@ -859,11 +875,9 @@ def apply(self, model):
                         new_param_name = model.make_new_valueinfo_name()
                         new_inp_list = [n.input[0], new_param_name]
                         model.set_initializer(new_param_name, op_init_param)
-                    attrs = self.get_attrs_fxn(n)
-                    # TODO use copy of original node instead to get attrs?
-                    new_node = oh.make_node(
-                        n.op_type, new_inp_list, [new_output_tensor_name], **attrs
-                    )
+                    new_node = deepcopy(n)
+                    new_node.input[:] = new_inp_list
+                    new_node.output[:] = [new_output_tensor_name]
                     graph.node.insert(node_ind, new_node)
                     node_ind += 1
 
@@ -901,7 +915,7 @@ def __init__(self):
 
 class MoveTransposePastFork(MoveOpPastFork):
     def __init__(self):
-        super().__init__(["Transpose"], lambda x: {"perm": get_by_name(x.attribute, "perm").ints})
+        super().__init__(["Transpose"])
 
 
 class MoveMaxPoolPastMultiThreshold(Transformation):

From 6223abe86c7d9aee43788825f3c19545dab0ea54 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz <mdaniowi@amd.com>
Date: Mon, 16 Sep 2024 13:59:14 +0000
Subject: [PATCH 09/23] [Fix] InsertFIFO transform is fixed for the case of the
 last node in the graph being a fork node

---
 src/finn/transformation/fpgadataflow/insert_fifo.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index 21fb843052..9ed0f51cd4 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -268,7 +268,7 @@ def apply(self, model):
                         fifo_input_tensor = oh.make_tensor_value_info(
                             model.make_new_valueinfo_name(),
                             n0_tensor_dtype,
-                            n0.get_normal_output_shape(),
+                            n0.get_normal_output_shape(out_ind),
                         )
                         graph.value_info.append(fifo_input_tensor)
                         model.set_tensor_datatype(fifo_input_tensor.name, dtype)
@@ -294,7 +294,7 @@ def apply(self, model):
                         graph.node.append(fifo_node)
 
                         # set fifo output tensor as new input tensor of second node
-                        final_node.output[0] = fifo_input_tensor.name
+                        final_node.output[out_ind] = fifo_input_tensor.name
                     else:
                         warnings.warn(
                             """Output FIFO for %s has depth %d and won't

From 11d8234fdcfb03c00a700dd3ba82cb88d6da66e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 25 Sep 2024 13:27:04 +0100
Subject: [PATCH 10/23] Harden lane width computations against 32-bit numeric
 overflow.

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 107a00918e..dabb36647e 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -72,6 +72,10 @@ module mvu_8sx8u_dsp48 #(
 		return  res;
 	endfunction : init_leave_loads
 
+	function int unsigned sum_width(input int unsigned  n, input int unsigned  w);
+		return	w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n);
+	endfunction : sum_width
+
 	// Pipeline for last indicator flag
 	logic [1:5] L = '0;
 	always_ff @(posedge clk) begin
@@ -445,7 +449,7 @@ module mvu_8sx8u_dsp48 #(
 		// Stage #4: Cross-SIMD Reduction
 
 		// Count leaves reachable from each node
-		localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
+		localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
 
 		// Range of Cross-lane Contribution Tracked in Hi4
 		/*
@@ -462,7 +466,7 @@ module mvu_8sx8u_dsp48 #(
 		 *   signed value is determined by its lower bound to be at least:
 		 *		1 + $clog2(2^(w-1)+SIMD)
 		 */
-		localparam int unsigned  HI_WIDTH = 1 + $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD);
+		localparam int unsigned  HI_WIDTH = 1 + ($clog2(SIMD) < ACCU_WIDTH-D[1]? ACCU_WIDTH-D[1] : $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD));
 
 		uwire signed [ACCU_WIDTH       -1:0]  up4;
 		uwire signed [HI_WIDTH         -1:0]  hi4;
@@ -504,12 +508,12 @@ module mvu_8sx8u_dsp48 #(
 			// Conclusive low part accumulation
 			if(i >= PE_REM) begin : blkLo
 				// Adder Tree across all SIMD low contributions (all unsigned arithmetic)
-				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
+				localparam int unsigned  ROOT_WIDTH = sum_width(SIMD, LO_WIDTH);
 				uwire [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
 				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
 				for(genvar  n = 0; n < SIMD-1; n++) begin
 					// Sum truncated to actual maximum bit width at this node
-					localparam int unsigned  NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
+					localparam int unsigned  NODE_WIDTH = sum_width(LEAVE_LOAD[n], LO_WIDTH);
 					uwire [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
 					assign  tree[n] = s;
 				end

From 945a4a4c7e341b3d5acaa929e51672babe70bc36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 25 Sep 2024 13:33:04 +0100
Subject: [PATCH 11/23] Adding testbench having two accumulator sizes run
 against one another.

---
 finn-rtllib/mvu/tb/mvu_accu_tb.dat | 192 +++++++++++++++++++++++++++++
 finn-rtllib/mvu/tb/mvu_accu_tb.sv  | 162 ++++++++++++++++++++++++
 2 files changed, 354 insertions(+)
 create mode 100644 finn-rtllib/mvu/tb/mvu_accu_tb.dat
 create mode 100644 finn-rtllib/mvu/tb/mvu_accu_tb.sv

diff --git a/finn-rtllib/mvu/tb/mvu_accu_tb.dat b/finn-rtllib/mvu/tb/mvu_accu_tb.dat
new file mode 100644
index 0000000000..7e102ab6ab
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_accu_tb.dat
@@ -0,0 +1,192 @@
+9
+4
+d
+9
+2
+a
+d
+7
+9
+7
+b
+4
+4
+7
+0
+0
+c
+9
+9
+1
+9
+0
+a
+0
+5
+5
+7
+7
+2
+6
+7
+9
+0
+0
+9
+7
+7
+c
+7
+9
+7
+1
+2
+0
+f
+7
+1
+7
+f
+7
+1
+7
+1
+6
+6
+9
+e
+f
+e
+a
+6
+1
+7
+9
+d
+a
+7
+7
+f
+4
+7
+f
+9
+f
+9
+1
+9
+f
+7
+3
+4
+1
+1
+0
+d
+c
+d
+b
+9
+9
+f
+7
+0
+5
+e
+6
+7
+e
+7
+1
+7
+0
+e
+3
+c
+4
+9
+7
+9
+9
+d
+e
+c
+1
+f
+7
+0
+7
+1
+7
+d
+0
+7
+e
+a
+1
+9
+4
+b
+7
+9
+0
+a
+e
+6
+7
+2
+9
+0
+9
+0
+9
+1
+9
+0
+0
+7
+2
+7
+1
+5
+9
+1
+9
+6
+7
+c
+1
+9
+d
+9
+f
+c
+9
+9
+9
+b
+b
+9
+f
+9
+5
+1
+3
+0
+9
+0
+9
+2
+a
+9
+0
+f
+0
+7
+0
+a
+7
+3
+e
+5
+7
diff --git a/finn-rtllib/mvu/tb/mvu_accu_tb.sv b/finn-rtllib/mvu/tb/mvu_accu_tb.sv
new file mode 100644
index 0000000000..ceeb31194c
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_accu_tb.sv
@@ -0,0 +1,162 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench running two MVU instances of different accumulator widths against one another.
+ *****************************************************************************/
+
+module mvu_accu_tb;
+
+	localparam	IS_MVU = 1;
+	localparam	COMPUTE_CORE = "mvu_8sx8u_dsp48";
+	localparam	PUMPED_COMPUTE = 0;
+	localparam	MW = 6;
+	localparam	MH = 32;
+	localparam	PE = 1;
+	localparam	SIMD = 1;
+	localparam	ACTIVATION_WIDTH = 8;
+	localparam	WEIGHT_WIDTH = 4;
+	localparam	NARROW_WEIGHTS = 1;
+	localparam	SIGNED_ACTIVATIONS = 1;
+	localparam	SEGMENTLEN = 1;
+	localparam	FORCE_BEHAVIORAL = 0;
+
+	// Safely deducible parameters
+	localparam  WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8;
+	localparam  INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8;
+
+	// Global Control
+	logic  clk = 0;
+	always #5ns clk = !clk;
+	logic  rst = 1;
+	initial begin
+		repeat(16) @(posedge clk);
+		rst <= 0;
+	end
+
+	logic [WEIGHT_WIDTH-1:0]  WeightMem[MH*MW];
+	initial  $readmemh("mvu_accu_tb.dat", WeightMem);
+
+	// Shared Input Feed
+	logic [INPUT_STREAM_WIDTH_BA-1:0]  in_TDATA;
+	logic  in_TVALID[2];
+	uwire  in_TREADY[2];
+	initial begin
+		in_TDATA = 'x;
+		in_TVALID = '{ default: 0 };
+		@(posedge clk iff !rst);
+
+		repeat(2161*MW) begin
+			automatic logic [ACTIVATION_WIDTH-1:0]  a = $urandom();
+			in_TDATA  <= a;
+			in_TVALID <= '{ default: 1 };
+			fork
+				begin
+					@(posedge clk iff in_TREADY[0]);
+					in_TVALID[0] <= 0;
+				end
+				begin
+					@(posedge clk iff in_TREADY[1]);
+					in_TVALID[1] <= 0;
+				end
+			join
+		end
+
+		repeat(MH*MW) @(posedge clk);
+		$display("Test completed.");
+		$finish;
+	end
+
+	// DUTs
+	localparam int unsigned  ACCU_WIDTHS[2] = '{ 16, 32 };
+	int  OutQ[2][$];
+	for(genvar  i = 0; i < $size(ACCU_WIDTHS); i++) begin : genDUTs
+		localparam int unsigned  ACCU_WIDTH = ACCU_WIDTHS[i];
+		localparam int unsigned  OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+		// Private Weight Feed
+		logic [WEIGHT_STREAM_WIDTH_BA-1:0]  weights_TDATA;
+		logic  weights_TVALID;
+		uwire  weights_TREADY;
+		initial begin
+			weights_TDATA  = 'x;
+			weights_TVALID = 0;
+			@(posedge clk iff !rst);
+
+			weights_TVALID <= 1;
+			forever begin
+				for(int unsigned  i = 0; i < MH*MW; i++)  begin
+					weights_TDATA <= WeightMem[i];
+					@(posedge clk iff weights_TREADY);
+				end
+			end
+		end
+
+		// Private Output Capture into Queue
+		uwire signed [OUTPUT_STREAM_WIDTH_BA-1:0]  out_TDATA;
+		uwire  out_TVALID;
+		uwire  out_TREADY = !rst;
+		always_ff @(posedge clk iff !rst) begin
+			if(out_TVALID)  OutQ[i].push_back(out_TDATA);
+		end
+
+		// Actual DUT Instance
+		mvu_vvu_axi #(
+			.IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD),
+			.ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .NARROW_WEIGHTS(NARROW_WEIGHTS),
+			.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
+		) dut (
+			.ap_clk(clk),
+			.ap_clk2x(1'b0),
+			.ap_rst_n(!rst),
+			.s_axis_weights_tdata(weights_TDATA),
+			.s_axis_weights_tvalid(weights_TVALID),
+			.s_axis_weights_tready(weights_TREADY),
+			.s_axis_input_tdata(in_TDATA),
+			.s_axis_input_tvalid(in_TVALID[i]),
+			.s_axis_input_tready(in_TREADY[i]),
+			.m_axis_output_tdata(out_TDATA),
+			.m_axis_output_tvalid(out_TVALID),
+			.m_axis_output_tready(out_TREADY)
+		);
+	end : genDUTs
+
+	// Output Equivalence Checker
+	always_ff @(posedge clk) begin
+		if(OutQ[0].size && OutQ[1].size) begin
+			automatic int unsigned  y0 = OutQ[0].pop_front();
+			automatic int unsigned  y1 = OutQ[1].pop_front();
+			assert(y0 == y1) else begin
+				$error("Output Mismatch: %0d vs. %0d", y0, y1);
+				$stop;
+			end
+		end
+	end
+
+endmodule : mvu_accu_tb

From 00c3a83aae2a28d75abc097d2655633fc7d55c0d Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Thu, 26 Sep 2024 11:44:18 +0100
Subject: [PATCH 12/23] [RoundThresh] Clean-up transformation and test files

---
 .../streamline/round_thresholds.py            |  43 +--
 .../streamline/test_round_thresholds.py       | 263 +++++++++---------
 2 files changed, 132 insertions(+), 174 deletions(-)

diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py
index 2666242730..ab986e7826 100644
--- a/src/finn/transformation/streamline/round_thresholds.py
+++ b/src/finn/transformation/streamline/round_thresholds.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (c) 2020-2022, Xilinx
+# Copyright (C) 2022-2024, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -26,22 +27,12 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-# Need numpy for modifying the onnx graph tensors, which are numpy style arrays
 import numpy as np
-
-# QONNX wrapper of ONNX model graphs
 from qonnx.core.modelwrapper import ModelWrapper
-
-# QONNX graph transformation base class
 from qonnx.transformation.base import Transformation
-
-# Transformation running qonnx datatype inference
 from qonnx.transformation.infer_datatypes import InferDataTypes
 
 
-# Rounds and clips thresholds to integer values if the node inputs are integer,
-# respecting range, representability and data type (promotion) of the container
-# data type
 class RoundAndClipThresholds(Transformation):
     """For MultiThreshold nodes operating on integer inputs, round up
     thresholds values to the nearest integer. Additionally, if the input
@@ -50,29 +41,19 @@ class RoundAndClipThresholds(Transformation):
     annotation). Runs InferDataTypes() afterward to propagate any changes to the
     quantization data types."""
 
-    # Applies the transform to a whole model graph
     def apply(self, model: ModelWrapper):  # noqa
-        # Get the model graph out of the model wrapper object
         graph = model.graph
-        # Keep track of whether the graph has been modified
         graph_modified = False
-        # Iterate all nodes in the graph keeping track of the index
         for index, node in enumerate(graph.node):
-            # Applies to initializer tensors of MultiThreshold operations
-            if node.op_type == "MultiThreshold":
-                # Try to get the thresholds initializer tensor
+            op_type = node.op_type
+            if op_type == "MultiThreshold":
                 thresholds = model.get_initializer(node.input[1])
-                # There might be no constant thresholds stored as initializer
-                # tensor inside the model
                 if thresholds is None:
-                    # Nothing we can do, skip to the next node
                     continue
-                # Get the data type of the inputs to this operation
                 dtype = model.get_tensor_datatype(node.input[0])
                 # This transformation only applies to thresholding operations
                 # operating on integer inputs
                 if not dtype.is_integer():
-                    # Nothing we can do, skip to the next node
                     continue
                 # Round thresholds up to nearest integer and clip thresholds
                 # outside the input range
@@ -80,24 +61,14 @@ def apply(self, model: ModelWrapper):  # noqa
                 #   introduce extra inaccuracies due to large integers not being
                 #   exactly representable in floating-point representation.
                 #   See for example: np.ceil(np.float32(16777217)) == 16777216
-                # fmt: off
-                new_thresholds = np.clip(
-                    np.ceil(thresholds), dtype.min(), dtype.max()
-                )
-                # fmt: on
+                new_thresholds = np.clip(np.ceil(thresholds), dtype.min(), dtype.max())
                 # Convert back to the preferred float32 container type
-                #   Note: np.clip might have promoted the thresholds to float64
-                #   TODO: Maybe consider an int64 container type for thresholds
-                #    rounded to integer? Need to check all other transformations
-                #    and code generation through the whole FINN and QONNX stack
-                #    first, as these probably assume a float32 container type.
                 new_thresholds = new_thresholds.astype(np.float32)
                 # Insert the rounded and clipped thresholds back into the model
                 model.set_initializer(node.input[1], new_thresholds)
                 # The rounded and clipped thresholds now fit into the input data
                 # type
                 model.set_tensor_datatype(node.input[1], dtype)
-                # Test whether the new thresholds actually differ from the old
                 # ones
                 if np.any(new_thresholds != thresholds):
                     # Track the graph has been modified to inform the transform
@@ -107,9 +78,5 @@ def apply(self, model: ModelWrapper):  # noqa
                     # Immediately exit here to propagate the data type changes
                     # before considering the next node
                     break
-        # Some data types might have changed, do one pass of data type inference
-        # to propagate these changes through the graph
         model = model.transform(InferDataTypes())
-        # Return the transformed model and indicate whether the graph actually
-        # has been transformed to exhaustively apply this transformation again.
         return model, graph_modified
diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py
index 63375598a0..7e2d39176e 100644
--- a/tests/transformation/streamline/test_round_thresholds.py
+++ b/tests/transformation/streamline/test_round_thresholds.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (c) 2020-2022, Xilinx, Inc.
+# Copyright (C) 2022-2024, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -26,32 +27,15 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-# fmt: off
-# Disable formatter. This is deliberately formatted to stay within 80 characters
-# per line. Black, however, formats some lines going beyond this.
-
-# Testing framework
 import pytest
 
-# Use numpy for python execution / computing the ground truth expected values
 import numpy as np
-
-# Utility types and function for creating onnx nodes and graphs
 from onnx import TensorProto, helper
-
-# QONNX data types like INT25
 from qonnx.core.datatype import DataType
-
-# QONNX wrapper of ONNX model graphs
 from qonnx.core.modelwrapper import ModelWrapper
-
-# Generate random tensors of QONNX/FINN data types for testing
 from qonnx.util.basic import gen_finn_dt_tensor
 
-# Execution of onnx graphs within FINN
 import finn.core.onnx_exec as oxe
-
-# The transformation to be tested
 from finn.transformation.streamline import RoundAndClipThresholds
 
 
@@ -59,173 +43,186 @@
 # data type combinations with purely integer inputs. Without proper rounding,
 # this tests only the clipping, range and type-casting behavior of the
 # transformation.
-@pytest.mark.parametrize("i_dtype", [
-    # Explanation for selecting these test configurations:
-    # 1. Below 24-bit thresholds we will not observe any interesting rounding
-    #    behavior, as all integers < 2^24 can be exactly represented in 32-bit
-    #    floating-point. Thus, we test thresholds at 25-bit signed integers and
-    #    generate test inputs slightly above and below this.
-    # 2. We want to test out-of-range clipping of thresholds, in particular
-    #    clipping of the negative portion of signed thresholds. Thus, we only
-    #    generate signed thresholds, but test with signed and unsigned
-    #    inputs of smaller, larger and equal range.
-    # 3. Testing proper floating-point thresholds requires a separate test-case
-    "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26"
-])
-@pytest.mark.parametrize("o_dtype", [
-    # Explanation for selecting these test configurations:
-    # 1. Outputs of MultiThreshold are typically much smaller bit-width than the
-    #    inputs and thresholds.
-    # 2. However, with randomly samples thresholds from a rather large range due
-    #    to the selected input bit-widths (see above), we risk not adequately
-    #    covering the input range if we sample too few thresholds. The number of
-    #    thresholds sampled depends on the bit-width of the output, thus we use
-    #    rather high bit-width for testing.
-    # 3. For a "real" model, the quantization procedure *should* take care of
-    #    adequately covering the true input range.
-    "INT8", "UINT8"
-])
-@pytest.mark.parametrize("n_elems", [
-    # Explanation for selecting these test configurations:
-    # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4
-    # 2. Large test case 256, hopefully amplifying any rarely occurring errors
-    1, 2, 3, 4, 256
-])
+@pytest.mark.parametrize(
+    "i_dtype",
+    [
+        # Explanation for selecting these test configurations:
+        # 1. Below 24-bit thresholds we will not observe any interesting rounding
+        #    behavior, as all integers < 2^24 can be exactly represented in 32-bit
+        #    floating-point. Thus, we test thresholds at 25-bit signed integers and
+        #    generate test inputs slightly above and below this.
+        # 2. We want to test out-of-range clipping of thresholds, in particular
+        #    clipping of the negative portion of signed thresholds. Thus, we only
+        #    generate signed thresholds, but test with signed and unsigned
+        #    inputs of smaller, larger and equal range.
+        # 3. Testing proper floating-point thresholds requires a separate test-case
+        "INT23",
+        "UINT23",
+        "INT24",
+        "UINT24",
+        "INT25",
+        "UINT25",
+        "INT26",
+        "UINT26",
+    ],
+)
+@pytest.mark.parametrize(
+    "o_dtype",
+    [
+        # Explanation for selecting these test configurations:
+        # 1. Outputs of MultiThreshold are typically much smaller bit-width than the
+        #    inputs and thresholds.
+        # 2. However, with randomly sampled thresholds from a rather large range due
+        #    to the selected input bit-widths (see above), we risk not adequately
+        #    covering the input range if we sample too few thresholds. The number of
+        #    thresholds sampled depends on the bit-width of the output, thus we use
+        #    rather high bit-width for testing.
+        # 3. For a "real" model, the quantization procedure *should* take care of
+        #    adequately covering the true input range.
+        "INT8",
+        "UINT8",
+    ],
+)
+@pytest.mark.parametrize(
+    "n_elems",
+    [
+        # Explanation for selecting these test configurations:
+        # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4
+        # 2. Large test case 256, hopefully amplifying any rarely occurring errors
+        1,
+        2,
+        3,
+        4,
+        256,
+    ],
+)
 def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems):
-    # Convert string representation of data type to onnx DataType
     i_dtype = DataType[i_dtype]
     t_dtype = DataType["INT25"]  # Note: Matches configuration above
     o_dtype = DataType[o_dtype]  # noqa: Duplicate model setup code
-    # Create a dummy MultiThreshold operation to be tested
     node = helper.make_node(
-        # Op-Type of the node
         "MultiThreshold",
-        # MultiThreshold is implemented under the qonnx domain
         domain="qonnx.custom_op.general",
-        # List the names of the input tensors
         inputs=["inp", "thresholds"],
-        # List the names of the output tensors
         outputs=["out"],
-        # The CustomOp needs to know the data type of the output to be produced
-        out_dtype=str(o_dtype)
+        out_dtype=str(o_dtype),
     )
-    # Number of threshold values required to produce outputs of type o_dtype
     n_thresholds = o_dtype.get_num_possible_values() - 1
-    # Create tensor value infos for all input/output tensors involved
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems])
     out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems])
-    # Create a tensor value info for the thresholds parameter tensor
-    #   Note: Number of thresholds is determined by the output data type
     thresholds = helper.make_tensor_value_info(
         "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds]
     )
-    # Combine node and tensor value infos into an onnx graph
     graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out])
-    # Wrap the model graph in a ModelWrapper container
     model = ModelWrapper(helper.make_model(graph))
-    # Sample random tensors of the configured input data type
+
     inp = gen_finn_dt_tensor(i_dtype, [1, n_elems])
-    # Generate sorted thresholds for each of the input channels
     thresholds = np.sort(gen_finn_dt_tensor(t_dtype, [n_elems, n_thresholds]))
-    # Set data type annotations for the input and thresholds tensor
     model.set_tensor_datatype("inp", i_dtype)  # noqa: Duplicate model execution
     model.set_tensor_datatype("thresholds", t_dtype)
     model.set_tensor_datatype("out", o_dtype)
-    # Set the thresholds as initializer input to the model
     model.set_initializer("thresholds", thresholds)
+
     # Execute the model before running the RoundAndClipThresholds transformation
     out_expected = oxe.execute_onnx(model, {"inp": inp})["out"]
-    # Before rounding the threshold data type must be as annotated
     assert model.get_tensor_datatype("thresholds") == t_dtype
-    # Run the transformation to be tested
+
     model = model.transform(RoundAndClipThresholds())
+
     # After this transformation, the thresholds and output data type should be
     # inferred correctly
     assert model.get_tensor_datatype("thresholds") == i_dtype
     assert model.get_tensor_datatype("out") == o_dtype
+
     # After this transformation, the container type used to store the thresholds
     # values must be float32. No other type-cast or type promotion may happen.
     assert model.get_initializer("thresholds").dtype == np.float32
+
     # After rounding, all thresholds must be integers represented as float32
-    assert all(
-        x.is_integer() for x in model.get_initializer("thresholds").flatten()
-    )
+    assert all(x.is_integer() for x in model.get_initializer("thresholds").flatten())
+
     # Execute the model after running the RoundAndClipThresholds transformation
     out_produced = oxe.execute_onnx(model, {"inp": inp})["out"]
-    # Compare the results before and after: This is the pure integer test-case
-    # and no actual rounding should happen, thus the rounded operation should
-    # produce outputs exactly equal.
+
     assert np.all(out_produced == out_expected)
 
 
 # Tests the RoundAndClipThresholds transformation under various input, output
 # data type combinations with purely integer inputs. This test case tests actual
 # rounding of floating-point thresholds.
-@pytest.mark.parametrize("i_dtype", [
-    # Explanation for selecting these test configurations:
-    # 1. Below 24-bit thresholds we will not observe any interesting rounding
-    #    behavior, as all integers < 2^24 can be exactly represented in 32-bit
-    #    floating-point. Thus, we test thresholds at 25-bit signed integers and
-    #    generate test inputs slightly above and below this.
-    # 2. We want to test out-of-range clipping of thresholds, in particular
-    #    clipping of the negative portion of signed thresholds. Thus, we only
-    #    generate signed thresholds, but test with signed and unsigned
-    #    inputs of smaller, larger and equal range.
-    # 3. Testing proper floating-point thresholds requires a separate test-case
-    "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26"
-])
-@pytest.mark.parametrize("o_dtype", [
-    # Explanation for selecting these test configurations:
-    # 1. Outputs of MultiThreshold are typically much smaller bit-width than the
-    #    inputs and thresholds.
-    # 2. However, with randomly samples thresholds from a rather large range due
-    #    to the selected input bit-widths (see above), we risk not adequately
-    #    covering the input range if we sample too few thresholds. The number of
-    #    thresholds sampled depends on the bit-width of the output, thus we use
-    #    rather high bit-width for testing.
-    # 3. For a "real" model, the quantization procedure *should* take care of
-    #    adequately covering the true input range.
-    "INT8", "UINT8"
-])
-@pytest.mark.parametrize("n_elems", [
-    # Explanation for selecting these test configurations:
-    # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4
-    # 2. Large test case 256, hopefully amplifying any rarely occurring errors
-    1, 2, 3, 4, 256
-])
+@pytest.mark.parametrize(
+    "i_dtype",
+    [
+        # Explanation for selecting these test configurations:
+        # 1. Below 24-bit thresholds we will not observe any interesting rounding
+        #    behavior, as all integers < 2^24 can be exactly represented in 32-bit
+        #    floating-point. Thus, we test thresholds at 25-bit signed integers and
+        #    generate test inputs slightly above and below this.
+        # 2. We want to test out-of-range clipping of thresholds, in particular
+        #    clipping of the negative portion of signed thresholds. Thus, we only
+        #    generate signed thresholds, but test with signed and unsigned
+        #    inputs of smaller, larger and equal range.
+        # 3. Testing proper floating-point thresholds requires a separate test-case
+        "INT23",
+        "UINT23",
+        "INT24",
+        "UINT24",
+        "INT25",
+        "UINT25",
+        "INT26",
+        "UINT26",
+    ],
+)
+@pytest.mark.parametrize(
+    "o_dtype",
+    [
+        # Explanation for selecting these test configurations:
+        # 1. Outputs of MultiThreshold are typically much smaller bit-width than the
+        #    inputs and thresholds.
+        # 2. However, with randomly sampled thresholds from a rather large range due
+        #    to the selected input bit-widths (see above), we risk not adequately
+        #    covering the input range if we sample too few thresholds. The number of
+        #    thresholds sampled depends on the bit-width of the output, thus we use
+        #    rather high bit-width for testing.
+        # 3. For a "real" model, the quantization procedure *should* take care of
+        #    adequately covering the true input range.
+        "INT8",
+        "UINT8",
+    ],
+)
+@pytest.mark.parametrize(
+    "n_elems",
+    [
+        # Explanation for selecting these test configurations:
+        # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4
+        # 2. Large test case 256, hopefully amplifying any rarely occurring errors
+        1,
+        2,
+        3,
+        4,
+        256,
+    ],
+)
 def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems):
-    # Convert string representation of data type to onnx DataType
     i_dtype = DataType[i_dtype]
     t_dtype = DataType["FLOAT32"]
     o_dtype = DataType[o_dtype]  # noqa: Duplicate model setup code
-    # Create a dummy MultiThreshold operation to be tested
     node = helper.make_node(
-        # Op-Type of the node
         "MultiThreshold",
-        # MultiThreshold is implemented under the qonnx domain
         domain="qonnx.custom_op.general",
-        # List the names of the input tensors
         inputs=["inp", "thresholds"],
-        # List the names of the output tensors
         outputs=["out"],
-        # The CustomOp needs to know the data type of the output to be produced
-        out_dtype=str(o_dtype)
+        out_dtype=str(o_dtype),
     )
-    # Number of threshold values required to produce outputs of type o_dtype
     n_thresholds = o_dtype.get_num_possible_values() - 1
-    # Create tensor value infos for all input/output tensors involved
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems])
     out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems])
-    # Create a tensor value info for the thresholds parameter tensor
-    #   Note: Number of thresholds is determined by the output data type
     thresholds = helper.make_tensor_value_info(
         "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds]
     )
-    # Combine node and tensor value infos into an onnx graph
     graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out])
-    # Wrap the model graph in a ModelWrapper container
     model = ModelWrapper(helper.make_model(graph))
-    # Sample random tensors of the configured input data type
+
     inp = gen_finn_dt_tensor(i_dtype, [1, n_elems])
     # Draw uniformly random prototype thresholds in [0,+1] range
     thresholds = np.random.rand(n_elems, n_thresholds)
@@ -238,30 +235,24 @@ def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems):
     model.set_tensor_datatype("inp", i_dtype)  # noqa: Duplicate model execution
     model.set_tensor_datatype("thresholds", t_dtype)
     model.set_tensor_datatype("out", o_dtype)
-    # Set the thresholds as initializer input to the model
     model.set_initializer("thresholds", thresholds)
+
     # Execute the model before running the RoundAndClipThresholds transformation
     out_expected = oxe.execute_onnx(model, {"inp": inp})["out"]
     # Before rounding the threshold data type must be as annotated
     assert model.get_tensor_datatype("thresholds") == t_dtype
-    # Run the transformation to be tested
+
     model = model.transform(RoundAndClipThresholds())
-    # After this transformation, the thresholds and output data type should be
-    # inferred correctly
+
     assert model.get_tensor_datatype("thresholds") == i_dtype
     assert model.get_tensor_datatype("out") == o_dtype
+
     # After this transformation, the container type used to store the thresholds
     # values must be float32. No other type-cast or type promotion may happen.
     assert model.get_initializer("thresholds").dtype == np.float32
     # After rounding, all thresholds must be integers represented as float32
-    assert all(
-        x.is_integer() for x in model.get_initializer("thresholds").flatten()
-    )
-    # Execute the model after running the RoundAndClipThresholds transformation
+    assert all(x.is_integer() for x in model.get_initializer("thresholds").flatten())
+
     out_produced = oxe.execute_onnx(model, {"inp": inp})["out"]
-    # Compare the results before and after: This is the floating-point test with
-    # actual rounding, this the transformed result may only be equal within some
-    # tolerance.
-    # Hm, never observed this to be relevant. For all test configurations, exact
-    # equality seems to hold, probably due to only integer inputs being tested.
+
     assert np.allclose(out_produced, out_expected, atol=1.0e-3)

From 717bfc13e2361e767c220a3d298245f04cfd84ef Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Thu, 26 Sep 2024 12:57:06 +0100
Subject: [PATCH 13/23] [RoundThresh] Expand rounding of thresholds to hw
 layers

---
 src/finn/builder/build_dataflow_steps.py               | 2 ++
 src/finn/transformation/streamline/round_thresholds.py | 2 +-
 tests/end2end/test_end2end_bnn_pynq.py                 | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index bdbcc53d83..ab2280554c 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -121,6 +121,7 @@
 )
 from finn.transformation.streamline import Streamline
 from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
+from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
 from finn.util.basic import (
     get_rtlsim_trace_depth,
     pyverilate_get_liveness_threshold_cycles,
@@ -503,6 +504,7 @@ def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig):
     if cfg.minimize_bit_width:
         model = model.transform(MinimizeWeightBitWidth())
         model = model.transform(MinimizeAccumulatorWidth())
+        model = model.transform(RoundAndClipThresholds())
         # make sure the changed datatypes are propagated through the network
         model = model.transform(InferDataTypes())
     return model
diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py
index ab986e7826..907f127896 100644
--- a/src/finn/transformation/streamline/round_thresholds.py
+++ b/src/finn/transformation/streamline/round_thresholds.py
@@ -46,7 +46,7 @@ def apply(self, model: ModelWrapper):  # noqa
         graph_modified = False
         for index, node in enumerate(graph.node):
             op_type = node.op_type
-            if op_type == "MultiThreshold":
+            if op_type == "MultiThreshold" or op_type.startswith("Thresholding"):
                 thresholds = model.get_initializer(node.input[1])
                 if thresholds is None:
                     continue
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 81c6316ec1..0d3418624a 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -94,6 +94,7 @@
     MakeMaxPoolNHWC,
     MoveScalarLinearPastInvariants,
 )
+from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
 from finn.util.basic import get_finn_root, make_build_dir, test_board_map
 from finn.util.pytorch import ToTensor
 from finn.util.test import (
@@ -672,6 +673,7 @@ def test_minimize_bit_width(self, topology, wbits, abits, board):
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         model = model.transform(MinimizeAccumulatorWidth())
         model = model.transform(MinimizeWeightBitWidth())
+        model = model.transform(RoundAndClipThresholds())
         curr_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width")
         model.save(curr_chkpt_name)
 

From 6ade140e684167100cce408454efbd9c2b4008c3 Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Thu, 26 Sep 2024 14:20:04 +0100
Subject: [PATCH 14/23] [RoundThresh] Add change of the weight datatype to hw
 op threshold rounding

---
 src/finn/transformation/streamline/round_thresholds.py | 5 +++++
 tests/end2end/test_end2end_mobilenet_v1.py             | 1 +
 2 files changed, 6 insertions(+)

diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py
index 907f127896..ee6a31e3dc 100644
--- a/src/finn/transformation/streamline/round_thresholds.py
+++ b/src/finn/transformation/streamline/round_thresholds.py
@@ -29,6 +29,7 @@
 
 import numpy as np
 from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.base import Transformation
 from qonnx.transformation.infer_datatypes import InferDataTypes
 
@@ -69,6 +70,10 @@ def apply(self, model: ModelWrapper):  # noqa
                 # The rounded and clipped thresholds now fit into the input data
                 # type
                 model.set_tensor_datatype(node.input[1], dtype)
+                # If hw op we need to set the weight data type attribute as well
+                if op_type.startswith("Thresholding"):
+                    inst = getCustomOp(node)
+                    inst.set_nodeattr("weightDataType", dtype.name)
                 # ones
                 if np.any(new_thresholds != thresholds):
                     # Track the graph has been modified to inform the transform
diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py
index 01d995c147..4c52277970 100644
--- a/tests/end2end/test_end2end_mobilenet_v1.py
+++ b/tests/end2end/test_end2end_mobilenet_v1.py
@@ -353,6 +353,7 @@ def test_end2end_mobilenet_minimize_bit_width():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx")
     model = model.transform(MinimizeAccumulatorWidth())
     model = model.transform(MinimizeWeightBitWidth())
+    model = model.transform(RoundAndClipThresholds())
     model.save(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx")
 
 

From db353f4fda97df13c593c0a6733e1e3aee9c3ecc Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Fri, 27 Sep 2024 15:36:10 +0100
Subject: [PATCH 15/23] [RoundThresh] Allow for range + 1

---
 .../streamline/round_thresholds.py              | 17 ++++++++++++-----
 .../test_fpgadataflow_thresholding.py           | 11 +++++++----
 .../streamline/test_round_thresholds.py         | 16 ++++++++++++++--
 3 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py
index ee6a31e3dc..312db404ac 100644
--- a/src/finn/transformation/streamline/round_thresholds.py
+++ b/src/finn/transformation/streamline/round_thresholds.py
@@ -28,6 +28,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import numpy as np
+from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.base import Transformation
@@ -62,18 +63,24 @@ def apply(self, model: ModelWrapper):  # noqa
                 #   introduce extra inaccuracies due to large integers not being
                 #   exactly representable in floating-point representation.
                 #   See for example: np.ceil(np.float32(16777217)) == 16777216
-                new_thresholds = np.clip(np.ceil(thresholds), dtype.min(), dtype.max())
+                new_thresholds = np.clip(np.ceil(thresholds), dtype.min(), dtype.max() + 1)
                 # Convert back to the preferred float32 container type
                 new_thresholds = new_thresholds.astype(np.float32)
                 # Insert the rounded and clipped thresholds back into the model
                 model.set_initializer(node.input[1], new_thresholds)
-                # The rounded and clipped thresholds now fit into the input data
-                # type
-                model.set_tensor_datatype(node.input[1], dtype)
+                # The rounded and clipped thresholds now fit into a data type
+                # that is one bit bigger than the input datatype
+                # Determine new max_value
+                max_val = dtype.max() + 1
+                if not dtype.signed():
+                    tdt = DataType.get_smallest_possible(max_val)
+                else:
+                    tdt = DataType.get_smallest_possible(-(max_val) - 1)
+                model.set_tensor_datatype(node.input[1], tdt)
                 # If hw op we need to set the weight data type attribute as well
                 if op_type.startswith("Thresholding"):
                     inst = getCustomOp(node)
-                    inst.set_nodeattr("weightDataType", dtype.name)
+                    inst.set_nodeattr("weightDataType", tdt.name)
                 # ones
                 if np.any(new_thresholds != thresholds):
                     # Track the graph has been modified to inform the transform
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index fe7ba3d9fb..2079fe7fc5 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -49,6 +49,7 @@
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
 
 test_fpga_part = "xczu3eg-sbva484-1-e"
 target_clk_ns = 5
@@ -133,10 +134,8 @@ def make_single_multithresholding_modelwrapper(
 @pytest.mark.parametrize(
     "idt_tdt_cfg",
     [
-        (DataType["INT8"], DataType["INT8"]),
-        (DataType["INT8"], DataType["INT9"]),
-        (DataType["UINT5"], DataType["UINT5"]),
-        (DataType["UINT5"], DataType["UINT6"]),
+        (DataType["INT8"], DataType["INT25"]),
+        (DataType["UINT5"], DataType["UINT8"]),
     ],
 )
 @pytest.mark.parametrize("fold", [-1, 1, 2])
@@ -145,6 +144,7 @@ def make_single_multithresholding_modelwrapper(
 @pytest.mark.parametrize("impl_style", ["hls", "rtl"])
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"])
+@pytest.mark.parametrize("round_thresh", [True, False])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
@@ -159,6 +159,7 @@ def test_fpgadataflow_thresholding(
     impl_style,
     exec_mode,
     mem_mode,
+    round_thresh,
 ):
     # the mem_mode parameter can only be used for the hls thresholding
     # so the test will only be executed once for impl_style=rtl and once skipped
@@ -234,6 +235,8 @@ def test_fpgadataflow_thresholding(
     node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0]
     inst = getCustomOp(node)
     inst.set_nodeattr("PE", pe)
+    if round_thresh is True:
+        model = model.transform(RoundAndClipThresholds())
     model = model.transform(GiveUniqueNodeNames())
 
     if impl_style == "hls":
diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py
index 7e2d39176e..6de82e6750 100644
--- a/tests/transformation/streamline/test_round_thresholds.py
+++ b/tests/transformation/streamline/test_round_thresholds.py
@@ -96,6 +96,7 @@
         256,
     ],
 )
+@pytest.mark.streamline
 def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems):
     i_dtype = DataType[i_dtype]
     t_dtype = DataType["INT25"]  # Note: Matches configuration above
@@ -106,6 +107,7 @@ def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems):
         inputs=["inp", "thresholds"],
         outputs=["out"],
         out_dtype=str(o_dtype),
+        out_bias=float(o_dtype.min()),
     )
     n_thresholds = o_dtype.get_num_possible_values() - 1
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems])
@@ -117,6 +119,7 @@ def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems):
     model = ModelWrapper(helper.make_model(graph))
 
     inp = gen_finn_dt_tensor(i_dtype, [1, n_elems])
+    inp[0][0] = i_dtype.max()
     thresholds = np.sort(gen_finn_dt_tensor(t_dtype, [n_elems, n_thresholds]))
     model.set_tensor_datatype("inp", i_dtype)  # noqa: Duplicate model execution
     model.set_tensor_datatype("thresholds", t_dtype)
@@ -131,7 +134,11 @@ def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems):
 
     # After this transformation, the thresholds and output data type should be
     # inferred correctly
-    assert model.get_tensor_datatype("thresholds") == i_dtype
+    if not i_dtype.signed():
+        new_tdt = DataType.get_smallest_possible(i_dtype.max() + 1)
+    else:
+        new_tdt = DataType.get_smallest_possible(-(i_dtype.max() + 1) - 1)
+    assert model.get_tensor_datatype("thresholds") == new_tdt
     assert model.get_tensor_datatype("out") == o_dtype
 
     # After this transformation, the container type used to store the thresholds
@@ -203,6 +210,7 @@ def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems):
         256,
     ],
 )
+@pytest.mark.streamline
 def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems):
     i_dtype = DataType[i_dtype]
     t_dtype = DataType["FLOAT32"]
@@ -244,7 +252,11 @@ def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems):
 
     model = model.transform(RoundAndClipThresholds())
 
-    assert model.get_tensor_datatype("thresholds") == i_dtype
+    if not i_dtype.signed():
+        new_tdt = DataType.get_smallest_possible(i_dtype.max() + 1)
+    else:
+        new_tdt = DataType.get_smallest_possible(-(i_dtype.max() + 1) - 1)
+    assert model.get_tensor_datatype("thresholds") == new_tdt
     assert model.get_tensor_datatype("out") == o_dtype
 
     # After this transformation, the container type used to store the thresholds

From b250047d444dfdc129bd667ce790c9c7982f2b39 Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Fri, 11 Oct 2024 09:47:01 +0100
Subject: [PATCH 16/23] [tutorial] Update folding config to new custom operator
 structure

---
 tutorials/fpga_flow/folding_config.json | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tutorials/fpga_flow/folding_config.json b/tutorials/fpga_flow/folding_config.json
index 642200d02b..bf94f8058d 100644
--- a/tutorials/fpga_flow/folding_config.json
+++ b/tutorials/fpga_flow/folding_config.json
@@ -1,30 +1,29 @@
 {
   "Defaults": {},
-  "Thresholding_Batch_0": {
-    "PE": 49,
-    "ram_style": "block"
+  "Thresholding_rtl_0": {
+    "PE": 49
   },
-  "MatrixVectorActivation_0": {
+  "MVAU_hls_0": {
     "PE": 16,
     "SIMD": 49,
     "ram_style": "block"
   },
-  "MatrixVectorActivation_1": {
+  "MVAU_hls_1": {
     "PE": 8,
     "SIMD": 8,
     "ram_style": "auto"
   },
-  "MatrixVectorActivation_2": {
+  "MVAU_hls_2": {
     "PE": 8,
     "SIMD": 8,
     "ram_style": "auto"
   },
-  "MatrixVectorActivation_3": {
+  "MVAU_hls_3": {
     "PE": 10,
     "SIMD": 8,
     "ram_style": "distributed"
   },
-  "LabelSelect_Batch_0": {
+  "LabelSelect_hls_0": {
     "PE": 1
   }
 }

From b48147e0a6637659a8a7127dd0016edded998ed5 Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Fri, 11 Oct 2024 10:36:20 +0100
Subject: [PATCH 17/23] [tutorial] Format tutorial README

---
 tutorials/fpga_flow/README.md | 44 ++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/tutorials/fpga_flow/README.md b/tutorials/fpga_flow/README.md
index 2aaad0423b..71f2a2a625 100644
--- a/tutorials/fpga_flow/README.md
+++ b/tutorials/fpga_flow/README.md
@@ -25,20 +25,29 @@ This demo was created using Vivado 2022.1.
 Prior to running, insure the following prerequisites have been met:
 - Install FINN and prerequisites.  The [Getting Started](https://finn.readthedocs.io/en/latest/getting_started.html#quickstart) section of the FINN documentation might be helpful for this.
 - Ensure you have the `FINN_XILINX_PATH` and `FINN_XILINX_VERSION` env variables set appropriately for your install.  For example:
-> export FINN_XILINX_PATH=/opt/Xilinx
-> export FINN_XILINX_VERSION=2022.1
+```shell
+export FINN_XILINX_PATH=/opt/Xilinx
+export FINN_XILINX_VERSION=2022.1
+```
+
 - Set the env variable for your `finn` install top directory (where you cloned the FINN compiler repo):
-> export FINN_ROOT=/home/foo/finn
+```shell
+export FINN_ROOT=/home/foo/finn
+```
 
 Then, change to `finn` install directory and invoke the build as follows:
-> cd ${FINN_ROOT}
-> ./run-docker.sh build_custom ${FINN_ROOT}/tutorials/fpga_flow/
+```shell
+cd ${FINN_ROOT}
+./run-docker.sh build_custom ${FINN_ROOT}/tutorials/fpga_flow/
+```
 
 Alternatively, since the tutorials folder is already part of the FINN compiler installation, you can invoke it from within the Docker container:
-> cd ${FINN_ROOT}
-> ./run-docker.sh
-> cd tutorials/fpga_flow
-> python build.py
+```shell
+cd ${FINN_ROOT}
+./run-docker.sh
+cd tutorials/fpga_flow
+python build.py
+```
 
 The build should finish in about 10 minutes, and the FINN docker will close on success.
 
@@ -59,12 +68,14 @@ The build should finish in about 10 minutes, and the FINN docker will close on s
 ### Examine the Stitched IP
 
 Navigate to the stitched IP project directory:
-
-> cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip
+```shell
+cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip
+```
 
 And, open the project:
-
-> vivado finn_vivado_stitch_proj.xpr
+```shell
+vivado finn_vivado_stitch_proj.xpr
+```
 
 Explore the IPI board design and note the interfaces.
 
@@ -89,9 +100,10 @@ them under `${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim`. Let's ex
    the FINN compiler. Used for launching the testbench simulation.
 
 You can now launch the simulation as follows:
-
-> cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim
-> vivado -mode gui -source make_sim_proj.tcl
+```shell
+cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim
+vivado -mode gui -source make_sim_proj.tcl
+```
 
 The simulation should complete with:
 

From f6acf7075b3af97719edd3705f1268f0d357e0fa Mon Sep 17 00:00:00 2001
From: Alexander Hornburg <alexander.hornburg@amd.com>
Date: Wed, 23 Oct 2024 17:42:26 +0100
Subject: [PATCH 18/23] [Infra] support passing arguments to build_custom flow

---
 run-docker.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run-docker.sh b/run-docker.sh
index b1fe44eb0c..1358337a37 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -142,7 +142,7 @@ elif [ "$1" = "build_custom" ]; then
   DOCKER_INTERACTIVE="-it"
   #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build
   gecho "Running build_custom: $BUILD_CUSTOM_DIR/$FLOW_NAME.py"
-  DOCKER_CMD="python -mpdb -cc -cq $FLOW_NAME.py"
+  DOCKER_CMD="python -mpdb -cc -cq $FLOW_NAME.py ${@:4}"
 elif [ -z "$1" ]; then
    gecho "Running container only"
    DOCKER_CMD="bash"

From 1d7636b8f8d841eda4e20b6cbd365b4a7257f24d Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 23 Oct 2024 17:41:32 +0000
Subject: [PATCH 19/23] Bump onnx from 1.13.0 to 1.17.0

Bumps [onnx](https://github.com/onnx/onnx) from 1.13.0 to 1.17.0.
- [Release notes](https://github.com/onnx/onnx/releases)
- [Changelog](https://github.com/onnx/onnx/blob/main/docs/Changelog-ml.md)
- [Commits](https://github.com/onnx/onnx/compare/v1.13.0...v1.17.0)

---
updated-dependencies:
- dependency-name: onnx
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index d4ca45cb37..85a0ca1175 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ gspread==3.6.0
 importlib-resources==6.1.0
 ipython==8.12.2
 numpy==1.24.1
-onnx==1.13.0
+onnx==1.17.0
 onnxoptimizer
 onnxruntime==1.16.1
 pre-commit==3.3.2

From 14b68b7efa235089bf7e1d8d40416095bcb23e81 Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Thu, 24 Oct 2024 14:29:36 +0100
Subject: [PATCH 20/23] [Infra] Add no-cache env var for run docker script

---
 run-docker.sh | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/run-docker.sh b/run-docker.sh
index 1358337a37..8bf6440d4f 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -102,6 +102,7 @@ SCRIPTPATH=$(dirname "$SCRIPT")
 : ${FINN_SINGULARITY=""}
 : ${FINN_SKIP_XRT_DOWNLOAD=""}
 : ${FINN_XRT_PATH=""}
+: ${FINN_DOCKER_NO_CACHE="0"}
 
 DOCKER_INTERACTIVE=""
 
@@ -190,12 +191,18 @@ if [ -d "$FINN_XRT_PATH" ];then
   export LOCAL_XRT=1
 fi
 
+if [ "$FINN_DOCKER_NO_CACHE" = "1" ]; then
+  export NO_CACHE_STRING="--no-cache"
+else
+  export NO_CACHE_STRING=""
+fi
+
 # Build the FINN Docker image
 if [ "$FINN_DOCKER_PREBUILT" = "0" ] && [ -z "$FINN_SINGULARITY" ]; then
   # Need to ensure this is done within the finn/ root folder:
   OLD_PWD=$(pwd)
   cd $SCRIPTPATH
-  docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA .
+  docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA $NO_CACHE_STRING .
   cd $OLD_PWD
 fi
 

From 72dcb87f510436d60ad0c370e6b90692ebf5b213 Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Thu, 24 Oct 2024 14:41:37 +0100
Subject: [PATCH 21/23] [Infra] Re-use build extra env vars to enable no cache
 option

---
 run-docker.sh | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/run-docker.sh b/run-docker.sh
index 8bf6440d4f..69c998c467 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -192,9 +192,7 @@ if [ -d "$FINN_XRT_PATH" ];then
 fi
 
 if [ "$FINN_DOCKER_NO_CACHE" = "1" ]; then
-  export NO_CACHE_STRING="--no-cache"
-else
-  export NO_CACHE_STRING=""
+  FINN_DOCKER_BUILD_EXTRA+="--no-cache"
 fi
 
 # Build the FINN Docker image
@@ -202,7 +200,7 @@ if [ "$FINN_DOCKER_PREBUILT" = "0" ] && [ -z "$FINN_SINGULARITY" ]; then
   # Need to ensure this is done within the finn/ root folder:
   OLD_PWD=$(pwd)
   cd $SCRIPTPATH
-  docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA $NO_CACHE_STRING .
+  docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA .
   cd $OLD_PWD
 fi
 

From f0aafa261e7a8f57891ba12cd1572e7d3062bc19 Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Thu, 24 Oct 2024 15:19:55 +0100
Subject: [PATCH 22/23] [Infra] Add space to no cache var to allow for future
 extension

---
 run-docker.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run-docker.sh b/run-docker.sh
index 69c998c467..b59af88eb7 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -192,7 +192,7 @@ if [ -d "$FINN_XRT_PATH" ];then
 fi
 
 if [ "$FINN_DOCKER_NO_CACHE" = "1" ]; then
-  FINN_DOCKER_BUILD_EXTRA+="--no-cache"
+  FINN_DOCKER_BUILD_EXTRA+=" --no-cache "  # leading space keeps the flag separate from user-supplied extra args
 fi
 
 # Build the FINN Docker image

From a9f1898deccb74a4f8e38717c5bef00e46c9f70f Mon Sep 17 00:00:00 2001
From: Felix Jentzsch <felix.jentzsch@upb.de>
Date: Fri, 1 Nov 2024 11:35:04 +0000
Subject: [PATCH 23/23] Use Vivado tclstore from install instead of home

---
 run-docker.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/run-docker.sh b/run-docker.sh
index b59af88eb7..ec55299f6c 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -231,6 +231,9 @@ DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS "
 # Workaround for FlexLM issue, see:
 # https://community.flexera.com/t5/InstallAnywhere-Forum/Issues-when-running-Xilinx-tools-or-Other-vendor-tools-in-docker/m-p/245820#M10647
 DOCKER_EXEC+="-e LD_PRELOAD=/lib/x86_64-linux-gnu/libudev.so.1 "
+# Workaround for running multiple Vivado instances simultaneously, see:
+# https://adaptivesupport.amd.com/s/article/63253?language=en_US
+DOCKER_EXEC+="-e XILINX_LOCAL_USER_DATA=no "
 if [ "$FINN_DOCKER_RUN_AS_ROOT" = "0" ] && [ -z "$FINN_SINGULARITY" ];then
   DOCKER_EXEC+="-v /etc/group:/etc/group:ro "
   DOCKER_EXEC+="-v /etc/passwd:/etc/passwd:ro "