From 52cfc4a2ac4c9feb729ad7acd2adbfb0e1a41207 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 13 Mar 2024 10:17:08 +0100 Subject: [PATCH 01/23] Fix clipping range issue in RoundAndClipThresholds transformation --- src/finn/transformation/streamline/round_thresholds.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 5ba5ee0ff5..2bf3630cff 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -57,10 +57,10 @@ def apply(self, model): model.set_tensor_datatype(n.input[1], idtype) graph_modified = True if idtype.is_integer() and ( - (Tnew < (idtype.min() - 1)).any() or (Tnew > (idtype.max() + 1)).any() + (Tnew < (idtype.min())).any() or (Tnew > (idtype.max())).any() ): # clip any large thresholds to input range + 1 - Tnew = np.clip(Tnew, idtype.min() - 1, idtype.max() + 1) + Tnew = np.clip(Tnew, idtype.min(), idtype.max()) model.set_initializer(n.input[1], Tnew) # use same datatype as inputs for thresholds model.set_tensor_datatype(n.input[1], idtype) From c8292e2a27bebb2254f278e409b00f448c35e600 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Sat, 6 Apr 2024 17:06:03 +0200 Subject: [PATCH 02/23] Rework RoundAndClipThresholds to avoid range and type promotion issues See https://github.com/Xilinx/finn/issues/978 --- .../streamline/round_thresholds.py | 105 +++++++++++++----- 1 file changed, 76 insertions(+), 29 deletions(-) diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 2bf3630cff..2666242730 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -26,43 +26,90 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED 
OF THE POSSIBILITY OF SUCH DAMAGE. +# Need numpy for modifying the onnx graph tensors, which are numpy style arrays import numpy as np + +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# QONNX graph transformation base class from qonnx.transformation.base import Transformation +# Transformation running qonnx datatype inference +from qonnx.transformation.infer_datatypes import InferDataTypes + +# Rounds and clips thresholds to integer values if the node inputs are integer, +# respecting range, representability and data type (promotion) of the container +# data type class RoundAndClipThresholds(Transformation): """For MultiThreshold nodes operating on integer inputs, round up thresholds values to the nearest integer. Additionally, if the input - is unsigned, sets negative thresholds to zero.""" + is unsigned, sets negative thresholds to zero. Type-casts thresholds (back) + to the float32 container type (this is separate from the quantization + annotation). 
Runs InferDataTypes() afterward to propagate any changes to the + quantization data types.""" - def apply(self, model): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object graph = model.graph + # Keep track of whether the graph has been modified graph_modified = False - for n in graph.node: - if n.op_type == "MultiThreshold": - idtype = model.get_tensor_datatype(n.input[0]) - T = model.get_initializer(n.input[1]) - Tnew = np.ceil(T) - if idtype.is_integer() and (T != Tnew).any(): - # round up the thresholds to nearest integer - model.set_initializer(n.input[1], Tnew) - # use same datatype as inputs for thresholds - model.set_tensor_datatype(n.input[1], idtype) - graph_modified = True - if idtype.is_integer() and not idtype.signed() and (Tnew < 0).any(): - # clip any negative thresholds if input is unsigned - Tnew = np.clip(Tnew, 0, None) - model.set_initializer(n.input[1], Tnew) - # use same datatype as inputs for thresholds - model.set_tensor_datatype(n.input[1], idtype) - graph_modified = True - if idtype.is_integer() and ( - (Tnew < (idtype.min())).any() or (Tnew > (idtype.max())).any() - ): - # clip any large thresholds to input range + 1 - Tnew = np.clip(Tnew, idtype.min(), idtype.max()) - model.set_initializer(n.input[1], Tnew) - # use same datatype as inputs for thresholds - model.set_tensor_datatype(n.input[1], idtype) + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to initializer tensors of MultiThreshold operations + if node.op_type == "MultiThreshold": + # Try to get the thresholds initializer tensor + thresholds = model.get_initializer(node.input[1]) + # There might be no constant thresholds stored as initializer + # tensor inside the model + if thresholds is None: + # Nothing we can do, skip to the next node + continue + # Get the data type of the inputs to this operation + dtype = 
model.get_tensor_datatype(node.input[0]) + # This transformation only applies to thresholding operations + # operating on integer inputs + if not dtype.is_integer(): + # Nothing we can do, skip to the next node + continue + # Round thresholds up to nearest integer and clip thresholds + # outside the input range + # Note: This might promote the thresholds to float64 and + # introduce extra inaccuracies due to large integers not being + # exactly representable in floating-point representation. + # See for example: np.ceil(np.float32(16777217)) == 16777216 + # fmt: off + new_thresholds = np.clip( + np.ceil(thresholds), dtype.min(), dtype.max() + ) + # fmt: on + # Convert back to the preferred float32 container type + # Note: np.clip might have promoted the thresholds to float64 + # TODO: Maybe consider an int64 container type for thresholds + # rounded to integer? Need to check all other transformations + # and code generation through the whole FINN and QONNX stack + # first, as these probably assume a float32 container type. 
+ new_thresholds = new_thresholds.astype(np.float32) + # Insert the rounded and clipped thresholds back into the model + model.set_initializer(node.input[1], new_thresholds) + # The rounded and clipped thresholds now fit into the input data + # type + model.set_tensor_datatype(node.input[1], dtype) + # Test whether the new thresholds actually differ from the old + # ones + if np.any(new_thresholds != thresholds): + # Track the graph has been modified to inform the transform + # container to exhaustively repeat this transformation until + # no changes are possible graph_modified = True - return (model, graph_modified) + # Immediately exit here to propagate the data type changes + # before considering the next node + break + # Some data types might have changed, do one pass of data type inference + # to propagate these changes through the graph + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the graph actually + # has been transformed to exhaustively apply this transformation again. + return model, graph_modified From 3109645cb2a2bb764bd982948a36e2788756efc1 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Sat, 6 Apr 2024 17:10:36 +0200 Subject: [PATCH 03/23] [Tests] Rework test-cases for reworked RoundAndClipThresholds See https://github.com/Xilinx/finn/issues/978 --- .../streamline/test_round_thresholds.py | 257 ++++++++++++++++-- 1 file changed, 227 insertions(+), 30 deletions(-) diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py index 85c60b37d5..63375598a0 100644 --- a/tests/transformation/streamline/test_round_thresholds.py +++ b/tests/transformation/streamline/test_round_thresholds.py @@ -26,45 +26,242 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# fmt: off +# Disable formatter. 
This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. + +# Testing framework import pytest +# Use numpy for python execution / computing the ground truth expected values import numpy as np + +# Utility types and function for creating onnx nodes and graphs from onnx import TensorProto, helper + +# QONNX data types like INT25 from qonnx.core.datatype import DataType + +# QONNX wrapper of ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper -from qonnx.util.basic import qonnx_make_model +# Generate random tensors of QONNX/FINN data types for testing +from qonnx.util.basic import gen_finn_dt_tensor + +# Execution of onnx graphs within FINN import finn.core.onnx_exec as oxe + +# The transformation to be tested from finn.transformation.streamline import RoundAndClipThresholds -@pytest.mark.streamline -def test_round_thresholds(): - v = helper.make_tensor_value_info("v", TensorProto.FLOAT, [1, 4]) - thresholds = helper.make_tensor_value_info("thresholds", TensorProto.FLOAT, [4, 1]) - out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, 4]) - node_def = helper.make_node( - "MultiThreshold", ["v", "thresholds"], ["out"], domain="qonnx.custom_op.general" +# Tests the RoundAndClipThresholds transformation under various input, output +# data type combinations with purely integer inputs. Without proper rounding, +# this tests only the clipping, range and type-casting behavior of the +# transformation. +@pytest.mark.parametrize("i_dtype", [ + # Explanation for selecting these test configurations: + # 1. Below 24-bit thresholds we will not observe any interesting rounding + # behavior, as all integers < 2^24 can be exactly represented in 32-bit + # floating-point. Thus, we test thresholds at 25-bit signed integers and + # generate test inputs slightly above and below this. + # 2. 
We want to test out-of-range clipping of thresholds, in particular + # clipping of the negative portion of signed thresholds. Thus, we only + # generate signed thresholds, but test with signed and unsigned + # inputs of smaller, larger and equal range. + # 3. Testing proper floating-point thresholds requires a separate test-case + "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26" +]) +@pytest.mark.parametrize("o_dtype", [ + # Explanation for selecting these test configurations: + # 1. Outputs of MultiThreshold are typically much smaller bit-width than the + # inputs and thresholds. + # 2. However, with randomly sampled thresholds from a rather large range due + # to the selected input bit-widths (see above), we risk not adequately + # covering the input range if we sample too few thresholds. The number of + # thresholds sampled depends on the bit-width of the output, thus we use + # rather high bit-width for testing. + # 3. For a "real" model, the quantization procedure *should* take care of + # adequately covering the true input range. + "INT8", "UINT8" +]) +@pytest.mark.parametrize("n_elems", [ + # Explanation for selecting these test configurations: + # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 + # 2. 
Large test case 256, hopefully amplifying any rarely occurring errors + 1, 2, 3, 4, 256 +]) +def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): + # Convert string representation of data type to onnx DataType + i_dtype = DataType[i_dtype] + t_dtype = DataType["INT25"] # Note: Matches configuration above + o_dtype = DataType[o_dtype] # noqa: Duplicate model setup code + # Create a dummy MultiThreshold operation to be tested + node = helper.make_node( + # Op-Type of the node + "MultiThreshold", + # MultiThreshold is implemented under the qonnx domain + domain="qonnx.custom_op.general", + # List the names of the input tensors + inputs=["inp", "thresholds"], + # List the names of the output tensors + outputs=["out"], + # The CustomOp needs to know the data type of the output to be produced + out_dtype=str(o_dtype) + ) + # Number of threshold values required to produce outputs of type o_dtype + n_thresholds = o_dtype.get_num_possible_values() - 1 + # Create tensor value infos for all input/output tensors involved + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems]) + out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems]) + # Create a tensor value info for the thresholds parameter tensor + # Note: Number of thresholds is determined by the output data type + thresholds = helper.make_tensor_value_info( + "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds] + ) + # Combine node and tensor value infos into an onnx graph + graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out]) + # Wrap the model graph in a ModelWrapper container + model = ModelWrapper(helper.make_model(graph)) + # Sample random tensors of the configured input data type + inp = gen_finn_dt_tensor(i_dtype, [1, n_elems]) + # Generate sorted thresholds for each of the input channels + thresholds = np.sort(gen_finn_dt_tensor(t_dtype, [n_elems, n_thresholds])) + # Set data type annotations for the input and thresholds tensor + 
model.set_tensor_datatype("inp", i_dtype) # noqa: Duplicate model execution + model.set_tensor_datatype("thresholds", t_dtype) + model.set_tensor_datatype("out", o_dtype) + # Set the thresholds as initializer input to the model + model.set_initializer("thresholds", thresholds) + # Execute the model before running the RoundAndClipThresholds transformation + out_expected = oxe.execute_onnx(model, {"inp": inp})["out"] + # Before rounding the threshold data type must be as annotated + assert model.get_tensor_datatype("thresholds") == t_dtype + # Run the transformation to be tested + model = model.transform(RoundAndClipThresholds()) + # After this transformation, the thresholds and output data type should be + # inferred correctly + assert model.get_tensor_datatype("thresholds") == i_dtype + assert model.get_tensor_datatype("out") == o_dtype + # After this transformation, the container type used to store the thresholds + # values must be float32. No other type-cast or type promotion may happen. + assert model.get_initializer("thresholds").dtype == np.float32 + # After rounding, all thresholds must be integers represented as float32 + assert all( + x.is_integer() for x in model.get_initializer("thresholds").flatten() + ) + # Execute the model after running the RoundAndClipThresholds transformation + out_produced = oxe.execute_onnx(model, {"inp": inp})["out"] + # Compare the results before and after: This is the pure integer test-case + # and no actual rounding should happen, thus the rounded operation should + # produce outputs exactly equal. + assert np.all(out_produced == out_expected) + + +# Tests the RoundAndClipThresholds transformation under various input, output +# data type combinations with purely integer inputs. This test case tests actual +# rounding of floating-point thresholds. +@pytest.mark.parametrize("i_dtype", [ + # Explanation for selecting these test configurations: + # 1. 
Below 24-bit thresholds we will not observe any interesting rounding + # behavior, as all integers < 2^24 can be exactly represented in 32-bit + # floating-point. Thus, we test thresholds at 25-bit signed integers and + # generate test inputs slightly above and below this. + # 2. We want to test out-of-range clipping of thresholds, in particular + # clipping of the negative portion of signed thresholds. Thus, we only + # generate signed thresholds, but test with signed and unsigned + # inputs of smaller, larger and equal range. + # 3. Testing proper floating-point thresholds requires a separate test-case + "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26" +]) +@pytest.mark.parametrize("o_dtype", [ + # Explanation for selecting these test configurations: + # 1. Outputs of MultiThreshold are typically much smaller bit-width than the + # inputs and thresholds. + # 2. However, with randomly sampled thresholds from a rather large range due + # to the selected input bit-widths (see above), we risk not adequately + # covering the input range if we sample too few thresholds. The number of + # thresholds sampled depends on the bit-width of the output, thus we use + # rather high bit-width for testing. + # 3. For a "real" model, the quantization procedure *should* take care of + # adequately covering the true input range. + "INT8", "UINT8" +]) +@pytest.mark.parametrize("n_elems", [ + # Explanation for selecting these test configurations: + # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 + # 2. 
Large test case 256, hopefully amplifying any rarely occurring errors + 1, 2, 3, 4, 256 +]) +def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems): + # Convert string representation of data type to onnx DataType + i_dtype = DataType[i_dtype] + t_dtype = DataType["FLOAT32"] + o_dtype = DataType[o_dtype] # noqa: Duplicate model setup code + # Create a dummy MultiThreshold operation to be tested + node = helper.make_node( + # Op-Type of the node + "MultiThreshold", + # MultiThreshold is implemented under the qonnx domain + domain="qonnx.custom_op.general", + # List the names of the input tensors + inputs=["inp", "thresholds"], + # List the names of the output tensors + outputs=["out"], + # The CustomOp needs to know the data type of the output to be produced + out_dtype=str(o_dtype) + ) + # Number of threshold values required to produce outputs of type o_dtype + n_thresholds = o_dtype.get_num_possible_values() - 1 + # Create tensor value infos for all input/output tensors involved + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems]) + out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems]) + # Create a tensor value info for the thresholds parameter tensor + # Note: Number of thresholds is determined by the output data type + thresholds = helper.make_tensor_value_info( + "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds] + ) + # Combine node and tensor value infos into an onnx graph + graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out]) + # Wrap the model graph in a ModelWrapper container + model = ModelWrapper(helper.make_model(graph)) + # Sample random tensors of the configured input data type + inp = gen_finn_dt_tensor(i_dtype, [1, n_elems]) + # Draw uniformly random prototype thresholds in [0,+1] range + thresholds = np.random.rand(n_elems, n_thresholds) + # Type alias to 25-bit signed integer type used to set the range of the + # thresholds + INT25 = DataType["INT25"] # noqa: 
Variable name not lowercase + # Map the prototype thresholds into the test integer range and sort + thresholds = np.sort((INT25.max() - INT25.min()) * thresholds + INT25.min()) + # Set data type annotations for the input and thresholds tensor + model.set_tensor_datatype("inp", i_dtype) # noqa: Duplicate model execution + model.set_tensor_datatype("thresholds", t_dtype) + model.set_tensor_datatype("out", o_dtype) + # Set the thresholds as initializer input to the model + model.set_initializer("thresholds", thresholds) + # Execute the model before running the RoundAndClipThresholds transformation + out_expected = oxe.execute_onnx(model, {"inp": inp})["out"] + # Before rounding the threshold data type must be as annotated + assert model.get_tensor_datatype("thresholds") == t_dtype + # Run the transformation to be tested + model = model.transform(RoundAndClipThresholds()) + # After this transformation, the thresholds and output data type should be + # inferred correctly + assert model.get_tensor_datatype("thresholds") == i_dtype + assert model.get_tensor_datatype("out") == o_dtype + # After this transformation, the container type used to store the thresholds + # values must be float32. No other type-cast or type promotion may happen. 
+ assert model.get_initializer("thresholds").dtype == np.float32 + # After rounding, all thresholds must be integers represented as float32 + assert all( + x.is_integer() for x in model.get_initializer("thresholds").flatten() ) - graph_def = helper.make_graph([node_def], "test_model", [v, thresholds], [out]) - model_def = qonnx_make_model(graph_def) - model = ModelWrapper(model_def) - threshold_val = np.asarray([[-1.1], [0.7], [2.3], [5.1]], dtype=np.float32) - model.set_initializer("thresholds", threshold_val) - model.set_tensor_datatype("v", DataType["INT8"]) - inp_dict_f = {"v": np.floor(threshold_val).T} - inp_dict_n = {"v": np.round(threshold_val).T} - inp_dict_c = {"v": np.ceil(threshold_val).T} - orig_f = oxe.execute_onnx(model, inp_dict_f)["out"] - orig_n = oxe.execute_onnx(model, inp_dict_n)["out"] - orig_c = oxe.execute_onnx(model, inp_dict_c)["out"] - assert model.get_tensor_datatype("thresholds") == DataType["FLOAT32"] - new_model = model.transform(RoundAndClipThresholds()) - # rounded up thresholds should have same dtype as input - assert new_model.get_tensor_datatype("thresholds") == DataType["INT8"] - new_f = oxe.execute_onnx(new_model, inp_dict_f)["out"] - new_n = oxe.execute_onnx(new_model, inp_dict_n)["out"] - new_c = oxe.execute_onnx(new_model, inp_dict_c)["out"] - assert np.isclose(orig_f, new_f, atol=1e-3).all() - assert np.isclose(orig_n, new_n, atol=1e-3).all() - assert np.isclose(orig_c, new_c, atol=1e-3).all() + # Execute the model after running the RoundAndClipThresholds transformation + out_produced = oxe.execute_onnx(model, {"inp": inp})["out"] + # Compare the results before and after: This is the floating-point test with + # actual rounding, thus the transformed result may only be equal within some + # tolerance. + # Hm, never observed this to be relevant. For all test configurations, exact + # equality seems to hold, probably due to only integer inputs being tested. 
+ assert np.allclose(out_produced, out_expected, atol=1.0e-3) From e22201f800a573b88d55f9b0024454a8e10fa0d4 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 16 Aug 2024 16:15:40 +0100 Subject: [PATCH 04/23] [HWop-MVAU] Ensure shape is compatible in execution function --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 1c86ae7b7a..8f0a987bce 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -130,6 +130,8 @@ def get_nodeattr_types(self): def execute_node(self, context, graph): node = self.onnx_node in_act = context[node.input[0]] + # ensure that shape is compatible + in_act = in_act.reshape(self.get_normal_input_shape()) mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] mvau_w = np_helper.to_array(mvau_w_init) # Matrix multiplication From ec5613c68f209202cf7fefb21d383b0072a2441f Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 19 Sep 2024 10:15:08 +0100 Subject: [PATCH 05/23] [InsertFIFO] Preserve onnx tensor dtype when inserting FIFOs --- src/finn/transformation/fpgadataflow/insert_fifo.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 9df193efcf..21fb843052 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -29,7 +29,6 @@ import numpy as np import warnings -from onnx import TensorProto from onnx import helper as oh from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -114,6 +113,8 @@ def apply(self, model): # determine fifo node attributes fld_shape = n0.get_folded_output_shape() dtype = 
n0.get_output_datatype() + n0_otensor = model.get_tensor_valueinfo(output_name) + n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type # check if folded_shape of output of first node and # input of the second node is equal @@ -145,7 +146,7 @@ def apply(self, model): # or unless create_shallow_fifos is specified fifo_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, n0.get_normal_output_shape(), ) graph.value_info.append(fifo_output_tensor) @@ -196,13 +197,15 @@ def apply(self, model): fld_shape = n0.get_folded_input_shape(inp_ind) n_shape = n0.get_normal_input_shape(inp_ind) dtype = n0.get_input_datatype(inp_ind) + n0_itensor = model.get_tensor_valueinfo(graph_in_name) + n0_tensor_dtype = n0_itensor.type.tensor_type.elem_type fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind] if fifo_depth > 2 or self.create_shallow_fifos: # create fifo node fifo_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, n0.get_normal_input_shape(inp_ind), ) graph.value_info.append(fifo_output_tensor) @@ -256,13 +259,15 @@ def apply(self, model): fld_shape = n0.get_folded_output_shape(out_ind) n_shape = n0.get_normal_output_shape(out_ind) dtype = n0.get_output_datatype(out_ind) + n0_otensor = model.get_tensor_valueinfo(graph_out_name) + n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind] if fifo_depth > 2 or self.create_shallow_fifos: # create fifo node fifo_input_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, n0.get_normal_output_shape(), ) graph.value_info.append(fifo_input_tensor) From fb600553d5618d36be334f7dd6c99dea789b0c83 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 19 Sep 2024 10:58:43 +0100 Subject: [PATCH 06/23] [InsertDWC] Preserve onnx tensor dtype when inserting DWCs --- 
src/finn/transformation/fpgadataflow/insert_dwc.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 33cc3e86d3..b56c8b74ea 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -26,7 +26,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from onnx import TensorProto from onnx import helper as oh from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -110,12 +109,15 @@ def apply(self, model): # determine shape for dwc dwc_shape = n0.get_normal_output_shape() - # determine dtype for dwc + # determine FINN dtype for dwc dtype = n0.get_output_datatype() + # determine onnx tensor dtype for dwc + n0_otensor = model.get_tensor_valueinfo(output_name) + n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type dwc_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, dwc_shape, ) graph.value_info.append(dwc_output_tensor) From 03830929697464666b58be717ece8328bc6c6965 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 16 Sep 2024 13:28:15 +0000 Subject: [PATCH 07/23] [Fix] InferDuplicateStreamsLayer now properly handles forks of multiple-output nodes --- .../fpgadataflow/convert_to_hw_layers.py | 96 +++++++++---------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 25a2032aeb..b02bc89db8 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -585,63 +585,63 @@ def apply(self, model): for node in graph.node: node_ind += 1 - 
successors = model.find_consumers(node.output[0]) - if successors is not None and len(successors) >= 2: - output_tensor = node.output[0] - n_outputs = len(successors) + for output_tensor in node.output: + successors = model.find_consumers(output_tensor) + if successors is not None and len(successors) >= 2: + n_outputs = len(successors) - dt = model.get_tensor_datatype(output_tensor) + dt = model.get_tensor_datatype(output_tensor) - # skip conversion for layers with float input - if not dt.is_integer(): - continue + # skip conversion for layers with float input + if not dt.is_integer(): + continue - # create clone tensors - out_shape = model.get_tensor_shape(output_tensor) - out_tensor_clones = [] - for i in range(n_outputs): - clone = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape - ) - model.graph.value_info.append(clone) - out_tensor_clones += [clone.name] + # create clone tensors + out_shape = model.get_tensor_shape(output_tensor) + out_tensor_clones = [] + for i in range(n_outputs): + clone = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(clone) + out_tensor_clones += [clone.name] - num_ch = int(out_shape[-1]) - vecs = out_shape[:-1] + num_ch = int(out_shape[-1]) + vecs = out_shape[:-1] - # create node with no parallelization first - pe = 1 + # create node with no parallelization first + pe = 1 - dup_node = helper.make_node( - "DuplicateStreams", - [output_tensor], - out_tensor_clones, - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=num_ch, - PE=pe, - inputDataType=dt.name, - numInputVectors=vecs, - NumOutputStreams=n_outputs, - outFIFODepths=[2] * n_outputs, - name="DuplicateStreams_" + node.name, - ) + dup_node = helper.make_node( + "DuplicateStreams", + [output_tensor], + out_tensor_clones, + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_ch, + PE=pe, + 
inputDataType=dt.name, + numInputVectors=vecs, + NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, + name="DuplicateStreams_" + node.name, + ) - graph.node.insert(node_ind, dup_node) + graph.node.insert(node_ind, dup_node) - # connect successors to out tensor clone - clone_idx = 0 - for successor in successors: - for i, succ_input in enumerate(successor.input): - if succ_input == output_tensor: - successor.input[i] = out_tensor_clones[clone_idx] - clone_idx += 1 - # if one node has multiple connections to the same output - # find_direct_successors will return one node per input - # so break the inner loop will result in correct behaviour - break + # connect successors to out tensor clone + clone_idx = 0 + for successor in successors: + for i, succ_input in enumerate(successor.input): + if succ_input == output_tensor: + successor.input[i] = out_tensor_clones[clone_idx] + clone_idx += 1 + # if one node has multiple connections to the same output + # find_direct_successors will return one node per input + # so break the inner loop will result in correct behaviour + break - graph_modified = True + graph_modified = True if graph_modified: model = model.transform(SortGraph()) From d13aa7e7debb21bd1d75b6dbb6eddc959b4ae8c8 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 16 Sep 2024 13:48:43 +0000 Subject: [PATCH 08/23] [Fix] MoveScalarLinearPastInvariants, MakeMaxPoolNHWC, MakeScaleResizeNHWC transformations are checking whether the node to be moved is a fork node, in which case the MoveOpPastFork is called. MoveOpPastFork uses deepcopies of the original node. 
--- src/finn/transformation/streamline/reorder.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 8ac2d7dad6..9a7e9d0723 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -29,6 +29,7 @@ import numpy as np import qonnx.core.data_layout as DataLayout import warnings +from copy import deepcopy from onnx import TensorProto from onnx import helper as oh from qonnx.core.datatype import DataType @@ -641,6 +642,10 @@ def apply(self, model): # if initializer is not scalar, skip if np.prod(init0.shape) != 1: continue + if model.is_fork_node(prod0): + model = model.transform(MoveOpPastFork(prod0.op_type)) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) # Flatten input if required if len(init0.shape) > 0: init0 = init0.flatten()[0] @@ -713,6 +718,12 @@ def apply(self, model): elif producer is not None and producer.op_type == "Transpose": perms = list(get_by_name(producer.attribute, "perm").ints) if perms == [0, 3, 1, 2]: + # check if the producer is a fork node + # (need to move it past the fork before this transform) + if model.is_fork_node(producer): + model = model.transform(MoveTransposePastFork()) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) ceil_mode = get_by_name(n.attribute, "ceil_mode") if ceil_mode is not None: ceil_mode = ceil_mode.i @@ -764,6 +775,12 @@ def apply(self, model): if producer is not None and producer.op_type == "Transpose": perms = list(get_by_name(producer.attribute, "perm").ints) if perms == [0, 3, 1, 2]: + # check if the producer is a fork node + # (need to move it past the fork before this transform) + if model.is_fork_node(producer): + model = model.transform(MoveTransposePastFork()) + # topology modified, "ask" ModelWrapper to apply this transform 
again + return (model, True) old_value = model.get_initializer(n.input[scales_ind]) new_value = np.array( [old_value[idx] for idx in (0, 2, 3, 1)], @@ -813,10 +830,9 @@ class MoveOpPastFork(Transformation): can be merged with nodes in the branches """ - def __init__(self, op_name_list, get_attrs_fxn=lambda x: {}): + def __init__(self, op_name_list): super().__init__() self.ops_to_move = op_name_list - self.get_attrs_fxn = get_attrs_fxn def apply(self, model): graph = model.graph @@ -859,11 +875,9 @@ def apply(self, model): new_param_name = model.make_new_valueinfo_name() new_inp_list = [n.input[0], new_param_name] model.set_initializer(new_param_name, op_init_param) - attrs = self.get_attrs_fxn(n) - # TODO use copy of original node instead to get attrs? - new_node = oh.make_node( - n.op_type, new_inp_list, [new_output_tensor_name], **attrs - ) + new_node = deepcopy(n) + new_node.input[:] = new_inp_list + new_node.output[:] = [new_output_tensor_name] graph.node.insert(node_ind, new_node) node_ind += 1 @@ -901,7 +915,7 @@ def __init__(self): class MoveTransposePastFork(MoveOpPastFork): def __init__(self): - super().__init__(["Transpose"], lambda x: {"perm": get_by_name(x.attribute, "perm").ints}) + super().__init__(["Transpose"]) class MoveMaxPoolPastMultiThreshold(Transformation): From 6223abe86c7d9aee43788825f3c19545dab0ea54 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 16 Sep 2024 13:59:14 +0000 Subject: [PATCH 09/23] [Fix] InsertFIFO transform is fixed for the case of the last node in the graph being a fork node --- src/finn/transformation/fpgadataflow/insert_fifo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 21fb843052..9ed0f51cd4 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -268,7 +268,7 @@ def apply(self, model): fifo_input_tensor 
= oh.make_tensor_value_info( model.make_new_valueinfo_name(), n0_tensor_dtype, - n0.get_normal_output_shape(), + n0.get_normal_output_shape(out_ind), ) graph.value_info.append(fifo_input_tensor) model.set_tensor_datatype(fifo_input_tensor.name, dtype) @@ -294,7 +294,7 @@ def apply(self, model): graph.node.append(fifo_node) # set fifo output tensor as new input tensor of second node - final_node.output[0] = fifo_input_tensor.name + final_node.output[out_ind] = fifo_input_tensor.name else: warnings.warn( """Output FIFO for %s has depth %d and won't From 11d8234fdcfb03c00a700dd3ba82cb88d6da66e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 25 Sep 2024 13:27:04 +0100 Subject: [PATCH 10/23] Harden lane width computations against 32-bit numeric overflow. --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 107a00918e..dabb36647e 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -72,6 +72,10 @@ module mvu_8sx8u_dsp48 #( return res; endfunction : init_leave_loads + function int unsigned sum_width(input int unsigned n, input int unsigned w); + return w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n); + endfunction : sum_width + // Pipeline for last indicator flag logic [1:5] L = '0; always_ff @(posedge clk) begin @@ -445,7 +449,7 @@ module mvu_8sx8u_dsp48 #( // Stage #4: Cross-SIMD Reduction // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? 
init_leave_loads() : '{ default: 0 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop // Range of Cross-lane Contribution Tracked in Hi4 /* @@ -462,7 +466,7 @@ module mvu_8sx8u_dsp48 #( * signed value is determined by its lower bound to be at least: * 1 + $clog2(2^(w-1)+SIMD) */ - localparam int unsigned HI_WIDTH = 1 + $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD); + localparam int unsigned HI_WIDTH = 1 + ($clog2(SIMD) < ACCU_WIDTH-D[1]? ACCU_WIDTH-D[1] : $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD)); uwire signed [ACCU_WIDTH -1:0] up4; uwire signed [HI_WIDTH -1:0] hi4; @@ -504,12 +508,12 @@ module mvu_8sx8u_dsp48 #( // Conclusive low part accumulation if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions (all unsigned arithmetic) - localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + localparam int unsigned ROOT_WIDTH = sum_width(SIMD, LO_WIDTH); uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + localparam int unsigned NODE_WIDTH = sum_width(LEAVE_LOAD[n], LO_WIDTH); uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; assign tree[n] = s; end From 945a4a4c7e341b3d5acaa929e51672babe70bc36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 25 Sep 2024 13:33:04 +0100 Subject: [PATCH 11/23] Adding testbench having two accumulator sized run against one another. 
--- finn-rtllib/mvu/tb/mvu_accu_tb.dat | 192 +++++++++++++++++++++++++++++ finn-rtllib/mvu/tb/mvu_accu_tb.sv | 162 ++++++++++++++++++++++++ 2 files changed, 354 insertions(+) create mode 100644 finn-rtllib/mvu/tb/mvu_accu_tb.dat create mode 100644 finn-rtllib/mvu/tb/mvu_accu_tb.sv diff --git a/finn-rtllib/mvu/tb/mvu_accu_tb.dat b/finn-rtllib/mvu/tb/mvu_accu_tb.dat new file mode 100644 index 0000000000..7e102ab6ab --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_accu_tb.dat @@ -0,0 +1,192 @@ +9 +4 +d +9 +2 +a +d +7 +9 +7 +b +4 +4 +7 +0 +0 +c +9 +9 +1 +9 +0 +a +0 +5 +5 +7 +7 +2 +6 +7 +9 +0 +0 +9 +7 +7 +c +7 +9 +7 +1 +2 +0 +f +7 +1 +7 +f +7 +1 +7 +1 +6 +6 +9 +e +f +e +a +6 +1 +7 +9 +d +a +7 +7 +f +4 +7 +f +9 +f +9 +1 +9 +f +7 +3 +4 +1 +1 +0 +d +c +d +b +9 +9 +f +7 +0 +5 +e +6 +7 +e +7 +1 +7 +0 +e +3 +c +4 +9 +7 +9 +9 +d +e +c +1 +f +7 +0 +7 +1 +7 +d +0 +7 +e +a +1 +9 +4 +b +7 +9 +0 +a +e +6 +7 +2 +9 +0 +9 +0 +9 +1 +9 +0 +0 +7 +2 +7 +1 +5 +9 +1 +9 +6 +7 +c +1 +9 +d +9 +f +c +9 +9 +9 +b +b +9 +f +9 +5 +1 +3 +0 +9 +0 +9 +2 +a +9 +0 +f +0 +7 +0 +a +7 +3 +e +5 +7 diff --git a/finn-rtllib/mvu/tb/mvu_accu_tb.sv b/finn-rtllib/mvu/tb/mvu_accu_tb.sv new file mode 100644 index 0000000000..ceeb31194c --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_accu_tb.sv @@ -0,0 +1,162 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. + *****************************************************************************/ + +module mvu_accu_tb; + + localparam IS_MVU = 1; + localparam COMPUTE_CORE = "mvu_8sx8u_dsp48"; + localparam PUMPED_COMPUTE = 0; + localparam MW = 6; + localparam MH = 32; + localparam PE = 1; + localparam SIMD = 1; + localparam ACTIVATION_WIDTH = 8; + localparam WEIGHT_WIDTH = 4; + localparam NARROW_WEIGHTS = 1; + localparam SIGNED_ACTIVATIONS = 1; + localparam SEGMENTLEN = 1; + localparam FORCE_BEHAVIORAL = 0; + + // Safely deducible parameters + localparam WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8; + localparam INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + repeat(16) @(posedge clk); + rst <= 0; + end + + logic [WEIGHT_WIDTH-1:0] WeightMem[MH*MW]; + initial $readmemh("mvu_accu_tb.dat", WeightMem); + + // Shared Input Feed + logic [INPUT_STREAM_WIDTH_BA-1:0] in_TDATA; + logic in_TVALID[2]; + uwire in_TREADY[2]; + initial begin + in_TDATA = 'x; + in_TVALID = '{ default: 0 }; + @(posedge clk iff !rst); + + repeat(2161*MW) begin + automatic logic [ACTIVATION_WIDTH-1:0] a = $urandom(); + in_TDATA <= a; + in_TVALID <= '{ default: 1 }; + fork + begin + @(posedge clk iff in_TREADY[0]); + in_TVALID[0] <= 0; + end + begin + @(posedge clk iff in_TREADY[1]); + in_TVALID[1] <= 0; + end + join + end + + repeat(MH*MW) @(posedge clk); + $display("Test completed."); + $finish; + end + + // DUTs + localparam int unsigned ACCU_WIDTHS[2] = '{ 16, 32 }; + int OutQ[2][$]; + for(genvar i = 0; i < $size(ACCU_WIDTHS); i++) begin : genDUTs + localparam int unsigned ACCU_WIDTH = ACCU_WIDTHS[i]; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Private Weight Feed + logic [WEIGHT_STREAM_WIDTH_BA-1:0] weights_TDATA; + logic weights_TVALID; + uwire weights_TREADY; + initial begin + weights_TDATA = 'x; + weights_TVALID = 0; + @(posedge clk iff !rst); + + weights_TVALID <= 1; + forever begin + for(int unsigned i = 0; i < MH*MW; i++) begin + weights_TDATA <= WeightMem[i]; + @(posedge clk iff weights_TREADY); + end + end + end + + // Private Output Capture into Queue + uwire signed [OUTPUT_STREAM_WIDTH_BA-1:0] out_TDATA; + uwire out_TVALID; + uwire out_TREADY = !rst; + always_ff @(posedge clk iff !rst) begin + if(out_TVALID) OutQ[i].push_back(out_TDATA); + end + + // Actual DUT Instance + mvu_vvu_axi #( + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), 
.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .NARROW_WEIGHTS(NARROW_WEIGHTS), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) dut ( + .ap_clk(clk), + .ap_clk2x(1'b0), + .ap_rst_n(!rst), + .s_axis_weights_tdata(weights_TDATA), + .s_axis_weights_tvalid(weights_TVALID), + .s_axis_weights_tready(weights_TREADY), + .s_axis_input_tdata(in_TDATA), + .s_axis_input_tvalid(in_TVALID[i]), + .s_axis_input_tready(in_TREADY[i]), + .m_axis_output_tdata(out_TDATA), + .m_axis_output_tvalid(out_TVALID), + .m_axis_output_tready(out_TREADY) + ); + end : genDUTs + + // Output Equivalence Checker + always_ff @(posedge clk) begin + if(OutQ[0].size && OutQ[1].size) begin + automatic int unsigned y0 = OutQ[0].pop_front(); + automatic int unsigned y1 = OutQ[1].pop_front(); + assert(y0 == y1) else begin + $error("Output Mismatch: %0d vs. %0d", y0, y1); + $stop; + end + end + end + +endmodule : mvu_accu_tb From 00c3a83aae2a28d75abc097d2655633fc7d55c0d Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 26 Sep 2024 11:44:18 +0100 Subject: [PATCH 12/23] [RoundThresh] Clean-up transformation and test files --- .../streamline/round_thresholds.py | 43 +-- .../streamline/test_round_thresholds.py | 263 +++++++++--------- 2 files changed, 132 insertions(+), 174 deletions(-) diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 2666242730..ab986e7826 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -26,22 +27,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# Need numpy for modifying the onnx graph tensors, which are numpy style arrays import numpy as np - -# QONNX wrapper of ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper - -# QONNX graph transformation base class from qonnx.transformation.base import Transformation - -# Transformation running qonnx datatype inference from qonnx.transformation.infer_datatypes import InferDataTypes -# Rounds and clips thresholds to integer values if the node inputs are integer, -# respecting range, representability and data type (promotion) of the container -# data type class RoundAndClipThresholds(Transformation): """For MultiThreshold nodes operating on integer inputs, round up thresholds values to the nearest integer. Additionally, if the input @@ -50,29 +41,19 @@ class RoundAndClipThresholds(Transformation): annotation). 
Runs InferDataTypes() afterward to propagate any changes to the quantization data types.""" - # Applies the transform to a whole model graph def apply(self, model: ModelWrapper): # noqa - # Get the model graph out of the model wrapper object graph = model.graph - # Keep track of whether the graph has been modified graph_modified = False - # Iterate all nodes in the graph keeping track of the index for index, node in enumerate(graph.node): - # Applies to initializer tensors of MultiThreshold operations - if node.op_type == "MultiThreshold": - # Try to get the thresholds initializer tensor + op_type = node.op_type + if op_type == "MultiThreshold": thresholds = model.get_initializer(node.input[1]) - # There might be no constant thresholds stored as initializer - # tensor inside the model if thresholds is None: - # Nothing we can do, skip to the next node continue - # Get the data type of the inputs to this operation dtype = model.get_tensor_datatype(node.input[0]) # This transformation only applies to thresholding operations # operating on integer inputs if not dtype.is_integer(): - # Nothing we can do, skip to the next node continue # Round thresholds up to nearest integer and clip thresholds # outside the input range @@ -80,24 +61,14 @@ def apply(self, model: ModelWrapper): # noqa # introduce extra inaccuracies due to large integers not being # exactly representable in floating-point representation. # See for example: np.ceil(np.float32(16777217)) == 16777216 - # fmt: off - new_thresholds = np.clip( - np.ceil(thresholds), dtype.min(), dtype.max() - ) - # fmt: on + new_thresholds = np.clip(np.ceil(thresholds), dtype.min(), dtype.max()) # Convert back to the preferred float32 container type - # Note: np.clip might have promoted the thresholds to float64 - # TODO: Maybe consider an int64 container type for thresholds - # rounded to integer? 
Need to check all other transformations - # and code generation through the whole FINN and QONNX stack - # first, as these probably assume a float32 container type. new_thresholds = new_thresholds.astype(np.float32) # Insert the rounded and clipped thresholds back into the model model.set_initializer(node.input[1], new_thresholds) # The rounded and clipped thresholds now fit into the input data # type model.set_tensor_datatype(node.input[1], dtype) - # Test whether the new thresholds actually differ from the old # ones if np.any(new_thresholds != thresholds): # Track the graph has been modified to inform the transform @@ -107,9 +78,5 @@ def apply(self, model: ModelWrapper): # noqa # Immediately exit here to propagate the data type changes # before considering the next node break - # Some data types might have changed, do one pass of data type inference - # to propagate these changes through the graph model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the graph actually - # has been transformed to exhaustively apply this transformation again. return model, graph_modified diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py index 63375598a0..7e2d39176e 100644 --- a/tests/transformation/streamline/test_round_thresholds.py +++ b/tests/transformation/streamline/test_round_thresholds.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx, Inc. +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,32 +27,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# fmt: off -# Disable formatter. This is deliberately formatted to stay within 80 characters -# per line. Black, however, formats some lines going beyond this. 
- -# Testing framework import pytest -# Use numpy for python execution / computing the ground truth expected values import numpy as np - -# Utility types and function for creating onnx nodes and graphs from onnx import TensorProto, helper - -# QONNX data types like INT25 from qonnx.core.datatype import DataType - -# QONNX wrapper of ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper - -# Generate random tensors of QONNX/FINN data types for testing from qonnx.util.basic import gen_finn_dt_tensor -# Execution of onnx graphs within FINN import finn.core.onnx_exec as oxe - -# The transformation to be tested from finn.transformation.streamline import RoundAndClipThresholds @@ -59,173 +43,186 @@ # data type combinations with purely integer inputs. Without proper rounding, # this tests only the clipping, range and type-casting behavior of the # transformation. -@pytest.mark.parametrize("i_dtype", [ - # Explanation for selecting these test configurations: - # 1. Below 24-bit thresholds we will not observe any interesting rounding - # behavior, as all integers < 2^24 can be exactly represented in 32-bit - # floating-point. Thus, we test thresholds at 25-bit signed integers and - # generate test inputs slightly above and below this. - # 2. We want to test out-of-range clipping of thresholds, in particular - # clipping of the negative portion of signed thresholds. Thus, we only - # generate signed thresholds, but test with signed and unsigned - # inputs of smaller, larger and equal range. - # 3. Testing proper floating-point thresholds requires a separate test-case - "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26" -]) -@pytest.mark.parametrize("o_dtype", [ - # Explanation for selecting these test configurations: - # 1. Outputs of MultiThreshold are typically much smaller bit-width than the - # inputs and thresholds. - # 2. 
However, with randomly samples thresholds from a rather large range due - # to the selected input bit-widths (see above), we risk not adequately - # covering the input range if we sample too few thresholds. The number of - # thresholds sampled depends on the bit-width of the output, thus we use - # rather high bit-width for testing. - # 3. For a "real" model, the quantization procedure *should* take care of - # adequately covering the true input range. - "INT8", "UINT8" -]) -@pytest.mark.parametrize("n_elems", [ - # Explanation for selecting these test configurations: - # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 - # 2. Large test case 256, hopefully amplifying any rarely occurring errors - 1, 2, 3, 4, 256 -]) +@pytest.mark.parametrize( + "i_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Below 24-bit thresholds we will not observe any interesting rounding + # behavior, as all integers < 2^24 can be exactly represented in 32-bit + # floating-point. Thus, we test thresholds at 25-bit signed integers and + # generate test inputs slightly above and below this. + # 2. We want to test out-of-range clipping of thresholds, in particular + # clipping of the negative portion of signed thresholds. Thus, we only + # generate signed thresholds, but test with signed and unsigned + # inputs of smaller, larger and equal range. + # 3. Testing proper floating-point thresholds requires a separate test-case + "INT23", + "UINT23", + "INT24", + "UINT24", + "INT25", + "UINT25", + "INT26", + "UINT26", + ], +) +@pytest.mark.parametrize( + "o_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Outputs of MultiThreshold are typically much smaller bit-width than the + # inputs and thresholds. + # 2. However, with randomly samples thresholds from a rather large range due + # to the selected input bit-widths (see above), we risk not adequately + # covering the input range if we sample too few thresholds. 
The number of + # thresholds sampled depends on the bit-width of the output, thus we use + # rather high bit-width for testing. + # 3. For a "real" model, the quantization procedure *should* take care of + # adequately covering the true input range. + "INT8", + "UINT8", + ], +) +@pytest.mark.parametrize( + "n_elems", + [ + # Explanation for selecting these test configurations: + # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 + # 2. Large test case 256, hopefully amplifying any rarely occurring errors + 1, + 2, + 3, + 4, + 256, + ], +) def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): - # Convert string representation of data type to onnx DataType i_dtype = DataType[i_dtype] t_dtype = DataType["INT25"] # Note: Matches configuration above o_dtype = DataType[o_dtype] # noqa: Duplicate model setup code - # Create a dummy MultiThreshold operation to be tested node = helper.make_node( - # Op-Type of the node "MultiThreshold", - # MultiThreshold is implemented under the qonnx domain domain="qonnx.custom_op.general", - # List the names of the input tensors inputs=["inp", "thresholds"], - # List the names of the output tensors outputs=["out"], - # The CustomOp needs to know the data type of the output to be produced - out_dtype=str(o_dtype) + out_dtype=str(o_dtype), ) - # Number of threshold values required to produce outputs of type o_dtype n_thresholds = o_dtype.get_num_possible_values() - 1 - # Create tensor value infos for all input/output tensors involved inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems]) out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems]) - # Create a tensor value info for the thresholds parameter tensor - # Note: Number of thresholds is determined by the output data type thresholds = helper.make_tensor_value_info( "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds] ) - # Combine node and tensor value infos into an onnx graph graph = helper.make_graph([node], 
"thresholds", [inp, thresholds], [out]) - # Wrap the model graph in a ModelWrapper container model = ModelWrapper(helper.make_model(graph)) - # Sample random tensors of the configured input data type + inp = gen_finn_dt_tensor(i_dtype, [1, n_elems]) - # Generate sorted thresholds for each of the input channels thresholds = np.sort(gen_finn_dt_tensor(t_dtype, [n_elems, n_thresholds])) - # Set data type annotations for the input and thresholds tensor model.set_tensor_datatype("inp", i_dtype) # noqa: Duplicate model execution model.set_tensor_datatype("thresholds", t_dtype) model.set_tensor_datatype("out", o_dtype) - # Set the thresholds as initializer input to the model model.set_initializer("thresholds", thresholds) + # Execute the model before running the RoundAndClipThresholds transformation out_expected = oxe.execute_onnx(model, {"inp": inp})["out"] - # Before rounding the threshold data type must be as annotated assert model.get_tensor_datatype("thresholds") == t_dtype - # Run the transformation to be tested + model = model.transform(RoundAndClipThresholds()) + # After this transformation, the thresholds and output data type should be # inferred correctly assert model.get_tensor_datatype("thresholds") == i_dtype assert model.get_tensor_datatype("out") == o_dtype + # After this transformation, the container type used to store the thresholds # values must be float32. No other type-cast or type promotion may happen. 
assert model.get_initializer("thresholds").dtype == np.float32 + # After rounding, all thresholds must be integers represented as float32 - assert all( - x.is_integer() for x in model.get_initializer("thresholds").flatten() - ) + assert all(x.is_integer() for x in model.get_initializer("thresholds").flatten()) + # Execute the model after running the RoundAndClipThresholds transformation out_produced = oxe.execute_onnx(model, {"inp": inp})["out"] - # Compare the results before and after: This is the pure integer test-case - # and no actual rounding should happen, thus the rounded operation should - # produce outputs exactly equal. + assert np.all(out_produced == out_expected) # Tests the RoundAndClipThresholds transformation under various input, output # data type combinations with purely integer inputs. This test case tests actual # rounding of floating-point thresholds. -@pytest.mark.parametrize("i_dtype", [ - # Explanation for selecting these test configurations: - # 1. Below 24-bit thresholds we will not observe any interesting rounding - # behavior, as all integers < 2^24 can be exactly represented in 32-bit - # floating-point. Thus, we test thresholds at 25-bit signed integers and - # generate test inputs slightly above and below this. - # 2. We want to test out-of-range clipping of thresholds, in particular - # clipping of the negative portion of signed thresholds. Thus, we only - # generate signed thresholds, but test with signed and unsigned - # inputs of smaller, larger and equal range. - # 3. Testing proper floating-point thresholds requires a separate test-case - "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26" -]) -@pytest.mark.parametrize("o_dtype", [ - # Explanation for selecting these test configurations: - # 1. Outputs of MultiThreshold are typically much smaller bit-width than the - # inputs and thresholds. - # 2. 
However, with randomly samples thresholds from a rather large range due - # to the selected input bit-widths (see above), we risk not adequately - # covering the input range if we sample too few thresholds. The number of - # thresholds sampled depends on the bit-width of the output, thus we use - # rather high bit-width for testing. - # 3. For a "real" model, the quantization procedure *should* take care of - # adequately covering the true input range. - "INT8", "UINT8" -]) -@pytest.mark.parametrize("n_elems", [ - # Explanation for selecting these test configurations: - # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 - # 2. Large test case 256, hopefully amplifying any rarely occurring errors - 1, 2, 3, 4, 256 -]) +@pytest.mark.parametrize( + "i_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Below 24-bit thresholds we will not observe any interesting rounding + # behavior, as all integers < 2^24 can be exactly represented in 32-bit + # floating-point. Thus, we test thresholds at 25-bit signed integers and + # generate test inputs slightly above and below this. + # 2. We want to test out-of-range clipping of thresholds, in particular + # clipping of the negative portion of signed thresholds. Thus, we only + # generate signed thresholds, but test with signed and unsigned + # inputs of smaller, larger and equal range. + # 3. Testing proper floating-point thresholds requires a separate test-case + "INT23", + "UINT23", + "INT24", + "UINT24", + "INT25", + "UINT25", + "INT26", + "UINT26", + ], +) +@pytest.mark.parametrize( + "o_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Outputs of MultiThreshold are typically much smaller bit-width than the + # inputs and thresholds. + # 2. However, with randomly samples thresholds from a rather large range due + # to the selected input bit-widths (see above), we risk not adequately + # covering the input range if we sample too few thresholds. 
The number of + # thresholds sampled depends on the bit-width of the output, thus we use + # rather high bit-width for testing. + # 3. For a "real" model, the quantization procedure *should* take care of + # adequately covering the true input range. + "INT8", + "UINT8", + ], +) +@pytest.mark.parametrize( + "n_elems", + [ + # Explanation for selecting these test configurations: + # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 + # 2. Large test case 256, hopefully amplifying any rarely occurring errors + 1, + 2, + 3, + 4, + 256, + ], +) def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems): - # Convert string representation of data type to onnx DataType i_dtype = DataType[i_dtype] t_dtype = DataType["FLOAT32"] o_dtype = DataType[o_dtype] # noqa: Duplicate model setup code - # Create a dummy MultiThreshold operation to be tested node = helper.make_node( - # Op-Type of the node "MultiThreshold", - # MultiThreshold is implemented under the qonnx domain domain="qonnx.custom_op.general", - # List the names of the input tensors inputs=["inp", "thresholds"], - # List the names of the output tensors outputs=["out"], - # The CustomOp needs to know the data type of the output to be produced - out_dtype=str(o_dtype) + out_dtype=str(o_dtype), ) - # Number of threshold values required to produce outputs of type o_dtype n_thresholds = o_dtype.get_num_possible_values() - 1 - # Create tensor value infos for all input/output tensors involved inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems]) out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems]) - # Create a tensor value info for the thresholds parameter tensor - # Note: Number of thresholds is determined by the output data type thresholds = helper.make_tensor_value_info( "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds] ) - # Combine node and tensor value infos into an onnx graph graph = helper.make_graph([node], "thresholds", [inp, thresholds], 
[out]) - # Wrap the model graph in a ModelWrapper container model = ModelWrapper(helper.make_model(graph)) - # Sample random tensors of the configured input data type + inp = gen_finn_dt_tensor(i_dtype, [1, n_elems]) # Draw uniformly random prototype thresholds in [0,+1] range thresholds = np.random.rand(n_elems, n_thresholds) @@ -238,30 +235,24 @@ def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems): model.set_tensor_datatype("inp", i_dtype) # noqa: Duplicate model execution model.set_tensor_datatype("thresholds", t_dtype) model.set_tensor_datatype("out", o_dtype) - # Set the thresholds as initializer input to the model model.set_initializer("thresholds", thresholds) + # Execute the model before running the RoundAndClipThresholds transformation out_expected = oxe.execute_onnx(model, {"inp": inp})["out"] # Before rounding the threshold data type must be as annotated assert model.get_tensor_datatype("thresholds") == t_dtype - # Run the transformation to be tested + model = model.transform(RoundAndClipThresholds()) - # After this transformation, the thresholds and output data type should be - # inferred correctly + assert model.get_tensor_datatype("thresholds") == i_dtype assert model.get_tensor_datatype("out") == o_dtype + # After this transformation, the container type used to store the thresholds # values must be float32. No other type-cast or type promotion may happen. 
assert model.get_initializer("thresholds").dtype == np.float32 # After rounding, all thresholds must be integers represented as float32 - assert all( - x.is_integer() for x in model.get_initializer("thresholds").flatten() - ) - # Execute the model after running the RoundAndClipThresholds transformation + assert all(x.is_integer() for x in model.get_initializer("thresholds").flatten()) + out_produced = oxe.execute_onnx(model, {"inp": inp})["out"] - # Compare the results before and after: This is the floating-point test with - # actual rounding, this the transformed result may only be equal within some - # tolerance. - # Hm, never observed this to be relevant. For all test configurations, exact - # equality seems to hold, probably due to only integer inputs being tested. + assert np.allclose(out_produced, out_expected, atol=1.0e-3) From 717bfc13e2361e767c220a3d298245f04cfd84ef Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 26 Sep 2024 12:57:06 +0100 Subject: [PATCH 13/23] [RoundThresh] Expand rounding of thresholds to hw layers --- src/finn/builder/build_dataflow_steps.py | 2 ++ src/finn/transformation/streamline/round_thresholds.py | 2 +- tests/end2end/test_end2end_bnn_pynq.py | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index bdbcc53d83..ab2280554c 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -121,6 +121,7 @@ ) from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MakeMaxPoolNHWC +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.basic import ( get_rtlsim_trace_depth, pyverilate_get_liveness_threshold_cycles, @@ -503,6 +504,7 @@ def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.minimize_bit_width: model = model.transform(MinimizeWeightBitWidth()) model = 
model.transform(MinimizeAccumulatorWidth()) + model = model.transform(RoundAndClipThresholds()) # make sure the changed datatypes are propagated through the network model = model.transform(InferDataTypes()) return model diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index ab986e7826..907f127896 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -46,7 +46,7 @@ def apply(self, model: ModelWrapper): # noqa graph_modified = False for index, node in enumerate(graph.node): op_type = node.op_type - if op_type == "MultiThreshold": + if op_type == "MultiThreshold" or op_type.startswith("Thresholding"): thresholds = model.get_initializer(node.input[1]) if thresholds is None: continue diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 81c6316ec1..0d3418624a 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -94,6 +94,7 @@ MakeMaxPoolNHWC, MoveScalarLinearPastInvariants, ) +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.basic import get_finn_root, make_build_dir, test_board_map from finn.util.pytorch import ToTensor from finn.util.test import ( @@ -672,6 +673,7 @@ def test_minimize_bit_width(self, topology, wbits, abits, board): model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(RoundAndClipThresholds()) curr_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width") model.save(curr_chkpt_name) From 6ade140e684167100cce408454efbd9c2b4008c3 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 26 Sep 2024 14:20:04 +0100 Subject: [PATCH 14/23] [RoundThresh] Add change of the weight datatype to hw op threshold rounding --- 
src/finn/transformation/streamline/round_thresholds.py | 5 +++++ tests/end2end/test_end2end_mobilenet_v1.py | 1 + 2 files changed, 6 insertions(+) diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 907f127896..ee6a31e3dc 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -29,6 +29,7 @@ import numpy as np from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.transformation.infer_datatypes import InferDataTypes @@ -69,6 +70,10 @@ def apply(self, model: ModelWrapper): # noqa # The rounded and clipped thresholds now fit into the input data # type model.set_tensor_datatype(node.input[1], dtype) + # If hw op we need to set the weight data type attribute as well + if op_type.startswith("Thresholding"): + inst = getCustomOp(node) + inst.set_nodeattr("weightDataType", dtype.name) # ones if np.any(new_thresholds != thresholds): # Track the graph has been modified to inform the transform diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index 01d995c147..4c52277970 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -353,6 +353,7 @@ def test_end2end_mobilenet_minimize_bit_width(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx") model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(RoundAndClipThresholds()) model.save(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") From db353f4fda97df13c593c0a6733e1e3aee9c3ecc Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 27 Sep 2024 15:36:10 +0100 Subject: [PATCH 15/23] [RoundThresh] Allow for range + 1 --- .../streamline/round_thresholds.py | 17 
++++++++++++----- .../test_fpgadataflow_thresholding.py | 11 +++++++---- .../streamline/test_round_thresholds.py | 16 ++++++++++++++-- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index ee6a31e3dc..312db404ac 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -28,6 +28,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np +from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -62,18 +63,24 @@ def apply(self, model: ModelWrapper): # noqa # introduce extra inaccuracies due to large integers not being # exactly representable in floating-point representation. # See for example: np.ceil(np.float32(16777217)) == 16777216 - new_thresholds = np.clip(np.ceil(thresholds), dtype.min(), dtype.max()) + new_thresholds = np.clip(np.ceil(thresholds), dtype.min(), dtype.max() + 1) # Convert back to the preferred float32 container type new_thresholds = new_thresholds.astype(np.float32) # Insert the rounded and clipped thresholds back into the model model.set_initializer(node.input[1], new_thresholds) - # The rounded and clipped thresholds now fit into the input data - # type - model.set_tensor_datatype(node.input[1], dtype) + # The rounded and clipped thresholds now fit into a data type + # that is one bit bigger than the input datatype + # Determine new max_value + max_val = dtype.max() + 1 + if not dtype.signed(): + tdt = DataType.get_smallest_possible(max_val) + else: + tdt = DataType.get_smallest_possible(-(max_val) - 1) + model.set_tensor_datatype(node.input[1], tdt) # If hw op we need to set the weight data type attribute as well if op_type.startswith("Thresholding"): inst = 
getCustomOp(node) - inst.set_nodeattr("weightDataType", dtype.name) + inst.set_nodeattr("weightDataType", tdt.name) # ones if np.any(new_thresholds != thresholds): # Track the graph has been modified to inform the transform diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index fe7ba3d9fb..2079fe7fc5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -49,6 +49,7 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -133,10 +134,8 @@ def make_single_multithresholding_modelwrapper( @pytest.mark.parametrize( "idt_tdt_cfg", [ - (DataType["INT8"], DataType["INT8"]), - (DataType["INT8"], DataType["INT9"]), - (DataType["UINT5"], DataType["UINT5"]), - (DataType["UINT5"], DataType["UINT6"]), + (DataType["INT8"], DataType["INT25"]), + (DataType["UINT5"], DataType["UINT8"]), ], ) @pytest.mark.parametrize("fold", [-1, 1, 2]) @@ -145,6 +144,7 @@ def make_single_multithresholding_modelwrapper( @pytest.mark.parametrize("impl_style", ["hls", "rtl"]) @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) +@pytest.mark.parametrize("round_thresh", [True, False]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow @@ -159,6 +159,7 @@ def test_fpgadataflow_thresholding( impl_style, exec_mode, mem_mode, + round_thresh, ): # the mem_mode parameter can only be used for the hls thresholding # so the test will only be executed once for impl_style=rtl and once skipped @@ -234,6 +235,8 @@ def test_fpgadataflow_thresholding( node = 
model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] inst = getCustomOp(node) inst.set_nodeattr("PE", pe) + if round_thresh is True: + model = model.transform(RoundAndClipThresholds()) model = model.transform(GiveUniqueNodeNames()) if impl_style == "hls": diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py index 7e2d39176e..6de82e6750 100644 --- a/tests/transformation/streamline/test_round_thresholds.py +++ b/tests/transformation/streamline/test_round_thresholds.py @@ -96,6 +96,7 @@ 256, ], ) +@pytest.mark.streamline def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): i_dtype = DataType[i_dtype] t_dtype = DataType["INT25"] # Note: Matches configuration above @@ -106,6 +107,7 @@ def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): inputs=["inp", "thresholds"], outputs=["out"], out_dtype=str(o_dtype), + out_bias=float(o_dtype.min()), ) n_thresholds = o_dtype.get_num_possible_values() - 1 inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems]) @@ -117,6 +119,7 @@ def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): model = ModelWrapper(helper.make_model(graph)) inp = gen_finn_dt_tensor(i_dtype, [1, n_elems]) + inp[0][0] = i_dtype.max() thresholds = np.sort(gen_finn_dt_tensor(t_dtype, [n_elems, n_thresholds])) model.set_tensor_datatype("inp", i_dtype) # noqa: Duplicate model execution model.set_tensor_datatype("thresholds", t_dtype) @@ -131,7 +134,11 @@ def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): # After this transformation, the thresholds and output data type should be # inferred correctly - assert model.get_tensor_datatype("thresholds") == i_dtype + if not i_dtype.signed(): + new_tdt = DataType.get_smallest_possible(i_dtype.max() + 1) + else: + new_tdt = DataType.get_smallest_possible(-(i_dtype.max() + 1) - 1) + assert model.get_tensor_datatype("thresholds") == new_tdt assert 
model.get_tensor_datatype("out") == o_dtype # After this transformation, the container type used to store the thresholds @@ -203,6 +210,7 @@ def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): 256, ], ) +@pytest.mark.streamline def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems): i_dtype = DataType[i_dtype] t_dtype = DataType["FLOAT32"] @@ -244,7 +252,11 @@ def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems): model = model.transform(RoundAndClipThresholds()) - assert model.get_tensor_datatype("thresholds") == i_dtype + if not i_dtype.signed(): + new_tdt = DataType.get_smallest_possible(i_dtype.max() + 1) + else: + new_tdt = DataType.get_smallest_possible(-(i_dtype.max() + 1) - 1) + assert model.get_tensor_datatype("thresholds") == new_tdt assert model.get_tensor_datatype("out") == o_dtype # After this transformation, the container type used to store the thresholds From b250047d444dfdc129bd667ce790c9c7982f2b39 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 11 Oct 2024 09:47:01 +0100 Subject: [PATCH 16/23] [tutorial] Update folding config to new custom operator structure --- tutorials/fpga_flow/folding_config.json | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tutorials/fpga_flow/folding_config.json b/tutorials/fpga_flow/folding_config.json index 642200d02b..bf94f8058d 100644 --- a/tutorials/fpga_flow/folding_config.json +++ b/tutorials/fpga_flow/folding_config.json @@ -1,30 +1,29 @@ { "Defaults": {}, - "Thresholding_Batch_0": { - "PE": 49, - "ram_style": "block" + "Thresholding_rtl_0": { + "PE": 49 }, - "MatrixVectorActivation_0": { + "MVAU_hls_0": { "PE": 16, "SIMD": 49, "ram_style": "block" }, - "MatrixVectorActivation_1": { + "MVAU_hls_1": { "PE": 8, "SIMD": 8, "ram_style": "auto" }, - "MatrixVectorActivation_2": { + "MVAU_hls_2": { "PE": 8, "SIMD": 8, "ram_style": "auto" }, - "MatrixVectorActivation_3": { + "MVAU_hls_3": { "PE": 10, "SIMD": 8, "ram_style": "distributed" }, - 
"LabelSelect_Batch_0": { + "LabelSelect_hls_0": { "PE": 1 } } From b48147e0a6637659a8a7127dd0016edded998ed5 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 11 Oct 2024 10:36:20 +0100 Subject: [PATCH 17/23] [tutorial] Format tutorial README --- tutorials/fpga_flow/README.md | 44 ++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/tutorials/fpga_flow/README.md b/tutorials/fpga_flow/README.md index 2aaad0423b..71f2a2a625 100644 --- a/tutorials/fpga_flow/README.md +++ b/tutorials/fpga_flow/README.md @@ -25,20 +25,29 @@ This demo was created using Vivado 2022.1. Prior to running, insure the following prerequisites have been met: - Install FINN and prerequisites. The [Getting Started](https://finn.readthedocs.io/en/latest/getting_started.html#quickstart) section of the FINN documentation might be helpful for this. - Ensure you have the `FINN_XILINX_PATH` and `FINN_XILINX_VERSION` env variables set appropriately for your install. For example: -> export FINN_XILINX_PATH=/opt/Xilinx -> export FINN_XILINX_VERSION=2022.1 +```shell +export FINN_XILINX_PATH=/opt/Xilinx +export FINN_XILINX_VERSION=2022.1 +``` + - Set the env variable for your `finn` install top directory (where you cloned the FINN compiler repo): -> export FINN_ROOT=/home/foo/finn +```shell +export FINN_ROOT=/home/foo/finn +``` Then, change to `finn` install directory and invoke the build as follows: -> cd ${FINN_ROOT} -> ./run-docker.sh build_custom ${FINN_ROOT}/tutorials/fpga_flow/ +```shell +cd ${FINN_ROOT} +./run-docker.sh build_custom ${FINN_ROOT}/tutorials/fpga_flow/ +``` Alternatively, since the tutorials folder is already part of the FINN compiler installation, you can invoke it from within the Docker container: -> cd ${FINN_ROOT} -> ./run-docker.sh -> cd tutorials/fpga_flow -> python build.py +```shell +cd ${FINN_ROOT} +./run-docker.sh +cd tutorials/fpga_flow +python build.py +``` The build should finish in about 10 minutes, and the FINN docker will 
close on success. @@ -59,12 +68,14 @@ The build should finish in about 10 minutes, and the FINN docker will close on s ### Examine the Stitched IP Navigate to the stitched IP project directory: - -> cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip +```shell +cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip +``` And, open the project: - -> vivado finn_vivado_stitch_proj.xpr +```shell +vivado finn_vivado_stitch_proj.xpr +``` Explore the IPI board design and note the interfaces. @@ -89,9 +100,10 @@ them under `${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim`. Let's ex the FINN compiler. Used for launching the testbench simulation. You can now launch the simulation as follows: - -> cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim -> vivado -mode gui -source make_sim_proj.tcl +```shell +cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim +vivado -mode gui -source make_sim_proj.tcl +``` The simulation should complete with: From f6acf7075b3af97719edd3705f1268f0d357e0fa Mon Sep 17 00:00:00 2001 From: Alexander Hornburg Date: Wed, 23 Oct 2024 17:42:26 +0100 Subject: [PATCH 18/23] [Infra] support passing arguments to build_custom flow --- run-docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run-docker.sh b/run-docker.sh index b1fe44eb0c..1358337a37 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -142,7 +142,7 @@ elif [ "$1" = "build_custom" ]; then DOCKER_INTERACTIVE="-it" #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build gecho "Running build_custom: $BUILD_CUSTOM_DIR/$FLOW_NAME.py" - DOCKER_CMD="python -mpdb -cc -cq $FLOW_NAME.py" + DOCKER_CMD="python -mpdb -cc -cq $FLOW_NAME.py ${@:4}" elif [ -z "$1" ]; then gecho "Running container only" DOCKER_CMD="bash" From 1d7636b8f8d841eda4e20b6cbd365b4a7257f24d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 23 Oct 2024 17:41:32 +0000 Subject: [PATCH 19/23] Bump onnx from 
1.13.0 to 1.17.0 Bumps [onnx](https://github.com/onnx/onnx) from 1.13.0 to 1.17.0. - [Release notes](https://github.com/onnx/onnx/releases) - [Changelog](https://github.com/onnx/onnx/blob/main/docs/Changelog-ml.md) - [Commits](https://github.com/onnx/onnx/compare/v1.13.0...v1.17.0) --- updated-dependencies: - dependency-name: onnx dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d4ca45cb37..85a0ca1175 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ gspread==3.6.0 importlib-resources==6.1.0 ipython==8.12.2 numpy==1.24.1 -onnx==1.13.0 +onnx==1.17.0 onnxoptimizer onnxruntime==1.16.1 pre-commit==3.3.2 From 14b68b7efa235089bf7e1d8d40416095bcb23e81 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 24 Oct 2024 14:29:36 +0100 Subject: [PATCH 20/23] [Infra] Add no-cache env var for run docker script --- run-docker.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/run-docker.sh b/run-docker.sh index 1358337a37..8bf6440d4f 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -102,6 +102,7 @@ SCRIPTPATH=$(dirname "$SCRIPT") : ${FINN_SINGULARITY=""} : ${FINN_SKIP_XRT_DOWNLOAD=""} : ${FINN_XRT_PATH=""} +: ${FINN_DOCKER_NO_CACHE="0"} DOCKER_INTERACTIVE="" @@ -190,12 +191,18 @@ if [ -d "$FINN_XRT_PATH" ];then export LOCAL_XRT=1 fi +if [ "$FINN_DOCKER_NO_CACHE" = "1" ]; then + export NO_CACHE_STRING="--no-cache" +else + export NO_CACHE_STRING="" +fi + # Build the FINN Docker image if [ "$FINN_DOCKER_PREBUILT" = "0" ] && [ -z "$FINN_SINGULARITY" ]; then # Need to ensure this is done within the finn/ root folder: OLD_PWD=$(pwd) cd $SCRIPTPATH - docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA . 
+ docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA $NO_CACHE_STRING . cd $OLD_PWD fi From 72dcb87f510436d60ad0c370e6b90692ebf5b213 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 24 Oct 2024 14:41:37 +0100 Subject: [PATCH 21/23] [Infra] Re-use build extra env vars to enable no cache option --- run-docker.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/run-docker.sh b/run-docker.sh index 8bf6440d4f..69c998c467 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -192,9 +192,7 @@ if [ -d "$FINN_XRT_PATH" ];then fi if [ "$FINN_DOCKER_NO_CACHE" = "1" ]; then - export NO_CACHE_STRING="--no-cache" -else - export NO_CACHE_STRING="" + FINN_DOCKER_BUILD_EXTRA+="--no-cache" fi # Build the FINN Docker image @@ -202,7 +200,7 @@ if [ "$FINN_DOCKER_PREBUILT" = "0" ] && [ -z "$FINN_SINGULARITY" ]; then # Need to ensure this is done within the finn/ root folder: OLD_PWD=$(pwd) cd $SCRIPTPATH - docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA $NO_CACHE_STRING . + docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA . 
cd $OLD_PWD fi From f0aafa261e7a8f57891ba12cd1572e7d3062bc19 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 24 Oct 2024 15:19:55 +0100 Subject: [PATCH 22/23] [Infra] Add space to no cache var to allow for future extension --- run-docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run-docker.sh b/run-docker.sh index 69c998c467..b59af88eb7 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -192,7 +192,7 @@ if [ -d "$FINN_XRT_PATH" ];then fi if [ "$FINN_DOCKER_NO_CACHE" = "1" ]; then - FINN_DOCKER_BUILD_EXTRA+="--no-cache" + FINN_DOCKER_BUILD_EXTRA+="--no-cache " fi # Build the FINN Docker image From a9f1898deccb74a4f8e38717c5bef00e46c9f70f Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 1 Nov 2024 11:35:04 +0000 Subject: [PATCH 23/23] Use Vivado tclstore from install instead of home --- run-docker.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/run-docker.sh b/run-docker.sh index b59af88eb7..ec55299f6c 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -231,6 +231,9 @@ DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS " # Workaround for FlexLM issue, see: # https://community.flexera.com/t5/InstallAnywhere-Forum/Issues-when-running-Xilinx-tools-or-Other-vendor-tools-in-docker/m-p/245820#M10647 DOCKER_EXEC+="-e LD_PRELOAD=/lib/x86_64-linux-gnu/libudev.so.1 " +# Workaround for running multiple Vivado instances simultaneously, see: +# https://adaptivesupport.amd.com/s/article/63253?language=en_US +DOCKER_EXEC+="-e XILINX_LOCAL_USER_DATA=no " if [ "$FINN_DOCKER_RUN_AS_ROOT" = "0" ] && [ -z "$FINN_SINGULARITY" ];then DOCKER_EXEC+="-v /etc/group:/etc/group:ro " DOCKER_EXEC+="-v /etc/passwd:/etc/passwd:ro "