From 337dced31be208116b196d69855ecf3b731309d9 Mon Sep 17 00:00:00 2001
From: lstasytis <l.stasytis1@gmail.com>
Date: Wed, 18 Sep 2024 10:29:19 +0100
Subject: [PATCH] refactoring and moving log computations to cpp compile side

---
 .../custom_op/fpgadataflow/hls/iodma_hls.py   |  16 +-
 .../hls/streamingdatawidthconverter_hls.py    |  11 +-
 .../streamingdatawidthconverter.py            | 316 ++++++++++-
 tests/fpgadataflow/test_fpgadataflow_dwc.py   | 492 +++++-------------
 4 files changed, 454 insertions(+), 381 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
index eb6fa977ae..0ba7ba974f 100644
--- a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
@@ -236,7 +236,7 @@ def docompute(self):
             raise ValueError("Invalid IODMA direction, please set to in or out")
         # define templates for instantiation
         dma_inst_template = func + "<DataWidth1, NumBytes1>(%s, %s, numReps);"
-        dwc_inst_template = dwc_func + "<%d, %d, %d, %d, %d, %d, %d, %d>(%s, %s, numReps);"
+        dwc_inst_template = dwc_func + "<%d, %d, %d, %d, %d>(%s, %s, numReps);"
         # do stream infrastructure and instantiations
         intfw = self.get_nodeattr("intfWidth")
         strmw = self.get_nodeattr("streamWidth")
@@ -257,10 +257,6 @@ def docompute(self):
             if outWidth > inWidth:
                 totalIters += int(np.floor(outWidth / inWidth) + 1) - 1
 
-            NumInWordsLog = int(np.log2(numInWords) + 1)
-            NumOutWordsLog = int(np.log2(numOutWords) + 1)
-            BufferWidthLog = int(np.log2(inWidth + outWidth) + 1)
-
             # AXI MM -> IODMA -> (DWCs) -> out
             # DWCs depend on AXI MM and out interface width
             if strmw == intfw:
@@ -281,9 +277,6 @@ def docompute(self):
                         outWidth,
                         numInWords,
                         numOutWords,
-                        NumInWordsLog,
-                        NumOutWordsLog,
-                        BufferWidthLog,
                         totalIters,
                         "dma2dwc",
                         "out_" + self.hls_sname(),
@@ -301,10 +294,6 @@ def docompute(self):
             if outWidth > inWidth:
                 totalIters += int(np.floor(outWidth / inWidth) + 1) - 1
 
-            NumInWordsLog = int(np.log2(numInWords) + 1)
-            NumOutWordsLog = int(np.log2(numOutWords) + 1)
-            BufferWidthLog = int(np.log2(inWidth + outWidth) + 1)
-
             # in0 -> (DWCs) -> IODMA -> AXI MM
             # DWCs depend on AXI MM and out interface width
             if strmw == intfw:
@@ -324,9 +313,6 @@ def docompute(self):
                         outWidth,
                         numInWords,
                         numOutWords,
-                        NumInWordsLog,
-                        NumOutWordsLog,
-                        BufferWidthLog,
                         totalIters,
                         "in0_" + self.hls_sname(),
                         "dwc2dma",
diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py
index 94f54939bc..81f43c3315 100644
--- a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py
@@ -41,7 +41,7 @@
 
 
 class StreamingDataWidthConverter_hls(StreamingDataWidthConverter, HLSBackend):
-    """Class that corresponds to finn-hlslib StreamingDataWidthConverter_Batch
+    """Class that corresponds to finn-hlslib StreamingDataWidthConverterGeneralized_Batch
     function."""
 
     def get_nodeattr_types(self):
@@ -77,18 +77,12 @@ def defines(self, var):
         if outWidth > inWidth:
             totalIters += int(np.floor(outWidth / inWidth) + 1) - 1
 
-        NumInWordsLog = int(np.log2(numInWords) + 1)
-        NumOutWordsLog = int(np.log2(numOutWords) + 1)
-        BufferWidthLog = int(np.log2(inWidth + outWidth) + 1)
 
         self.code_gen_dict["$DEFINES$"] = [
             "#define InWidth %d " % inWidth,
             "#define OutWidth %d " % outWidth,
             "#define NumInWords %d " % numInWords,
             "#define NumOutWords %d " % numOutWords,
-            "#define NumInWordsLog %d " % NumInWordsLog,
-            "#define NumOutWordsLog %d " % NumOutWordsLog,
-            "#define BufferWidthLog %d " % BufferWidthLog,
             "#define totalIters %d " % totalIters,
             "#define numReps %d" % numReps,
         ]
@@ -109,11 +103,10 @@ def strm_decl(self):
 
     def docompute(self):
         # TODO continue with fxns below, they are copy-pasted
-        op = "StreamingDataWidthConverter_Batch"
+        op = "StreamingDataWidthConverterGeneralized_Batch"
 
         self.code_gen_dict["$DOCOMPUTE$"] = [
             "%s<InWidth, OutWidth, NumInWords,NumOutWords," % op
-            + "NumInWordsLog, NumOutWordsLog, BufferWidthLog,"
             + " totalIters>(in0_%s, out_%s, numReps);" % (self.hls_sname(), self.hls_sname())
         ]
 
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py
index 3b670e0241..37dbead02c 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py
@@ -49,7 +49,6 @@ def get_nodeattr_types(self):
             # bit width of input and output streams
             "inWidth": ("i", True, 0),
             "outWidth": ("i", True, 0),
-            "generalized_variant": ("i", True, 1),
             # FINN DataTypes for inputs/outputs
             "dataType": ("s", True, ""),
         }
@@ -241,4 +240,194 @@ def get_exp_cycles(self):
         exp_cycles = words + min_words
     
         return int(exp_cycles)
-    
\ No newline at end of file
+
+    def prepare_kwargs_for_characteristic_fx(self):
+        """Collect the stream parameters used by the analytical characterization.
+
+        Returns:
+            tuple: ``(numInWords, numOutWords, inWidth, outWidth, numReps)``, the
+            order in which ``characteristic_fx_input`` and
+            ``characteristic_fx_output`` unpack their ``kwargs`` argument.
+        """
+        folded_ishape = self.get_folded_input_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        # word counts come from the second-to-last folded dimension,
+        # the repetition count from the leading one
+        numInWords = int(np.prod(folded_ishape[-2:-1]))
+        numOutWords = int(np.prod(folded_oshape[-2:-1]))
+        numReps = int(np.prod(folded_ishape[:1]))
+
+        inWidth = self.get_nodeattr("inWidth")
+        outWidth = self.get_nodeattr("outWidth")
+
+        return (numInWords, numOutWords, inWidth, outWidth, numReps)
+
+    def characteristic_fx_input(self, txns, cycles, counter, kwargs):
+        """Append the modelled input-side transaction trace for one period.
+
+        The current value of ``counter`` is appended to ``txns`` once per
+        modelled clock cycle. ``counter`` advances on every cycle except the
+        single trailing cycle that closes each repetition.
+
+        Args:
+            txns: list of per-cycle counter values, extended in place.
+            cycles: cycle count at entry.
+            counter: counter value at entry.
+            kwargs: tuple as returned by ``prepare_kwargs_for_characteristic_fx``.
+
+        Returns:
+            tuple: the updated ``(txns, cycles, counter)``.
+        """
+        (numInWords, _numOutWords, _inWidth, _outWidth, numReps) = kwargs
+
+        # caps the number of counter increments made by one call
+        tracker = 0
+        maximum = numReps * numInWords
+
+        if numReps > 1:
+            # loop windup
+            for _ in range(2):
+                txns.append(counter)
+                counter += 1
+                cycles += 1
+                tracker += 1
+
+        for _ in range(numReps):
+            for _ in range(numInWords):
+                if tracker < maximum:
+                    txns.append(counter)
+                    counter += 1
+                    cycles += 1
+                    tracker += 1
+            # trailing cycle of the repetition: counter is held
+            txns.append(counter)
+            cycles += 1
+
+        return txns, cycles, counter
+
+    def characteristic_fx_output(self, txns, cycles, counter, kwargs):
+        """Append the modelled output-side transaction trace for one period.
+
+        The current value of ``counter`` is appended to ``txns`` once per
+        modelled clock cycle; ``counter`` advances once per output word.
+
+        Args:
+            txns: list of per-cycle counter values, extended in place.
+            cycles: cycle count at entry.
+            counter: counter value at entry.
+            kwargs: tuple as returned by ``prepare_kwargs_for_characteristic_fx``.
+
+        Returns:
+            tuple: the updated ``(txns, cycles, counter)``.
+        """
+        (numInWords, numOutWords, inWidth, outWidth, numReps) = kwargs
+
+        # HYPER PARAMETERS WHICH MAY CHANGE
+        # (windup lengths, clamped to the available word counts)
+        windup_clocks_up_convert_input = min(3, numInWords)
+        windup_clocks_equal_convert_output = min(2, numOutWords)
+
+        # padding applies when the widths do not divide evenly and the input
+        # carries no more bits than the output
+        higher = max(inWidth, outWidth)
+        lower = min(inWidth, outWidth)
+        pad = higher % lower != 0 and numInWords * inWidth <= numOutWords * outWidth
+
+        # windup period
+        if inWidth == outWidth:
+            clock = windup_clocks_equal_convert_output
+        else:
+            clock = windup_clocks_up_convert_input
+        for _ in range(clock):
+            txns.append(counter)
+            cycles += 1
+
+        # running bit balance: +inWidth per modelled input word,
+        # -outWidth per emitted output word
+        remainder = 0
+        words_per_output = outWidth // inWidth
+
+        for _ in range(numReps):
+            # windup
+            txns.append(counter)
+            cycles += 1
+
+            for _ in range(numOutWords):
+                for j in range(words_per_output):
+                    if j != 0:
+                        txns.append(counter)
+                        cycles += 1
+                    remainder += inWidth
+
+                if pad and remainder < outWidth:
+                    txns.append(counter)
+                    remainder += inWidth
+                    cycles += 1
+
+                txns.append(counter)
+                cycles += 1
+
+                counter += 1
+                remainder -= outWidth
+
+        return txns, cycles, counter
+
+    def derive_characteristic_fxns(self, period):
+        """Derive the input/output characteristic functions of this node.
+
+        If the ``ipgen_ignore`` attribute is 0 the derivation is delegated to the
+        parent class, passing a zero-filled ``in0`` list as
+        ``override_rtlsim_dict``. Otherwise the traces are computed analytically
+        over two periods and stored in the ``io_chrc_*`` node attributes.
+
+        Args:
+            period: number of cycles in one characterization period.
+        """
+        n_inps = int(np.prod(self.get_folded_input_shape()[:-1]))
+        io_dict = {
+            "inputs": {
+                "in0": [0] * n_inps,
+            },
+            "outputs": {"out": []},
+        }
+
+        ignore = self.get_nodeattr("ipgen_ignore")
+        if ignore == 0:  # this node is being derived using RTLSIM
+            # RTL-based flow
+            super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+            return
+
+        # Analytical flow
+        self.set_nodeattr("io_chrc_period", period)
+        kwargs = self.prepare_kwargs_for_characteristic_fx()
+
+        def two_period_trace(characteristic_fx):
+            """Trace two consecutive periods, holding the counter to fill each.
+
+            Returns the per-cycle trace and the total number of fill entries.
+            """
+            txns = []
+            counter = 0
+            padding = 0
+            for period_start in (0, period):
+                txns, cycles, counter = characteristic_fx(txns, period_start, counter, kwargs)
+                # fill the rest of this period with the held counter value
+                n_pads = period_start + period - cycles
+                txns += [counter] * n_pads
+                padding += n_pads
+            return txns, padding
+
+        # INPUT
+        txn_in, pads_in = two_period_trace(self.characteristic_fx_input)
+        all_txns_in = np.empty((len(io_dict["inputs"]), 2 * period), dtype=np.int32)
+        all_txns_in[0, :] = np.array(txn_in)
+        self.set_nodeattr("io_chrc_in", all_txns_in)
+        self.set_nodeattr("io_chrc_pads_in", pads_in)
+
+        # OUTPUT
+        txn_out, pads_out = two_period_trace(self.characteristic_fx_output)
+        all_txns_out = np.empty((len(io_dict["outputs"]), 2 * period), dtype=np.int32)
+        all_txns_out[0, :] = np.array(txn_out)
+        self.set_nodeattr("io_chrc_out", all_txns_out)
+        self.set_nodeattr("io_chrc_pads_out", pads_out)
diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py
index 04c0a82b1c..f86c62a9a6 100644
--- a/tests/fpgadataflow/test_fpgadataflow_dwc.py
+++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py
@@ -1,5 +1,4 @@
-# Copyright (C) 2020-2022, Xilinx, Inc.
-# Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -27,365 +26,148 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import pytest
-
 import numpy as np
-import os
-import xml.etree.ElementTree as ET
-from onnx import TensorProto, helper
-from qonnx.core.datatype import DataType
-from qonnx.core.modelwrapper import ModelWrapper
+from onnx import TensorProto
+from onnx import helper as oh
 from qonnx.custom_op.registry import getCustomOp
-from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
-
-import finn.builder.build_dataflow as build
-import finn.builder.build_dataflow_config as build_cfg
-import finn.core.onnx_exec as oxe
-from finn.analysis.fpgadataflow.post_synth_res import post_synth_res
-from finn.core.throughput_test import throughput_test_rtlsim
-from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
-from finn.util.basic import make_build_dir
-from finn.util.fpgadataflow import is_hls_node, is_rtl_node
+from qonnx.transformation.base import Transformation
 
+from finn.util.fpgadataflow import is_fpgadataflow_node
 
-def post_synth_res_dwc(model, override_synth_report_filename=None):
-    """Extracts the FPGA resource results from the Vivado synthesis.
-    This function extras only a DWC from a DWC-only stitched model
 
-    Returns {node name : resources_dict}."""
+def _is_dwc_node(node):
+    return node.op_type.startswith("StreamingDataWidthConverter")
 
-    res_dict = {}
-    if override_synth_report_filename is not None:
-        synth_report_filename = override_synth_report_filename
-    else:
-        synth_report_filename = model.get_metadata_prop("vivado_synth_rpt")
-    if os.path.isfile(synth_report_filename):
-        tree = ET.parse(synth_report_filename)
-        root = tree.getroot()
-        all_cells = root.findall(".//tablecell")
-        # strip all whitespace from table cell contents
-        for cell in all_cells:
-            cell.attrib["contents"] = cell.attrib["contents"].strip()
-    else:
-        raise Exception("Please run synthesis first")
-
-    # TODO build these indices based on table headers instead of harcoding
-    restype_to_ind_default = {
-        "LUT": 2,
-        "SRL": 5,
-        "FF": 6,
-        "BRAM_36K": 7,
-        "BRAM_18K": 8,
-        "DSP48": 9,
-    }
-    restype_to_ind_vitis = {
-        "LUT": 4,
-        "SRL": 7,
-        "FF": 8,
-        "BRAM_36K": 9,
-        "BRAM_18K": 10,
-        "URAM": 11,
-        "DSP48": 12,
-    }
-
-    if model.get_metadata_prop("platform") == "alveo":
-        restype_to_ind = restype_to_ind_vitis
-    else:
-        restype_to_ind = restype_to_ind_default
 
-    def get_instance_stats(inst_name):
-        row = root.findall(".//*[@contents='%s']/.." % inst_name)
-        if row != []:
-            node_dict = {}
-            row = list(row[0])
-            for restype, ind in restype_to_ind.items():
-                node_dict[restype] = int(row[ind].attrib["contents"])
-            return node_dict
+def _suitable_node(node):
+    if node is not None:
+        if is_fpgadataflow_node(node):
+            if _is_dwc_node(node):
+                # no DWC for DWCs
+                return False
+            elif node.op_type == "IODMA_hls":
+                # IODMA data shapes/widths need special handling
+                return False
+            else:
+                return True
         else:
-            return None
-
-    # global (top-level) stats, including shell etc.
-    top_dict = get_instance_stats("(top)")
-    if top_dict is not None:
-        res_dict["(top)"] = top_dict
-
-    for node in model.graph.node:
-        if node.op_type == "StreamingDataflowPartition":
-            sdp_model = ModelWrapper(getCustomOp(node).get_nodeattr("model"))
-            sdp_res_dict = post_synth_res(sdp_model, synth_report_filename)
-            res_dict.update(sdp_res_dict)
-        elif is_hls_node(node) or is_rtl_node(node):
-            node_dict = get_instance_stats(
-                f"top_StreamingDataflowPartition_1_0_StreamingDataflowPartition_1_StreamingDataflowPartition_1_StreamingDataWidthConverter_hls_0_0"
-            )
-            if node_dict is not None:
-                res_dict[node.name] = node_dict
-
-    return res_dict
-
-
-def make_single_dwc_modelwrapper(in_shape, out_shape, inWidth, outWidth, finn_dtype):
-    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, in_shape)
-    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, out_shape)
-
-    optype = "StreamingDataWidthConverter"
-
-    DWC_node = helper.make_node(
-        optype,
-        ["inp"],
-        ["outp"],
-        domain="finn.custom_op.fpgadataflow",
-        backend="fpgadataflow",
-        in_shape=in_shape,
-        out_shape=out_shape,
-        inWidth=inWidth,
-        outWidth=outWidth,
-        preferred_impl_style="hls",
-        generalized_variant=True,
-        dataType=str(finn_dtype.name),
-    )
-
-    graph = helper.make_graph(nodes=[DWC_node], name="dwc_graph", inputs=[inp], outputs=[outp])
-
-    model = qonnx_make_model(graph, producer_name="dwc-model")
-    model = ModelWrapper(model)
-
-    model.set_tensor_datatype("inp", finn_dtype)
-    model.set_tensor_datatype("outp", finn_dtype)
-
-    return model
-
-
-def prepare_inputs(input_tensor, dt):
-    return {"inp": input_tensor}
-
-
-@pytest.mark.parametrize(
-    "config",
-    [
-        ([1, 2, 2, 1680], [1, 2, 2, 1680], 70, 240, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 2, 2, 1680], [1, 2, 2, 1680], 240, 70, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1680], [1, 1680], 70, 240, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1680], [1, 1680], 240, 70, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1680], [1, 1680], 35, 280, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1680], [1, 1680], 280, 35, DataType["BIPOLAR"]),  # extra word of padding
-        # requires LCM for old version
-        ([1, 42], [1, 42], 6, 14, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1239], [1, 1239], 21, 59, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1680], [1, 1680], 70, 240, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 42], [1, 42], 14, 6, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1239], [1, 1239], 59, 21, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1680], [1, 1680], 240, 70, DataType["BIPOLAR"]),  # extra word of padding
-        # conversion without needing LCMs
-        ([1, 180], [1, 180], 2, 18, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 720], [1, 720], 8, 72, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 2880], [1, 2880], 32, 288, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 180], [1, 180], 18, 2, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 720], [1, 720], 72, 8, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 2880], [1, 2880], 288, 32, DataType["BIPOLAR"]),  # extra word of padding
-        # passthrough
-        ([1, 100], [1, 100], 10, 10, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 400], [1, 400], 40, 40, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1600], [1, 1600], 160, 160, DataType["BIPOLAR"]),  # extra word of padding
-    ],
-)
-@pytest.mark.parametrize("exec_mode", ["rtlsim", "cppsim"])
-@pytest.mark.fpgadataflow
-@pytest.mark.slow
-@pytest.mark.vivado
-def test_fpgadataflow_dwc(config, exec_mode):
-    in_shape, out_shape, inWidth, outWidth, finn_dtype = config
-
-    test_fpga_part = "xc7z020clg400-1"
-    # generate input data
-    x = gen_finn_dt_tensor(finn_dtype, in_shape)
-    input_dict = prepare_inputs(x, finn_dtype)
-
-    model = make_single_dwc_modelwrapper(in_shape, out_shape, inWidth, outWidth, finn_dtype)
-    # verify abstraction level execution
-    y = oxe.execute_onnx(model, input_dict)["outp"]
-
-    assert y.shape == tuple(out_shape), """The output shape is incorrect."""
-    # remove padding if it was performed
-    y = y.reshape(1, np.prod(y.shape))
-    x = x.reshape(1, np.prod(x.shape))
-
-    if y.shape[-1] > x.shape[-1]:
-        y = y[0, : x.shape[-1]]
-    else:
-        x = x[0, : y.shape[-1]]
-
-    assert (
-        y == x
-    ).all(), """The output values are not the same as the
-        input values anymore."""
-
-    model = model.transform(SpecializeLayers(test_fpga_part))
-    model = model.transform(GiveUniqueNodeNames())
-    if exec_mode == "cppsim":
-        model = model.transform(PrepareCppSim())
-        model = model.transform(CompileCppSim())
-        model = model.transform(SetExecMode("cppsim"))
-    elif exec_mode == "rtlsim":
-        model = model.transform(GiveUniqueNodeNames())
-        model = model.transform(PrepareIP(test_fpga_part, 5))
-        model = model.transform(HLSSynthIP())
-        model = model.transform(SetExecMode("rtlsim"))
-        model = model.transform(PrepareRTLSim())
-    y = oxe.execute_onnx(model, input_dict)["outp"]
-
-    assert y.shape == tuple(out_shape), """The output shape is incorrect."""
-
-    # remove padding if it was performed
-    y = y.reshape(1, np.prod(y.shape))
-    x = x.reshape(1, np.prod(x.shape))
-
-    if y.shape[-1] > x.shape[-1]:
-        y = y[0, : x.shape[-1]]
-    else:
-        x = x[0, : y.shape[-1]]
-
-    # cpp sim assert fails for BIPOLAR data type, but not RTL.
-    if (finn_dtype != DataType["BIPOLAR"]) or (
-        finn_dtype != DataType["BIPOLAR"] and exec_mode != "cppsim"
-    ):
-        assert (
-            y == x
-        ).all(), """The output values are not the same as the
-            input values anymore."""
+            return False
     else:
-        assert True  # we
-
-
-@pytest.mark.parametrize(
-    "config",
-    [
-        ([1, 840], [1, 840], 35, 120, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 840], [1, 840], 120, 35, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1680], [1, 1680], 35, 280, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1680], [1, 1680], 280, 35, DataType["BIPOLAR"]),  # extra word of padding
-        # requires LCM for old version
-        ([1, 42], [1, 42], 6, 14, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1239], [1, 1239], 21, 59, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1680], [1, 1680], 70, 240, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 42], [1, 42], 14, 6, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1239], [1, 1239], 59, 21, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1680], [1, 1680], 240, 70, DataType["BIPOLAR"]),  # extra word of padding
-        # conversion without needing LCMs
-        ([1, 180], [1, 180], 2, 18, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 720], [1, 720], 8, 72, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 2880], [1, 2880], 32, 288, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 180], [1, 180], 18, 2, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 720], [1, 720], 72, 8, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 2880], [1, 2880], 288, 32, DataType["BIPOLAR"]),  # extra word of padding
-        # passthrough
-        ([1, 100], [1, 100], 10, 10, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 400], [1, 400], 40, 40, DataType["BIPOLAR"]),  # extra word of padding
-        ([1, 1600], [1, 1600], 160, 160, DataType["BIPOLAR"]),  # extra word of padding
-    ],
-)
-@pytest.mark.fpgadataflow
-@pytest.mark.slow
-@pytest.mark.parametrize("measure_resources", [False])
-@pytest.mark.parametrize("measure_functionality", [False])
-@pytest.mark.parametrize("measure_performance", [False])
-@pytest.mark.parametrize("test_type", ["new"])
-@pytest.mark.vivado
-def test_fpgadataflow_dwc_stitched_rtlsim(
-    config, measure_resources, measure_functionality, measure_performance, test_type
-):
-    in_shape, out_shape, inWidth, outWidth, finn_dtype = config
-
-    test_fpga_part = "xc7z020clg400-1"
-    target_clk_ns = 4
-    # generate input data
-    x = gen_finn_dt_tensor(finn_dtype, in_shape)
-    input_dict = prepare_inputs(x, finn_dtype)
-
-    test_name = "dwc_res_tests_{inWidth}_{outWidth}"
-
-    build_dir = os.environ["FINN_BUILD_DIR"]
-
-    build_dir = build_dir + "/test_model/"
-    if not os.path.isdir(build_dir):
-        build_dir = make_build_dir(prefix="dwc_performance_testing_")
-
-    model = make_single_dwc_modelwrapper(in_shape, out_shape, inWidth, outWidth, finn_dtype)
-    model = model.transform(SpecializeLayers(test_fpga_part))
-    model_dir = f"{build_dir}/dwc_res_tests_{inWidth}_{outWidth}"
-    model_file = f"{model_dir}/model.onnx"
-    model.save(model_dir)
-
-    final_output_dir = build_dir
-
-    # Delete previous run results if exist
-    # if os.path.exists(final_output_dir):
-    #     shutil.rmtree(final_output_dir)
-    #     print("Previous run results deleted!")
-
-    cfg = build.DataflowBuildConfig(
-        output_dir=final_output_dir,
-        mvau_wwidth_max=80,
-        target_fps=1000000,
-        synth_clk_period_ns=target_clk_ns,
-        board="Pynq-Z1",
-        # board               = "U250",
-        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
-        generate_outputs=[
-            # build_cfg.DataflowOutputType.STITCHED_IP,
-            #    build_cfg.DataflowOutputType.OOC_SYNTH,
-            build_cfg.DataflowOutputType.BITFILE,
-            #    build_cfg.DataflowOutputType.PYNQ_DRIVER,
-            #    build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
-        ],
-    )
-    build.build_dataflow_cfg(model_dir, cfg)
-
-    model.set_metadata_prop("rtlsim_so", "")
-    model.set_metadata_prop("exec_mode", "rtlsim")
-    res = post_synth_res_dwc(model, f"{final_output_dir}/report/post_synth_resources.xml")
-    res = res[""]
-    build_dir = os.environ["FINN_BUILD_DIR"]
-    build_dir += f"/dwc_performance_testing_{test_type}"
-    lut = res["LUT"]
-    ff = res["FF"]
-    target_clk = int(np.round(1000 / target_clk_ns))
-    with open(f"{build_dir}/measurements.txt", "a+") as f:
-        f.writelines(f"{target_clk}\t{inWidth}\t{outWidth}\tnew_hls\t{lut}\t{ff}\n")
-
-    # with open(f"{build_dir}_new_DWC_res.txt", 'a+') as f:
-    #   f.write(res) # here filter to only what we care about
-    print(f"{target_clk}\t{inWidth}\t{outWidth}\tnew_hls\t{lut}\t{ff}\n")
-
-    # assert True == False
-
-    if measure_functionality:
-        y = oxe.execute_onnx(model, input_dict)["outp"]
-
-        assert y.shape == tuple(out_shape), """The output shape is incorrect."""
-
-        # remove padding if it was performed
-        y = y.reshape(1, np.prod(y.shape))
-        x = x.reshape(1, np.prod(x.shape))
-
-        if y.shape[-1] > x.shape[-1]:
-            y = y[0, : x.shape[-1]]
-        else:
-            x = x[0, : y.shape[-1]]
-
-        assert (
-            y == x
-        ).all(), """The output values are not the same as the
-            input values anymore."""
-
-    if measure_performance:
-        rtlsim_bs = 50
-        res = throughput_test_rtlsim(model, rtlsim_bs)
-        print(f"Performance for {in_shape, out_shape,inWidth,outWidth} :", res)
+        return False
+
+
+class InsertDWC(Transformation):
+    """Add data width converters between layers where necessary.
+
+    A DWC is inserted where folded stream widths or element counts differ.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def apply(self, model):
+        """Insert DWC nodes into model.graph and return (model, graph_modified).
+
+        graph_modified is True when at least one DWC was inserted in this pass.
+        """
+        graph = model.graph
+        graph_modified = False
+        for node_ind, n in enumerate(graph.node):
+            if not _suitable_node(n):
+                continue
+            for output_name in n.output:
+                consumers = model.find_consumers(output_name)
+                if not consumers:  # None or empty list: nothing to connect to
+                    continue
+                assert len(consumers) == 1, (
+                    n.name + ": HW node with fan-out higher than 1 cannot be stitched"
+                )
+                consumer = consumers[0]
+                if _suitable_node(consumer) is not True:
+                    continue
+                n0 = getCustomOp(n)
+                n1 = getCustomOp(consumer)
+                n0_out_shape = n0.get_folded_output_shape()
+                # in some special cases, we need to get folded shapes of
+                # non-default inputs for the consumer
+                # - if FC and external mem, it could be connected to input 1
+                # - if concat, could be connected to any input
+                if (
+                    consumer.op_type.startswith("MVAU")
+                    and n1.get_nodeattr("mem_mode") == "external"
+                ) or consumer.op_type.startswith("StreamingConcat"):
+                    # get input idx (the last matching input wins)
+                    in_idx = None
+                    for idx, n_input in enumerate(consumer.input):
+                        if output_name == n_input:
+                            in_idx = idx
+                    assert in_idx is not None, "Malformed model"
+                    n1_in_shape = n1.get_folded_input_shape(in_idx)
+                else:
+                    # use default folded input shape
+                    n1_in_shape = n1.get_folded_input_shape()
+
+                # insert the DWC if either the widths mismatch
+                # (use DWC for folding conversion)
+                # or if the total element counts differ (use DWC for padding & cropping)
+                same_elem_count = np.prod(n0_out_shape) == np.prod(n1_in_shape)
+                if n0_out_shape[-1] == n1_in_shape[-1] and same_elem_count:
+                    continue
+                graph_modified = True
+                # determine dwc inwidth and outwidth
+                dwc_in_width = n0.get_outstream_width()
+                dwc_out_width = n1.get_instream_width()
+                node_optype = "StreamingDataWidthConverter"
+
+                wide = max(dwc_in_width, dwc_out_width)
+                narrow = min(dwc_in_width, dwc_out_width)
+                if wide % narrow == 0 and same_elem_count:
+                    # the widths divide one another and no padding or cropping
+                    # is happening, thus we can use the optimal RTL variant
+                    style = "rtl"
+                else:
+                    # either complex width conversion or padding/cropping
+                    # are involved, so we use the generalized HLS variant
+                    style = "hls"
+                # determine dtype for dwc
+                dtype = n0.get_output_datatype()
+                n1_dtype = n1.get_input_datatype()
+                assert dtype == n1_dtype, (
+                    f"Neighboring node datatypes are Incompatible ({dtype}) != ({n1_dtype})"
+                )
+
+                # determine shapes for dwc; the generalized version allows them
+                # to differ and will either pad or crop depending on the
+                # difference in elements sent and requested
+                in_shape = n0.get_normal_output_shape()
+                out_shape = n1.get_normal_input_shape()
+
+                dwc_output_tensor = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    out_shape,
+                )
+                graph.value_info.append(dwc_output_tensor)
+
+                dwc_node = oh.make_node(
+                    node_optype,
+                    [output_name],
+                    [dwc_output_tensor.name],
+                    domain="finn.custom_op.fpgadataflow",
+                    backend="fpgadataflow",
+                    in_shape=in_shape,
+                    out_shape=out_shape,
+                    inWidth=dwc_in_width,
+                    outWidth=dwc_out_width,
+                    preferred_impl_style=style,
+                    dataType=str(dtype.name),
+                )
+                # insert dwc
+                graph.node.insert(node_ind + 1, dwc_node)
+
+                # set dwc output tensor as new input tensor of second node
+                for idx, inp in enumerate(consumer.input):
+                    if inp == output_name:
+                        consumer.input[idx] = dwc_output_tensor.name
+
+        return (model, graph_modified)