From 337dced31be208116b196d69855ecf3b731309d9 Mon Sep 17 00:00:00 2001 From: lstasytis Date: Wed, 18 Sep 2024 10:29:19 +0100 Subject: [PATCH] refactoring and moving log computations to cpp compile side --- .../custom_op/fpgadataflow/hls/iodma_hls.py | 16 +- .../hls/streamingdatawidthconverter_hls.py | 11 +- .../streamingdatawidthconverter.py | 316 ++++++++++- tests/fpgadataflow/test_fpgadataflow_dwc.py | 492 +++++------------- 4 files changed, 454 insertions(+), 381 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py index eb6fa977ae..0ba7ba974f 100644 --- a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py @@ -236,7 +236,7 @@ def docompute(self): raise ValueError("Invalid IODMA direction, please set to in or out") # define templates for instantiation dma_inst_template = func + "(%s, %s, numReps);" - dwc_inst_template = dwc_func + "<%d, %d, %d, %d, %d, %d, %d, %d>(%s, %s, numReps);" + dwc_inst_template = dwc_func + "<%d, %d, %d, %d, %d>(%s, %s, numReps);" # do stream infrastructure and instantiations intfw = self.get_nodeattr("intfWidth") strmw = self.get_nodeattr("streamWidth") @@ -257,10 +257,6 @@ def docompute(self): if outWidth > inWidth: totalIters += int(np.floor(outWidth / inWidth) + 1) - 1 - NumInWordsLog = int(np.log2(numInWords) + 1) - NumOutWordsLog = int(np.log2(numOutWords) + 1) - BufferWidthLog = int(np.log2(inWidth + outWidth) + 1) - # AXI MM -> IODMA -> (DWCs) -> out # DWCs depend on AXI MM and out interface width if strmw == intfw: @@ -281,9 +277,6 @@ def docompute(self): outWidth, numInWords, numOutWords, - NumInWordsLog, - NumOutWordsLog, - BufferWidthLog, totalIters, "dma2dwc", "out_" + self.hls_sname(), @@ -301,10 +294,6 @@ def docompute(self): if outWidth > inWidth: totalIters += int(np.floor(outWidth / inWidth) + 1) - 1 - NumInWordsLog = int(np.log2(numInWords) + 1) - NumOutWordsLog = int(np.log2(numOutWords) + 1) - BufferWidthLog = int(np.log2(inWidth + outWidth) + 1) - # in0 -> (DWCs) -> IODMA -> AXI MM # DWCs depend on AXI MM and out interface width if strmw == intfw: @@ -324,9 +313,6 @@ def docompute(self): outWidth, numInWords, numOutWords, - NumInWordsLog, - NumOutWordsLog, - BufferWidthLog, totalIters, "in0_" + self.hls_sname(), "dwc2dma", diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py index 94f54939bc..81f43c3315 100644 --- a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py @@ -41,7 +41,7 @@ class StreamingDataWidthConverter_hls(StreamingDataWidthConverter, HLSBackend): - """Class that corresponds to finn-hlslib StreamingDataWidthConverter_Batch + """Class that corresponds to finn-hlslib StreamingDataWidthConverterGeneralized_Batch function.""" def get_nodeattr_types(self): @@ -77,18 +77,12 @@ def defines(self, var): if outWidth > inWidth: totalIters += int(np.floor(outWidth / inWidth) + 1) - 1 - NumInWordsLog = int(np.log2(numInWords) + 1) - NumOutWordsLog = int(np.log2(numOutWords) + 1) - BufferWidthLog = int(np.log2(inWidth + outWidth) + 1) self.code_gen_dict["$DEFINES$"] = [ "#define InWidth %d " % inWidth, "#define OutWidth %d " % outWidth, "#define NumInWords %d " % numInWords, "#define NumOutWords %d " % numOutWords, - "#define NumInWordsLog %d " % NumInWordsLog, - "#define NumOutWordsLog %d " % NumOutWordsLog, - "#define BufferWidthLog %d " % BufferWidthLog, "#define totalIters %d " % totalIters, "#define numReps %d" % numReps, ] @@ -109,11 +103,10 @@ def strm_decl(self): def docompute(self): # TODO continue with fxns below, they are copy-pasted - op = "StreamingDataWidthConverter_Batch" + op = "StreamingDataWidthConverterGeneralized_Batch" self.code_gen_dict["$DOCOMPUTE$"] = [ "%s(in0_%s, out_%s, numReps);" % (self.hls_sname(), self.hls_sname()) ] diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py index 3b670e0241..37dbead02c 100644 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py +++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py @@ -49,7 +49,6 @@ def get_nodeattr_types(self): # bit width of input and output streams "inWidth": ("i", True, 0), "outWidth": ("i", True, 0), - "generalized_variant": ("i", True, 1), # FINN DataTypes for inputs/outputs "dataType": ("s", True, ""), } @@ -241,4 +240,317 @@ def get_exp_cycles(self): exp_cycles = words + min_words return int(exp_cycles) - \ No newline at end of file + + + def prepare_kwargs_for_characteristic_fx(self): + + numInWords = int(np.prod(self.get_folded_input_shape()[-2:-1])) + numOutWords = int(np.prod(self.get_folded_output_shape()[-2:-1])) + numReps = int(np.prod(self.get_folded_input_shape()[:1])) + + inWidth = self.get_nodeattr("inWidth") + outWidth = self.get_nodeattr("outWidth") + + + + kwargs = (numInWords,numOutWords,inWidth,outWidth,numReps) + + # assert True==False + return kwargs + + + + def characteristic_fx_input(self, txns, cycles, counter, kwargs): + + (numInWords,numOutWords,inWidth,outWidth,numReps) = kwargs + + + + + # HYPER PARAMETERS WHICH MAY CHANGE OVER TIME + windup_clocks_up_convert_input = 4 + + + windup_clocks_down_convert_input = 3 + + + windup_clocks_down_convert_output = 4 + windup_clocks_equal_convert_output = 3 + + + + if numInWords < windup_clocks_up_convert_input: + windup_clocks_up_convert_input = numInWords + + if numInWords < windup_clocks_down_convert_input: + windup_clocks_down_convert_input = numInWords + + + + if numOutWords < windup_clocks_down_convert_output: + windup_clocks_down_convert_output = numOutWords + + + + if numOutWords < windup_clocks_equal_convert_output: + windup_clocks_equal_convert_output = numOutWords + + + # calculation to adjust for padding or cropping adding latency + + + if outWidth > inWidth: + higher = outWidth + lower = inWidth + else: + higher = inWidth + lower = outWidth + + if higher % lower != 0: + if numInWords*inWidth > numOutWords*outWidth: + crop = True + pad = False + else: + cropping = False + pad = True + + else: + crop = False + pad = False + + + # first input period + tracker = 0 + maximum = numReps*numInWords + + if numReps > 1: + # loop windup + for i in range(2): + txns.append(counter) + counter+=1 + cycles+=1 + tracker+=1 + + for j in range(0,numReps): + for i in range(0,numInWords): + if tracker < maximum: + txns.append(counter) + counter+=1 + cycles+=1 + tracker+=1 + for i in range(0,1): + txns.append(counter) + cycles+=1 + + return txns, cycles, counter + + + + def characteristic_fx_output(self, txns, cycles, counter, kwargs): + + (numInWords,numOutWords,inWidth,outWidth,numReps) = kwargs + + + + + + # HYPER PARAMETERS WHICH MAY CHANGE + windup_clocks_up_convert_input = 3 + windup_clocks_down_convert_input = 2 + + + windup_clocks_down_convert_output = 3 + windup_clocks_equal_convert_output = 2 + + + + if numInWords < windup_clocks_up_convert_input: + windup_clocks_up_convert_input = numInWords + + if numInWords < windup_clocks_down_convert_input: + windup_clocks_down_convert_input = numInWords + + + + if numOutWords < windup_clocks_down_convert_output: + windup_clocks_down_convert_output = numOutWords + + + + if numOutWords < windup_clocks_equal_convert_output: + windup_clocks_equal_convert_output = numOutWords + + + + + # calculation to adjust for padding or cropping adding latency + + + if outWidth > inWidth: + higher = outWidth + lower = inWidth + else: + higher = inWidth + lower = outWidth + + if higher % lower != 0: + if numInWords*inWidth > numOutWords*outWidth: + crop = True + pad = False + else: + cropping = False + pad = True + + else: + crop = False + pad = False + + + + # windup period + if inWidth == outWidth: + clock = windup_clocks_equal_convert_output + else: + clock = windup_clocks_up_convert_input + for i in range(0,clock): + txns.append(counter) + cycles+=1 + # padding +=1 + + # first input period + + if pad: + offset = 2 + else: + offset = 1 + + + remainder = 0 + + + for k in range(numReps): + + # windup + txns.append(counter) + cycles+=1 + + for i in range(0,numOutWords): + for j in range(0,int(np.floor(outWidth/inWidth))): + if j != 0: + txns.append(counter) + cycles +=1 + remainder += inWidth + # padding +=1 + + + + if pad and remainder < outWidth: + print(remainder) + txns.append(counter) + remainder += inWidth + cycles +=1 + + txns.append(counter) + cycles +=1 + + counter+=1 + remainder -= outWidth + + + return txns, cycles, counter + + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + + ignore = self.get_nodeattr("ipgen_ignore") + if ignore == 0: # this node is being derived using RTLSIM + # RTL-based flow + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + return + + + + # Analytical flow + + txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in" in key} + txns_out = {key: [] for (key, value) in io_dict["outputs"].items() if "out" in key} + + all_txns_in = np.empty((len(txns_in.keys()), 2 * period), dtype=np.int32) + all_txns_out = np.empty((len(txns_out.keys()), 2 * period), dtype=np.int32) + + + self.set_nodeattr("io_chrc_period",period) + + + + + txn_in = [] + txn_out = [] + + + # INPUT + + counter = 0 + padding = 0 + + + kwargs = self.prepare_kwargs_for_characteristic_fx() + + + # first period + cycles = 0 + txn_in, cycles, counter = self.characteristic_fx_input(txn_in,cycles,counter,kwargs) + + txn_in += [counter] * (period-cycles) + padding+=(period*-cycles) + + + # second period + cycles = period + txn_in, cycles, counter = self.characteristic_fx_input(txn_in,cycles,counter,kwargs) + + + #for i in range(cycles,period*2): + # txn_in.append(counter) + #pads = (period*2-cycles) + + txn_in += [counter] * (period*2-cycles) + padding+=(period*2-cycles) + + # final assignments + all_txns_in[0, :] = np.array(txn_in) + self.set_nodeattr("io_chrc_in", all_txns_in) + self.set_nodeattr("io_chrc_pads_in", padding) + + + # OUTPUT + + counter = 0 + cycles = 0 + padding = 0 + + + txn_out, cycles, counter = self.characteristic_fx_output(txn_out,cycles,counter,kwargs) + + + txn_out += [counter] * (period-cycles) + padding += (period*-cycles) + + cycles = period + + txn_out, cycles, counter = self.characteristic_fx_output(txn_out,cycles,counter,kwargs) + + txn_out += [counter] * (period*2-cycles) + padding+=(period*2-cycles) + + + all_txns_out[0, :] = np.array(txn_out) + self.set_nodeattr("io_chrc_out", all_txns_out) + self.set_nodeattr("io_chrc_pads_out", padding) diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index 04c0a82b1c..f86c62a9a6 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -1,5 +1,4 @@ -# Copyright (C) 2020-2022, Xilinx, Inc. -# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,365 +26,148 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pytest - import numpy as np -import os -import xml.etree.ElementTree as ET -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper +from onnx import TensorProto +from onnx import helper as oh from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model - -import finn.builder.build_dataflow as build -import finn.builder.build_dataflow_config as build_cfg -import finn.core.onnx_exec as oxe -from finn.analysis.fpgadataflow.post_synth_res import post_synth_res -from finn.core.throughput_test import throughput_test_rtlsim -from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -from finn.util.basic import make_build_dir -from finn.util.fpgadataflow import is_hls_node, is_rtl_node +from qonnx.transformation.base import Transformation +from finn.util.fpgadataflow import is_fpgadataflow_node -def post_synth_res_dwc(model, override_synth_report_filename=None): - """Extracts the FPGA resource results from the Vivado synthesis. - This function extras only a DWC from a DWC-only stitched model - Returns {node name : resources_dict}.""" +def _is_dwc_node(node): + return node.op_type.startswith("StreamingDataWidthConverter") - res_dict = {} - if override_synth_report_filename is not None: - synth_report_filename = override_synth_report_filename - else: - synth_report_filename = model.get_metadata_prop("vivado_synth_rpt") - if os.path.isfile(synth_report_filename): - tree = ET.parse(synth_report_filename) - root = tree.getroot() - all_cells = root.findall(".//tablecell") - # strip all whitespace from table cell contents - for cell in all_cells: - cell.attrib["contents"] = cell.attrib["contents"].strip() - else: - raise Exception("Please run synthesis first") - - # TODO build these indices based on table headers instead of harcoding - restype_to_ind_default = { - "LUT": 2, - "SRL": 5, - "FF": 6, - "BRAM_36K": 7, - "BRAM_18K": 8, - "DSP48": 9, - } - restype_to_ind_vitis = { - "LUT": 4, - "SRL": 7, - "FF": 8, - "BRAM_36K": 9, - "BRAM_18K": 10, - "URAM": 11, - "DSP48": 12, - } - - if model.get_metadata_prop("platform") == "alveo": - restype_to_ind = restype_to_ind_vitis - else: - restype_to_ind = restype_to_ind_default - def get_instance_stats(inst_name): - row = root.findall(".//*[@contents='%s']/.." % inst_name) - if row != []: - node_dict = {} - row = list(row[0]) - for restype, ind in restype_to_ind.items(): - node_dict[restype] = int(row[ind].attrib["contents"]) - return node_dict +def _suitable_node(node): + if node is not None: + if is_fpgadataflow_node(node): + if _is_dwc_node(node): + # no DWC for DWCs + return False + elif node.op_type == "IODMA_hls": + # IODMA data shapes/widths need special handling + return False + else: + return True else: - return None - - # global (top-level) stats, including shell etc. - top_dict = get_instance_stats("(top)") - if top_dict is not None: - res_dict["(top)"] = top_dict - - for node in model.graph.node: - if node.op_type == "StreamingDataflowPartition": - sdp_model = ModelWrapper(getCustomOp(node).get_nodeattr("model")) - sdp_res_dict = post_synth_res(sdp_model, synth_report_filename) - res_dict.update(sdp_res_dict) - elif is_hls_node(node) or is_rtl_node(node): - node_dict = get_instance_stats( - f"top_StreamingDataflowPartition_1_0_StreamingDataflowPartition_1_StreamingDataflowPartition_1_StreamingDataWidthConverter_hls_0_0" - ) - if node_dict is not None: - res_dict[node.name] = node_dict - - return res_dict - - -def make_single_dwc_modelwrapper(in_shape, out_shape, inWidth, outWidth, finn_dtype): - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, in_shape) - outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, out_shape) - - optype = "StreamingDataWidthConverter" - - DWC_node = helper.make_node( - optype, - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - in_shape=in_shape, - out_shape=out_shape, - inWidth=inWidth, - outWidth=outWidth, - preferred_impl_style="hls", - generalized_variant=True, - dataType=str(finn_dtype.name), - ) - - graph = helper.make_graph(nodes=[DWC_node], name="dwc_graph", inputs=[inp], outputs=[outp]) - - model = qonnx_make_model(graph, producer_name="dwc-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", finn_dtype) - model.set_tensor_datatype("outp", finn_dtype) - - return model - - -def prepare_inputs(input_tensor, dt): - return {"inp": input_tensor} - - -@pytest.mark.parametrize( - "config", - [ - ([1, 2, 2, 1680], [1, 2, 2, 1680], 70, 240, DataType["BIPOLAR"]), # extra word of padding - ([1, 2, 2, 1680], [1, 2, 2, 1680], 240, 70, DataType["BIPOLAR"]), # extra word of padding - ([1, 1680], [1, 1680], 70, 240, DataType["BIPOLAR"]), # extra word of padding - ([1, 1680], [1, 1680], 240, 70, DataType["BIPOLAR"]), # extra word of padding - ([1, 1680], [1, 1680], 35, 280, DataType["BIPOLAR"]), # extra word of padding - ([1, 1680], [1, 1680], 280, 35, DataType["BIPOLAR"]), # extra word of padding - # requires LCM for old version - ([1, 42], [1, 42], 6, 14, DataType["BIPOLAR"]), # extra word of padding - ([1, 1239], [1, 1239], 21, 59, DataType["BIPOLAR"]), # extra word of padding - ([1, 1680], [1, 1680], 70, 240, DataType["BIPOLAR"]), # extra word of padding - ([1, 42], [1, 42], 14, 6, DataType["BIPOLAR"]), # extra word of padding - ([1, 1239], [1, 1239], 59, 21, DataType["BIPOLAR"]), # extra word of padding - ([1, 1680], [1, 1680], 240, 70, DataType["BIPOLAR"]), # extra word of padding - # conversion without needing LCMs - ([1, 180], [1, 180], 2, 18, DataType["BIPOLAR"]), # extra word of padding - ([1, 720], [1, 720], 8, 72, DataType["BIPOLAR"]), # extra word of padding - ([1, 2880], [1, 2880], 32, 288, DataType["BIPOLAR"]), # extra word of padding - ([1, 180], [1, 180], 18, 2, DataType["BIPOLAR"]), # extra word of padding - ([1, 720], [1, 720], 72, 8, DataType["BIPOLAR"]), # extra word of padding - ([1, 2880], [1, 2880], 288, 32, DataType["BIPOLAR"]), # extra word of padding - # passthrough - ([1, 100], [1, 100], 10, 10, DataType["BIPOLAR"]), # extra word of padding - ([1, 400], [1, 400], 40, 40, DataType["BIPOLAR"]), # extra word of padding - ([1, 1600], [1, 1600], 160, 160, DataType["BIPOLAR"]), # extra word of padding - ], -) -@pytest.mark.parametrize("exec_mode", ["rtlsim", "cppsim"]) -@pytest.mark.fpgadataflow -@pytest.mark.slow -@pytest.mark.vivado -def test_fpgadataflow_dwc(config, exec_mode): - in_shape, out_shape, inWidth, outWidth, finn_dtype = config - - test_fpga_part = "xc7z020clg400-1" - # generate input data - x = gen_finn_dt_tensor(finn_dtype, in_shape) - input_dict = prepare_inputs(x, finn_dtype) - - model = make_single_dwc_modelwrapper(in_shape, out_shape, inWidth, outWidth, finn_dtype) - # verify abstraction level execution - y = oxe.execute_onnx(model, input_dict)["outp"] - - assert y.shape == tuple(out_shape), """The output shape is incorrect.""" - # remove padding if it was performed - y = y.reshape(1, np.prod(y.shape)) - x = x.reshape(1, np.prod(x.shape)) - - if y.shape[-1] > x.shape[-1]: - y = y[0, : x.shape[-1]] - else: - x = x[0, : y.shape[-1]] - - assert ( - y == x - ).all(), """The output values are not the same as the - input values anymore.""" - - model = model.transform(SpecializeLayers(test_fpga_part)) - model = model.transform(GiveUniqueNodeNames()) - if exec_mode == "cppsim": - model = model.transform(PrepareCppSim()) - model = model.transform(CompileCppSim()) - model = model.transform(SetExecMode("cppsim")) - elif exec_mode == "rtlsim": - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(test_fpga_part, 5)) - model = model.transform(HLSSynthIP()) - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(PrepareRTLSim()) - y = oxe.execute_onnx(model, input_dict)["outp"] - - assert y.shape == tuple(out_shape), """The output shape is incorrect.""" - - # remove padding if it was performed - y = y.reshape(1, np.prod(y.shape)) - x = x.reshape(1, np.prod(x.shape)) - - if y.shape[-1] > x.shape[-1]: - y = y[0, : x.shape[-1]] - else: - x = x[0, : y.shape[-1]] - - # cpp sim assert fails for BIPOLAR data type, but not RTL. - if (finn_dtype != DataType["BIPOLAR"]) or ( - finn_dtype != DataType["BIPOLAR"] and exec_mode != "cppsim" - ): - assert ( - y == x - ).all(), """The output values are not the same as the - input values anymore.""" + return False else: - assert True # we - - -@pytest.mark.parametrize( - "config", - [ - ([1, 840], [1, 840], 35, 120, DataType["BIPOLAR"]), # extra word of padding - ([1, 840], [1, 840], 120, 35, DataType["BIPOLAR"]), # extra word of padding - ([1, 1680], [1, 1680], 35, 280, DataType["BIPOLAR"]), # extra word of padding - ([1, 1680], [1, 1680], 280, 35, DataType["BIPOLAR"]), # extra word of padding - # requires LCM for old version - ([1, 42], [1, 42], 6, 14, DataType["BIPOLAR"]), # extra word of padding - ([1, 1239], [1, 1239], 21, 59, DataType["BIPOLAR"]), # extra word of padding - ([1, 1680], [1, 1680], 70, 240, DataType["BIPOLAR"]), # extra word of padding - ([1, 42], [1, 42], 14, 6, DataType["BIPOLAR"]), # extra word of padding - ([1, 1239], [1, 1239], 59, 21, DataType["BIPOLAR"]), # extra word of padding - ([1, 1680], [1, 1680], 240, 70, DataType["BIPOLAR"]), # extra word of padding - # conversion without needing LCMs - ([1, 180], [1, 180], 2, 18, DataType["BIPOLAR"]), # extra word of padding - ([1, 720], [1, 720], 8, 72, DataType["BIPOLAR"]), # extra word of padding - ([1, 2880], [1, 2880], 32, 288, DataType["BIPOLAR"]), # extra word of padding - ([1, 180], [1, 180], 18, 2, DataType["BIPOLAR"]), # extra word of padding - ([1, 720], [1, 720], 72, 8, DataType["BIPOLAR"]), # extra word of padding - ([1, 2880], [1, 2880], 288, 32, DataType["BIPOLAR"]), # extra word of padding - # passthrough - ([1, 100], [1, 100], 10, 10, DataType["BIPOLAR"]), # extra word of padding - ([1, 400], [1, 400], 40, 40, DataType["BIPOLAR"]), # extra word of padding - ([1, 1600], [1, 1600], 160, 160, DataType["BIPOLAR"]), # extra word of padding - ], -) -@pytest.mark.fpgadataflow -@pytest.mark.slow -@pytest.mark.parametrize("measure_resources", [False]) -@pytest.mark.parametrize("measure_functionality", [False]) -@pytest.mark.parametrize("measure_performance", [False]) -@pytest.mark.parametrize("test_type", ["new"]) -@pytest.mark.vivado -def test_fpgadataflow_dwc_stitched_rtlsim( - config, measure_resources, measure_functionality, measure_performance, test_type -): - in_shape, out_shape, inWidth, outWidth, finn_dtype = config - - test_fpga_part = "xc7z020clg400-1" - target_clk_ns = 4 - # generate input data - x = gen_finn_dt_tensor(finn_dtype, in_shape) - input_dict = prepare_inputs(x, finn_dtype) - - test_name = "dwc_res_tests_{inWidth}_{outWidth}" - - build_dir = os.environ["FINN_BUILD_DIR"] - - build_dir = build_dir + "/test_model/" - if not os.path.isdir(build_dir): - build_dir = make_build_dir(prefix="dwc_performance_testing_") - - model = make_single_dwc_modelwrapper(in_shape, out_shape, inWidth, outWidth, finn_dtype) - model = model.transform(SpecializeLayers(test_fpga_part)) - model_dir = f"{build_dir}/dwc_res_tests_{inWidth}_{outWidth}" - model_file = f"{model_dir}/model.onnx" - model.save(model_dir) - - final_output_dir = build_dir - - # Delete previous run results if exist - # if os.path.exists(final_output_dir): - # shutil.rmtree(final_output_dir) - # print("Previous run results deleted!") - - cfg = build.DataflowBuildConfig( - output_dir=final_output_dir, - mvau_wwidth_max=80, - target_fps=1000000, - synth_clk_period_ns=target_clk_ns, - board="Pynq-Z1", - # board = "U250", - shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, - generate_outputs=[ - # build_cfg.DataflowOutputType.STITCHED_IP, - # build_cfg.DataflowOutputType.OOC_SYNTH, - build_cfg.DataflowOutputType.BITFILE, - # build_cfg.DataflowOutputType.PYNQ_DRIVER, - # build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE, - ], - ) - build.build_dataflow_cfg(model_dir, cfg) - - model.set_metadata_prop("rtlsim_so", "") - model.set_metadata_prop("exec_mode", "rtlsim") - res = post_synth_res_dwc(model, f"{final_output_dir}/report/post_synth_resources.xml") - res = res[""] - build_dir = os.environ["FINN_BUILD_DIR"] - build_dir += f"/dwc_performance_testing_{test_type}" - lut = res["LUT"] - ff = res["FF"] - target_clk = int(np.round(1000 / target_clk_ns)) - with open(f"{build_dir}/measurements.txt", "a+") as f: - f.writelines(f"{target_clk}\t{inWidth}\t{outWidth}\tnew_hls\t{lut}\t{ff}\n") - - # with open(f"{build_dir}_new_DWC_res.txt", 'a+') as f: - # f.write(res) # here filter to only what we care about - print(f"{target_clk}\t{inWidth}\t{outWidth}\tnew_hls\t{lut}\t{ff}\n") - - # assert True == False - - if measure_functionality: - y = oxe.execute_onnx(model, input_dict)["outp"] - - assert y.shape == tuple(out_shape), """The output shape is incorrect.""" - - # remove padding if it was performed - y = y.reshape(1, np.prod(y.shape)) - x = x.reshape(1, np.prod(x.shape)) - - if y.shape[-1] > x.shape[-1]: - y = y[0, : x.shape[-1]] - else: - x = x[0, : y.shape[-1]] - - assert ( - y == x - ).all(), """The output values are not the same as the - input values anymore.""" - - if measure_performance: - rtlsim_bs = 50 - res = throughput_test_rtlsim(model, rtlsim_bs) - print(f"Performance for {in_shape, out_shape,inWidth,outWidth} :", res) + return False + + +class InsertDWC(Transformation): + """Add data width converters between layers where necessary.""" + + def __init__(self): + super().__init__() + + def apply(self, model): + graph = model.graph + node_ind = -1 + graph_modified = False + for n in graph.node: + node_ind += 1 + if _suitable_node(n): + for output_name in n.output: + consumers = model.find_consumers(output_name) + if consumers == []: + continue + assert len(consumers) == 1, ( + n.name + ": HW node with fan-out higher than 1 cannot be stitched" + ) + consumer = consumers[0] + if _suitable_node(consumer) is True: + n0 = getCustomOp(n) + n1 = getCustomOp(consumer) + n0_out_shape = n0.get_folded_output_shape() + # in some special cases, we need to get folded shapes of + # non-default inputs for the consumer + # - if FC and external mem, it could be connected to input 1 + # - if concat, could be connected to any input + if ( + consumer.op_type.startswith("MVAU") + and n1.get_nodeattr("mem_mode") == "external" + ) or (consumer.op_type.startswith("StreamingConcat")): + # get input idx + in_idx = None + for idx, n_input in enumerate(consumer.input): + if output_name == n_input: + in_idx = idx + assert in_idx is not None, "Malformed model" + n1_in_shape = n1.get_folded_input_shape(in_idx) + else: + # use default folded input shape + n1_in_shape = n1.get_folded_input_shape() + + # insert the DWC if either the widths missmatch + # (use DWC for folding conversion) + # or if the total element counts differ (use DWC for padding & cropping) + if n0_out_shape[-1] != n1_in_shape[-1] or np.prod(n0_out_shape) != np.prod( + n1_in_shape + ): + graph_modified = True + # determine dwc inwidth + dwc_in_width = n0.get_outstream_width() + # determine dwc outwidth + dwc_out_width = n1.get_instream_width() + node_optype = "StreamingDataWidthConverter" + + if max(dwc_in_width, dwc_out_width) % min( + dwc_in_width, dwc_out_width + ) == 0 and np.prod(n0_out_shape) == np.prod(n1_in_shape): + # the DWC does not need to perform conversions between + # widths which can be divided by one another, + # nor is padding or cropping happening + # thus we can use the optimal RTL variant + style = "rtl" + else: + # either complex width conversion or padding/cropping + # are involved, so we use the generalized HLS variant + style = "hls" + # determine dtype for dwc + dtype = n0.get_output_datatype() + n1_dtype = n1.get_input_datatype() + assert dtype == n1_dtype, f"Neighboring node datatypes are Incompatible ({dtype}) != ({n1_dtype})" + + # determine shapes for dwc + # generalized version allows them to differ + # and will either pad or crop depending + # on the difference in elements sent + # and requested + in_shape = n0.get_normal_output_shape() + out_shape = n1.get_normal_input_shape() + + dwc_output_tensor = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + out_shape, + ) + graph.value_info.append(dwc_output_tensor) + + print(f"inserting DWC_{style}, in_shape={in_shape},out_shape={out_shape},inWidth={dwc_in_width}, outWidth={dwc_out_width}, dtype={str(dtype.name)}") + #if str(dtype.name) == "UINT32": + # assert True == False + + dwc_node = oh.make_node( + node_optype, + [output_name], + [dwc_output_tensor.name], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + in_shape=in_shape, + out_shape=out_shape, + inWidth=dwc_in_width, + outWidth=dwc_out_width, + preferred_impl_style=style, + dataType=str(dtype.name), + ) + # insert dwc + graph.node.insert(node_ind + 1, dwc_node) + + # set dwc output tensor as new input tensor of second node + for idx, inp in enumerate(consumer.input): + if inp == output_name: + consumer.input[idx] = dwc_output_tensor.name + + return (model, graph_modified)