Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compiler integration of the Split operation #1194

Open
wants to merge 10 commits into
base: dev
Choose a base branch
from
2 changes: 1 addition & 1 deletion fetch-repos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4"
PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3"
HLSLIB_COMMIT="2c066e87f5b8d309693c5d46c206473ca20ac68c"
OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a"
AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
Expand Down
2 changes: 2 additions & 0 deletions src/finn/custom_op/fpgadataflow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from finn.custom_op.fpgadataflow.lookup import Lookup
from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU
from finn.custom_op.fpgadataflow.pool import Pool
from finn.custom_op.fpgadataflow.split import StreamingSplit
from finn.custom_op.fpgadataflow.streamingdataflowpartition import (
StreamingDataflowPartition,
)
Expand Down Expand Up @@ -77,6 +78,7 @@
custom_op["Lookup"] = Lookup
custom_op["Pool"] = Pool
custom_op["StreamingConcat"] = StreamingConcat
custom_op["StreamingSplit"] = StreamingSplit
custom_op["StreamingDataWidthConverter"] = StreamingDataWidthConverter
custom_op["StreamingEltwise"] = StreamingEltwise
custom_op["StreamingMaxPool"] = StreamingMaxPool
Expand Down
2 changes: 2 additions & 0 deletions src/finn/custom_op/fpgadataflow/hls/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls
from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MVAU_hls
from finn.custom_op.fpgadataflow.hls.pool_hls import Pool_hls
from finn.custom_op.fpgadataflow.hls.split_hls import StreamingSplit_hls
from finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls import (
StreamingDataWidthConverter_hls,
)
Expand Down Expand Up @@ -71,6 +72,7 @@
custom_op["Lookup_hls"] = Lookup_hls
custom_op["Pool_hls"] = Pool_hls
custom_op["StreamingConcat_hls"] = StreamingConcat_hls
custom_op["StreamingSplit_hls"] = StreamingSplit_hls
custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls
custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls
custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls
Expand Down
278 changes: 278 additions & 0 deletions src/finn/custom_op/fpgadataflow/hls/split_hls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
# Copyright (C) 2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of FINN nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import numpy as np
import os

from finn.custom_op.fpgadataflow import templates
from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
from finn.custom_op.fpgadataflow.split import StreamingSplit
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy


class StreamingSplit_hls(StreamingSplit, HLSBackend):
    """Streaming split node with dynamically generated HLS.
    Only supports splitting along the last axis."""

    def __init__(self, onnx_node, **kwargs):
        super().__init__(onnx_node, **kwargs)

    def get_nodeattr_types(self):
        """Return the node attribute types, merged from both parent classes."""
        my_attrs = {}
        my_attrs.update(StreamingSplit.get_nodeattr_types(self))
        my_attrs.update(HLSBackend.get_nodeattr_types(self))
        return my_attrs

    def execute_node(self, context, graph):
        """Execute this node (cppsim or rtlsim) and store all outputs in the
        execution context.

        Parameters:
        * context: dict mapping tensor names to numpy arrays
        * graph: the ONNX GraphProto this node belongs to (unused here, part of
          the common execute_node interface)

        Raises an Exception if the exec_mode attribute is set to anything
        other than "cppsim" or "rtlsim".
        """
        mode = self.get_nodeattr("exec_mode")
        node = self.onnx_node
        ishape = self.get_normal_input_shape()
        folded_ishape = self.get_folded_input_shape()
        n_outputs = self.get_n_outputs()
        # use n_outputs consistently (it must match len(node.output))
        exp_oshapes = [self.get_normal_output_shape(i) for i in range(n_outputs)]
        export_idt = self.get_input_datatype()

        if mode == "cppsim":
            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
        elif mode == "rtlsim":
            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
        else:
            raise Exception(
                """Invalid value for attribute exec_mode! Is currently set to: {}
            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
                    mode
                )
            )

        inp = context[node.input[0]]
        assert str(inp.dtype) == "float32", "Input datatype is not float32"
        assert inp.shape == ishape, "Input shape mismatch for " + node.input[0]
        # reshape input into folded form
        inp = inp.reshape(folded_ishape)
        # make copy before saving array
        reshaped_input = inp.copy()
        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)

        if mode == "cppsim":
            # execute the precompiled model; it writes one npy file per output
            super().exec_precompiled_singlenode_model()
            # load output npy files into the execution context
            super().npy_to_dynamic_outputs(
                context, ["output_%d.npy" % i for i in range(n_outputs)]
            )
        else:
            # rtlsim mode (mode was validated above)
            sim = self.get_rtlsim()
            io_dict = {"inputs": {}, "outputs": {}}

            nbits = self.get_instream_width()
            rtlsim_inp = npy_to_rtlsim_input(
                "%s/input_0.npy" % code_gen_dir,
                export_idt,
                nbits,
            )
            io_dict["inputs"]["in0"] = rtlsim_inp
            super().reset_rtlsim(sim)
            super().toggle_clk(sim)

            for i in range(n_outputs):
                io_dict["outputs"]["out_arr_%d" % i] = []
            # stream names in the generated HLS use a bare "_" separator
            self.rtlsim_multi_io(sim, io_dict, sname="_")
            odt = self.get_output_datatype()
            target_bits = odt.bitwidth()
            packed_bits = self.get_outstream_width()
            for i in range(n_outputs):
                out_npy_path = "%s/output_%d.npy" % (code_gen_dir, i)
                out_shape = self.get_folded_output_shape(i)
                rtlsim_output_to_npy(
                    io_dict["outputs"]["out_arr_%d" % i],
                    out_npy_path,
                    odt,
                    out_shape,
                    packed_bits,
                    target_bits,
                )
                # load and reshape output to the expected (normal) shape
                output = np.load(out_npy_path)
                output = np.asarray([output], dtype=np.float32).reshape(*exp_oshapes[i])
                context[node.output[i]] = output

        # single shape check for both execution modes (previously duplicated,
        # with an error message that always claimed "cppsim")
        for i in range(n_outputs):
            assert (
                context[node.output[i]].shape == exp_oshapes[i]
            ), "{} did not produce expected output shape. Got: {}, expected: {}".format(
                mode, context[node.output[i]].shape, exp_oshapes[i]
            )

    def code_generation_cppsim(self, model):
        """Generates c++ code for simulation (cppsim)."""
        node = self.onnx_node
        path = self.get_nodeattr("code_gen_dir_cppsim")
        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
        self.generate_params(model, path)
        self.global_includes()
        self.defines("cppsim")
        self.read_npy_data()
        self.strm_decl()
        self.pragmas()
        self.docompute()
        self.dataoutstrm()
        self.save_as_npy()
        # the timeout variant of the template is used because the split
        # writes multiple output streams that are drained incrementally
        self.timeout_value()
        self.timeout_condition()
        self.timeout_read_stream()

        template = templates.docompute_template_timeout

        for key in self.code_gen_dict:
            # transform list into long string separated by '\n'
            code_gen_line = "\n".join(self.code_gen_dict[key])
            template = template.replace(key, code_gen_line)
        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
        f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w")
        f.write(template)
        f.close()
        self.code_gen_dict.clear()

    def global_includes(self):
        """Include the hlslib split header."""
        self.code_gen_dict["$GLOBALS$"] = ['#include "split.hpp"']

    def defines(self, var):
        """Emit the NUM_OUTPUTS macro used by stream array declarations."""
        self.code_gen_dict["$DEFINES$"] = ["#define NUM_OUTPUTS " + str(self.get_n_outputs())]

    def read_npy_data(self):
        """Emit code that loads input_0.npy into the in0 vector stream."""
        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
        npy_type = "float"
        self.code_gen_dict["$READNPYDATA$"] = []
        simd = self.get_nodeattr("SIMD")
        input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str()
        npy_in = "%s/input_0.npy" % code_gen_dir
        self.code_gen_dict["$READNPYDATA$"].append(
            'npy2vectorstream<%s, %s, %d>("%s", in0);'
            % (input_elem_hls_type, npy_type, simd, npy_in)
        )

    def strm_decl(self):
        """Declare the input stream and the output/debug stream arrays."""
        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
        simd = self.get_nodeattr("SIMD")
        input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str()
        stream_name = "in0"
        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
            'hls::stream<hls::vector<%s, %d>> %s ("%s");'
            % (input_elem_hls_type, simd, stream_name, stream_name)
        )
        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
            "hls::stream<hls::vector<{}, {}>> out_arr[NUM_OUTPUTS];".format(
                self.get_output_datatype().get_hls_datatype_str(), simd
            )
        )
        # debug streams buffer drained outputs for npy dumping (see
        # timeout_read_stream / dataoutstrm)
        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
            "hls::stream<hls::vector<{}, {}>> debug_out_arr[NUM_OUTPUTS];".format(
                self.get_output_datatype().get_hls_datatype_str(), simd
            )
        )

    def docompute(self):
        """Emit the StreamingSplit HLS function call, templated on the
        per-output fold counts."""
        n_outputs = self.get_n_outputs()
        output_folds = [str(self.get_folded_output_shape(i)[-2]) for i in range(n_outputs)]
        out_stream_folds = ", ".join(output_folds)
        comp_call = "StreamingSplit<{}>(in0, out_arr);".format(out_stream_folds)
        self.code_gen_dict["$DOCOMPUTE$"] = [comp_call]

    def dataoutstrm(self):
        """Emit code that dumps each debug output stream to output_<i>.npy."""
        npy_type = "float"
        simd = self.get_nodeattr("SIMD")
        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
        n_outputs = self.get_n_outputs()
        self.code_gen_dict["$DATAOUTSTREAM$"] = []
        for i in range(n_outputs):
            oshape = self.get_folded_output_shape(i)
            oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
            npy_out = "%s/output_%d.npy" % (code_gen_dir, i)
            self.code_gen_dict["$DATAOUTSTREAM$"].append(
                'vectorstream2npy<%s, %s, %d>(debug_out_arr[%d], %s, "%s");'
                % (
                    self.get_output_datatype(i).get_hls_datatype_str(),
                    npy_type,
                    simd,
                    i,
                    oshape_cpp_str,
                    npy_out,
                )
            )

    def blackboxfunction(self):
        """Emit the top-level HLS function signature (one input stream, an
        array of NUM_OUTPUTS output streams)."""
        input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str()
        simd = self.get_nodeattr("SIMD")
        in_stream = "hls::stream<hls::vector<%s, %d>> &in0" % (input_elem_hls_type, simd)
        out_streams = "hls::stream<hls::vector<%s, %d>> (&out_arr)[NUM_OUTPUTS]" % (
            input_elem_hls_type,
            simd,
        )
        blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_stream, out_streams)
        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls]

    def pragmas(self):
        """Emit AXI-stream interface and bit-packing pragmas for all ports."""
        pragmas = []
        pragmas.append("#pragma HLS INTERFACE axis port=in0")
        for i in range(self.get_n_outputs()):
            pragmas.append("#pragma HLS INTERFACE axis port=out_arr[%d]" % i)
        pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return")
        pragmas.append("#pragma HLS aggregate variable=in0 compact=bit")
        for i in range(self.get_n_outputs()):
            pragmas.append("#pragma HLS aggregate variable=out_arr[%d] compact=bit" % i)
        self.code_gen_dict["$PRAGMAS$"] = pragmas

    def timeout_condition(self):
        """Cppsim keeps polling while ALL output streams are empty."""
        condition = []
        for i in range(self.get_n_outputs()):
            condition.append("out_arr[{}].empty()".format(i))
        condition = " && ".join(condition)
        self.code_gen_dict["$TIMEOUT_CONDITION$"] = [condition]

    def timeout_read_stream(self):
        """Drain each non-empty output stream into its debug stream."""
        read_stream_command = """
        for(int i = 0; i < NUM_OUTPUTS; i++){
            if(!out_arr[i].empty())
                debug_out_arr[i] << out_arr[i].read();
        }
        """
        self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [read_stream_command]
14 changes: 14 additions & 0 deletions src/finn/custom_op/fpgadataflow/hlsbackend.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,3 +474,17 @@ def get_ap_int_max_w(self):
ret = max([instream, outstream])
assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret
return ret

def timeout_value(self):
"""Set timeout value for HLS functions defined for one clock cycle"""
self.code_gen_dict["$TIMEOUT_VALUE$"] = ["100"]

def timeout_condition(self):
"""Set timeout condition for HLS functions defined for one clock cycle"""
self.code_gen_dict["$TIMEOUT_CONDITION$"] = ["out_{}.empty()".format(self.hls_sname())]

def timeout_read_stream(self):
"""Set reading output stream procedure for HLS functions defined for one clock cycle"""
self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [
"debug_out_{} << out_{}.read();".format(self.hls_sname(), self.hls_sname())
]
4 changes: 2 additions & 2 deletions src/finn/custom_op/fpgadataflow/hwcustomop.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,11 +284,11 @@ def rtlsim(self, sim, inp, inp2=None):
sim.stop_vcd_trace()
return outputs

def rtlsim_multi_io(self, sim, io_dict):
def rtlsim_multi_io(self, sim, io_dict, sname=None):
"Run rtlsim for this node, supports multiple i/o streams."

# signal name
sname = "_" + self.hls_sname() + "_"
sname = "_" + self.hls_sname() + "_" if sname is None else sname

trace_file = self.get_nodeattr("rtlsim_trace")
if trace_file == "default":
Expand Down
Loading