diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index e14181b140..25a2032aeb 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -529,6 +529,60 @@ def apply(self, model):
         graph = model.graph
         node_ind = 0
         graph_modified = False
+        # first check whether the global input is consumed by multiple nodes
+        successors = model.find_consumers(graph.input[0].name)
+        dt = model.get_tensor_datatype(graph.input[0].name)
+        if successors is not None and len(successors) >= 2 and dt.is_integer():
+            output_tensor = graph.input[0].name
+            n_outputs = len(successors)
+            dt = model.get_tensor_datatype(output_tensor)
+
+            # create clone tensors
+            out_shape = model.get_tensor_shape(output_tensor)
+            out_tensor_clones = []
+            for i in range(n_outputs):
+                clone = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
+                )
+                model.graph.value_info.append(clone)
+                out_tensor_clones += [clone.name]
+
+            num_ch = int(out_shape[-1])
+            vecs = out_shape[:-1]
+
+            # create node with no parallelization first
+            pe = 1
+
+            dup_node = helper.make_node(
+                "DuplicateStreams",
+                [output_tensor],
+                out_tensor_clones,
+                domain="finn.custom_op.fpgadataflow",
+                backend="fpgadataflow",
+                NumChannels=num_ch,
+                PE=pe,
+                inputDataType=dt.name,
+                numInputVectors=vecs,
+                NumOutputStreams=n_outputs,
+                outFIFODepths=[2] * n_outputs,
+                name="DuplicateStreams_" + output_tensor,
+            )
+
+            graph.node.insert(0, dup_node)
+
+            # connect each successor to its output tensor clone
+            clone_idx = 0
+            for successor in successors:
+                for i, succ_input in enumerate(successor.input):
+                    if succ_input == output_tensor:
+                        successor.input[i] = out_tensor_clones[clone_idx]
+                        clone_idx += 1
+                        # if one node has multiple connections to the same output,
+                        # find_direct_successors returns one node per input,
+                        # so breaking the inner loop gives the correct behaviour
+                        break
+            graph_modified = True
+
         for node in graph.node:
             node_ind += 1
             successors = model.find_consumers(node.output[0])
@@ -1197,8 +1251,8 @@ def apply(self, model):
 
 
 class InferStreamingEltwise(Transformation):
-    """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer
-    with SubEltwise or AbsDiffEltwise op."""
+    """Convert eltwise Add, Sub or Sub -> Abs to StreamingEltwise layer
+    with AddEltwise, SubEltwise or AbsDiffEltwise op."""
 
     def apply(self, model):
         graph = model.graph
@@ -1206,7 +1260,7 @@
         graph_modified = False
         for node in graph.node:
             node_ind += 1
-            if node.op_type == "Sub":
+            if node.op_type in ["Sub", "Add"]:
                 in0 = node.input[0]
                 in1 = node.input[1]
                 result = node.output[0]
@@ -1230,14 +1284,15 @@
                 if not (idt0.is_integer() and idt1.is_integer()):
                     continue
 
-                eltwiseOp = "Sub"
+                eltwiseOp = node.op_type
                 nodes_to_remove = [node]
-                # look for a downstream Abs node
-                res_consumer = model.find_consumer(result)
-                if (res_consumer is not None) and (res_consumer.op_type == "Abs"):
-                    eltwiseOp = "AbsDiff"
-                    result = res_consumer.output[0]
-                    nodes_to_remove.append(res_consumer)
+                if node.op_type == "Sub":
+                    # look for a downstream Abs node
+                    res_consumer = model.find_consumer(result)
+                    if (res_consumer is not None) and (res_consumer.op_type == "Abs"):
+                        eltwiseOp = "AbsDiff"
+                        result = res_consumer.output[0]
+                        nodes_to_remove.append(res_consumer)
 
                 # check layout and convert if necessary
                 in0_layout = model.get_tensor_layout(in0)
@@ -1438,6 +1493,9 @@ def apply(self, model):
             if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None:
                 mm_input = n.input[0]
                 mm_weight = n.input[1]
+                # if mm_weight is not constant, skip node
+                if model.get_initializer(n.input[1]) is None:
+                    continue
                 mm_output = n.output[0]
                 mm_in_shape = model.get_tensor_shape(mm_input)
                 mm_out_shape = model.get_tensor_shape(mm_output)
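Note: the first hunk makes the transform in this file (InferDuplicateStreamsLayer, per the surrounding code) also duplicate the global input when it feeds two or more consumers; previously only tensors appearing as node outputs were considered, so a fork at the graph input was silently left untouched. A minimal, hypothetical repro sketch of that situation follows; the model, tensor names, and constants are illustrative and not part of this patch:

    import numpy as np
    from onnx import TensorProto, helper
    from qonnx.core.datatype import DataType
    from qonnx.core.modelwrapper import ModelWrapper

    from finn.transformation.fpgadataflow.convert_to_hw_layers import (
        InferDuplicateStreamsLayer,
    )

    # graph input "inp" is forked: it feeds two Add nodes directly
    shape = [1, 4]
    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
    out0 = helper.make_tensor_value_info("out0", TensorProto.FLOAT, shape)
    out1 = helper.make_tensor_value_info("out1", TensorProto.FLOAT, shape)
    add0 = helper.make_node("Add", ["inp", "c0"], ["out0"])
    add1 = helper.make_node("Add", ["inp", "c1"], ["out1"])
    graph = helper.make_graph([add0, add1], "fork", [inp], [out0, out1])
    model = ModelWrapper(helper.make_model(graph))
    model.set_initializer("c0", np.ones(shape, dtype=np.float32))
    model.set_initializer("c1", np.ones(shape, dtype=np.float32))
    # the new branch only fires for integer input datatypes
    model.set_tensor_datatype("inp", DataType["INT8"])

    model = model.transform(InferDuplicateStreamsLayer())
    print([n.op_type for n in model.graph.node])
    # expected: a DuplicateStreams node inserted at position 0, with each
    # Add rewired to consume its own clone of "inp"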
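Note: with the InferStreamingEltwise changes above, the op-to-layer mapping becomes the following (sketch in comment form; both operands must still be dynamic integer tensors, otherwise the node is skipped as before):

    # graph pattern           resulting eltwiseOp attribute
    # Sub(a, b)           ->  "Sub"
    # Abs(Sub(a, b))      ->  "AbsDiff"  (the downstream Abs node is absorbed)
    # Add(a, b)           ->  "Add"      (new in this patch)
    #
    # the Abs fusion is only meaningful after a Sub, which is why the
    # find_consumer lookup moved under `if node.op_type == "Sub":`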
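Note: the last hunk in this file (presumably InferQuantizedMatrixVectorActivation, judging by the sparsity check) guards against MatMuls whose second input is computed at runtime rather than stored as a weight. A minimal, hypothetical illustration of the case that is now skipped:

    from onnx import TensorProto, helper
    from qonnx.core.modelwrapper import ModelWrapper

    # x @ y with both inputs dynamic, as in an attention product;
    # there is no stored weight tensor to turn into a hardware layer
    x = helper.make_tensor_value_info("x", TensorProto.FLOAT, [1, 8])
    y = helper.make_tensor_value_info("y", TensorProto.FLOAT, [8, 8])
    z = helper.make_tensor_value_info("z", TensorProto.FLOAT, [1, 8])
    mm = helper.make_node("MatMul", ["x", "y"], ["z"])
    graph = helper.make_graph([mm], "dyn_matmul", [x, y], [z])
    model = ModelWrapper(helper.make_model(graph))

    print(model.get_initializer(mm.input[1]))  # None -> `continue` is taken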
diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
index 323e391df4..92a9731c2a 100644
--- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py
+++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
@@ -537,11 +537,12 @@ def _calculate_thresholds(self):
 
             for t in range(num_thresholds):
                 thresholds[c][t] = min_threshold[c] + step[c] * t
 
-        # ToDo: The index 1 needs to be changed to -1 for the channels last format
+        # currently only per-tensor or per-channel quantization is supported
         num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1]
-        final_shape = (num_output_channels, num_thresholds)
-        if thresholds.shape != final_shape:
-            thresholds = np.broadcast_to(thresholds, final_shape)
+        assert (
+            thresholds.shape[0] == 1 or thresholds.shape[0] == num_output_channels
+        ), """Quant node cannot be converted to MultiThreshold because only
+        per-tensor or per-channel quantization is supported."""
 
         return thresholds
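Note: the qonnx_activation_handlers change replaces a silent np.broadcast_to with an explicit shape check: thresholds must be per-tensor (leading dimension 1) or per-channel (leading dimension equal to the number of output channels). A small, self-contained sketch of what the new assertion accepts and rejects; the channel and bit-width numbers are made up:

    import numpy as np

    num_output_channels = 4
    num_thresholds = 15  # e.g. a 4-bit unsigned activation: 2**4 - 1 steps

    candidates = [
        np.zeros((1, num_thresholds)),                    # per-tensor: ok
        np.zeros((num_output_channels, num_thresholds)),  # per-channel: ok
        np.zeros((2, num_thresholds)),                    # group-wise: rejected
    ]
    for t in candidates:
        ok = t.shape[0] == 1 or t.shape[0] == num_output_channels
        print(t.shape, "accepted" if ok else "raises AssertionError")

The removed broadcast_to silently expanded per-tensor thresholds to the per-channel shape; keeping them at shape (1, num_thresholds) works because MultiThreshold broadcasts per-tensor thresholds itself, while anything that is neither per-tensor nor per-channel now fails loudly instead.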