pulp-platform · fischeti · Mar 20, 2024 · Mar 20, 2024 · Mar 20, 2024 · Mar 21, 2024
@@ -4,8 +4,9 @@
 
 {
     num_inputs: 1,
-    input_shape: [32, 4],
-    output_shape: [32, 16],
-    dtype: "FP64",
-    baseline: true
-}
+    input_shape: [64, 16],
+    output_shape: [64, 16],
+    trans_weights: true,
+    dtype: "FP16",
+    implementation: "BASELINE"
+}
@@ -11,15 +11,14 @@
 import json5
 import sys
 import os
-import torch
+import pyflexfloat as ff
 
 sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
 import data_utils  # noqa: E402
 from data_utils import emit_license, \
                        format_struct_definition, format_array_definition, \
-                       format_array_declaration, format_ifdef_wrapper  # noqa: E402
-
-torch.manual_seed(42)
+                       format_array_declaration, format_ifdef_wrapper, \
+                       format_scalar_definition  # noqa: E402
 
 # AXI splits bursts crossing 4KB address boundaries. To minimize
 # the occurrence of these splits the data should be aligned to 4KB
@@ -28,8 +27,8 @@
 
 def golden_model(inputs, weights):
     innermost_dim = len(inputs[0].shape) - 1
-    concat_output = torch.cat(inputs, dim=innermost_dim)
-    linear_output = torch.matmul(concat_output, weights)
+    concat_output = np.concatenate(inputs, axis=innermost_dim)  # join inputs along last axis
+    linear_output = np.matmul(concat_output, weights)  # concatenated inputs times weights
     return linear_output, concat_output
 
 
@@ -38,19 +37,21 @@ def emit_header(section, params):
     input_shape = params['input_shape']
     output_shape = params['output_shape']
     prec = params['dtype']
+    trans_weights = params['trans_weights']
 
     assert input_shape[0] == output_shape[0], 'Inconsistent input and output shapes'
 
-    torch_type = data_utils.torch_type_from_precision_t(prec)
+    ff_desc = data_utils.ff_desc_from_precision_t(prec)
+    ctype = data_utils.ctype_from_precision_t(prec)
 
-    inputs = [torch.rand(*input_shape, requires_grad=False, dtype=torch_type)
-              for _ in range(num_inputs)]
-    weights = torch.rand([input_shape[1]*num_inputs, output_shape[1]],
-                         requires_grad=False, dtype=torch_type)
+    inputs = [ff.array(np.random.rand(*input_shape), ff_desc) for _ in range(num_inputs)]
+    weights = ff.array(np.random.rand(input_shape[1]*num_inputs, output_shape[1]), ff_desc)
     linear_output, concat_output = golden_model(inputs, weights)
 
     ctype = data_utils.ctype_from_precision_t(prec)
 
+    weights = weights.T if trans_weights else weights
+
     layer_cfg = {
         **params,
         'inputs': 'inputs',
@@ -66,6 +67,7 @@ def emit_header(section, params):
     data_str += [format_array_declaration(ctype, 'concat_output', concat_output.shape)]
     data_str += [format_array_declaration(ctype, 'linear_output', linear_output.shape)]
     data_str += [format_array_declaration(ctype, 'weights', weights.shape)]
+    data_str += [format_scalar_definition('uint32_t', 'trans_weights', int(trans_weights))]
     data_str += [format_struct_definition('fused_concat_linear_layer_t', 'layer', layer_cfg)]
     data_str += [format_array_definition(ctype, f'input_{i}', t)
                  for i, t in enumerate(inputs)]

@@ -6,7 +6,6 @@
 # Luca Colagrande <[email protected]>
 
 import sys
-import torch
 from pathlib import Path
 from datagen import golden_model
 
@@ -18,6 +17,7 @@
 class FusedConcatLinearVerifier(Verifier):
 
     OUTPUT_UIDS = ['linear_output']
+    ERR_THRESHOLD = {8: 1e-6, 4: 1e-6, 2: 1e-2, 1: 1e-4}
 
     def __init__(self):
         super().__init__()
@@ -29,6 +29,7 @@ def __init__(self):
             'out_width': 'I',
             'inputs': 'I',
             'weights': 'I',
+            'trans_weights': 'I',
             'concat_output': 'I',
             'linear_output': 'I',
             'dtype': 'I',
@@ -44,16 +45,19 @@ def get_actual_results(self):
         return self.get_output_from_symbol('linear_output', ctype_from_precision_t(self.prec))
 
     def get_expected_results(self):
+        trans_weights = self.get_input_from_symbol('trans_weights', 'uint32_t')[0]
         inputs = [self.get_input_from_symbol(f'input_{i}', ctype_from_precision_t(self.prec))
                   for i in range(self.num_inputs)]
-        inputs = [torch.from_numpy(tensor.reshape(self.input_shape)) for tensor in inputs]
+        inputs = [tensor.reshape(self.input_shape) for tensor in inputs]
         weights = self.get_input_from_symbol('weights', ctype_from_precision_t(self.prec))
-        weights = torch.from_numpy(weights.reshape(self.weights_shape))
+        if trans_weights:  # buffer holds the transposed matrix: view it reversed, then undo
+            weights = weights.reshape(self.weights_shape[::-1]).T
+        weights = weights.reshape(self.weights_shape)  # shape no-op after the transpose above
         output_golden, _ = golden_model(inputs, weights)
-        return output_golden.detach().numpy().flatten()
+        return output_golden.flatten()  # flat, to compare against the raw output symbol
 
     def check_results(self, *args):
-        return super().check_results(*args, rtol=1E-6)
+        return super().check_results(*args, rtol=self.ERR_THRESHOLD[self.prec])  # rtol by prec
 
 
 if __name__ == "__main__":

@@ -23,12 +23,13 @@ typedef struct {
     uint32_t num_inputs;
     uint32_t input_shape[2];
     uint32_t output_shape[2];
+    uint32_t trans_weights;
     void **inputs;
     void *weights;
     void *concat_output;
     void *linear_output;
     precision_t dtype;
-    uint32_t baseline;
+    implementation_t implementation;
 } fused_concat_linear_layer_t;
 
 static inline int fused_concat_linear_baseline(fused_concat_linear_layer_t l) {
@@ -41,12 +42,16 @@ static inline int fused_concat_linear_baseline(fused_concat_linear_layer_t l) {
         .dtype = l.dtype};
     int nerr = concat_layer(concat_layer_cfg);
 
+    uint32_t setup_SSR =
+        (l.implementation == OPT) || (l.implementation == OPT_EX);
+
     // Linear layer
     uint32_t m = l.input_shape[0];
     uint32_t k = l.input_shape[1] * l.num_inputs;
     uint32_t n = l.output_shape[1];
-    gemm(l.dtype, 0, 0, 1, 0, snrt_cluster_num(), 1, 1, 1, 1, 1, 0, 0, m, n, k,
-         1.0, l.concat_output, l.weights, 0.0, l.linear_output, l.baseline);
+    gemm(l.dtype, 0, setup_SSR, 1, 0, snrt_cluster_num(), 1, 1, 1, 1, 1, 0,
+         l.trans_weights, m, n, k, 1.0, l.concat_output, l.weights, 0.0,
+         l.linear_output, l.implementation);
 
     snrt_global_barrier();
 
@@ -70,8 +75,12 @@ static inline int fused_concat_linear_optimized(fused_concat_linear_layer_t l) {
     }
     snrt_cluster_hw_barrier();
 
-    gemm(l.dtype, 0, 0, 0, 1, 1, 1, l.num_inputs, 0, 1, 1, 0, 0, m, n, concat_k,
-         1.0, a, l.weights, 0.0, l.linear_output, l.baseline);
+    uint32_t setup_SSR =
+        (l.implementation == OPT) || (l.implementation == OPT_EX);
+
+    gemm(l.dtype, 0, setup_SSR, 0, 1, 1, 1, l.num_inputs, 0, 1, 1, 0,
+         l.trans_weights, m, n, concat_k, 1.0, a, l.weights, 0.0,
+         l.linear_output, l.implementation);
 
     snrt_global_barrier();