diff --git a/apps/ULPPACK/main.c b/apps/ULPPACK/main.c index 564de4da3..0cb32a6d6 100644 --- a/apps/ULPPACK/main.c +++ b/apps/ULPPACK/main.c @@ -1,22 +1,22 @@ -// Author : Theo Dupuis -// GR2M - 2022 -// Polytechnique Montreal +// Author: Elisabeth Humblet +// Based on the work by: Theo Dupuis +// GR2M - 2024 +// Polytechnique Montréal -#include +#include #include -#include +#include #include "ibsconv2d_tensor32.h" #include "runtime.h" #include "cache_metrics.h" -//#ifndef SPIKE -//#include "printf.h" -//#endif - - #include "util.h" +// ============================= +// ====== UART FUNCTIONS ======= +// ============================= + #define UART_BASE 0xFFF0C2C000 #define UART_INTERRUPT_ENABLE UART_BASE + 1 @@ -45,54 +45,64 @@ void init_uart(uint32_t freq, uint32_t baud) write_reg_u8(UART_MODEM_CONTROL, 0x20); // Autoflow mode } +// ============================= +// ==== MACROS DEFINITIONS ===== +// ============================= - - - +// ---- Architecture ---- #define NR_LANES 4 -//#define MULTIRUN - -// use to check if the results are correct -// since we compute the expectude results with -// scalar code and rolled loops, it had a significant -// amout of time on simulation +// ---- Debug ---- //#define VERIF -#define PRECA_MAX 2 -#define PRECW_MAX 2 +// ---- Precisions ---- +#define PRECA_MAX 1 +#define PRECW_MAX 1 + +// ---- Tensors ---- +#define F_MAX 7 // Max size of the kernel +#define C_IN 16 // Number of input channels +#define C_OUT 1 // Number of output channels +#define I_MAX 16 // Max H_in x W_in input size +#define I_START 16 // Start input size -#define F_MAX 7 // Max size of the kernel F x F -#define C_in 16 // Number of input input_channels -#define C_out 4 // Number of output_channels (or output input_channels C_out) -#define I_MAX 64 // Max H_in x W_in input size -#define I_START 64 // Start input size +int8_t i [I_MAX * I_MAX * C_IN]; +int8_t f [F_MAX * F_MAX * C_IN * C_OUT]; +int8_t f_nhwc[F_MAX * F_MAX * C_IN * C_OUT]; +int16_t o [(I_MAX - 
/*
 * Scalar reference ("golden") 2D convolution, used to check the vector kernels.
 *
 * Buffers are flat and are indexed here as:
 *   i : [W][R + F - 1][C + F - 1]   input,  W channels
 *   f : [K][W][F][F]                filters, K output channels
 *   o : [K][R][C]                   output
 *
 * o   Output buffer. Results are ACCUMULATED into it, so the caller must
 *     zero it beforehand to obtain a plain convolution.
 * i   Input buffer.
 * f   Filter buffer.
 * R   Number of output rows.
 * C   Number of output columns.
 * W   Number of input channels.
 * F   Filter height and width (square F x F filter).
 * K   Number of output channels.
 *
 * The per-MAC debug trace that used to live in the innermost loop has been
 * removed: it emitted one UART line per multiply-accumulate, which makes the
 * reference computation impractically slow. Loop counters are int64_t so
 * they have the same type as the bounds they are compared against.
 */
void iconv2d_tensor_naive(int16_t *o, int8_t *i, int8_t *f, int64_t R, int64_t C, int64_t W, int64_t F, int64_t K) {
  // Treat the flat pointers as multi-dimensional arrays.
  int8_t  (*i_)[R+F-1][C+F-1] = (int8_t  (*)[R+F-1][C+F-1])i;
  int8_t  (*f_)[W][F][F]      = (int8_t  (*)[W][F][F])f;
  int16_t (*o_)[R][C]         = (int16_t (*)[R][C])o;

  for (int64_t k = 0; k < K; k++) {
    for (int64_t ch = 0; ch < W; ch++) {
      for (int64_t r = 0; r < R; r++) {
        for (int64_t c = 0; c < C; c++) {
          for (int64_t fh = 0; fh < F; fh++) {
            for (int64_t fw = 0; fw < F; fw++) {
              // Product is computed in int, then accumulated into the
              // 16-bit output element.
              o_[k][r][c] += i_[ch][r+fh][c+fw] * f_[k][ch][fh][fw];
            }
          }
        }
      }
    }
  }
}
/*
 * Print a tensor of 8-bit elements to the console.
 *
 * First prints the address of the buffer in hexadecimal, then num_depth
 * planes of num_rows x num_columns elements. Each element is printed as an
 * unsigned decimal in a 10-character field; every row and every plane is
 * terminated by "\r\n".
 *
 * tensor       Flat buffer, indexed as (row + plane * num_rows) * num_columns + column.
 * num_rows     Rows per plane.
 * num_columns  Columns per row.
 * num_depth    Number of planes.
 */
void print_tensor(uint8_t *tensor, uint64_t num_rows, uint64_t num_columns, uint64_t num_depth) {
  // %lX with an unsigned long argument: the previous "%8X" was handed a
  // 64-bit value, which does not match the conversion. The header line now
  // ends in "\r\n" like every other line printed here.
  printf("0x%8lX\r\n", (unsigned long)(uintptr_t)tensor);
  for (uint64_t k = 0; k < num_depth; ++k) {
    for (uint64_t i = 0; i < num_rows; ++i) {
      for (uint64_t j = 0; j < num_columns; ++j) {
        printf("%10u ", (unsigned)tensor[(i + k * num_rows) * num_columns + j]);
      }
      printf("\r\n");
    }
    printf("\r\n");
  }
}

/*
 * Print a tensor of 16-bit elements to the console.
 *
 * Same layout and output format as print_tensor(), for uint16_t elements.
 *
 * tensor       Flat buffer, indexed as (row + plane * num_rows) * num_columns + column.
 * num_rows     Rows per plane.
 * num_columns  Columns per row.
 * num_depth    Number of planes.
 */
void print_tensor_16_(uint16_t *tensor, uint64_t num_rows, uint64_t num_columns, uint64_t num_depth) {
  // See print_tensor() for the rationale behind the conversion and line ending.
  printf("0x%8lX\r\n", (unsigned long)(uintptr_t)tensor);
  for (uint64_t k = 0; k < num_depth; ++k) {
    for (uint64_t i = 0; i < num_rows; ++i) {
      for (uint64_t j = 0; j < num_columns; ++j) {
        printf("%10u ", (unsigned)tensor[(i + k * num_rows) * num_columns + j]);
      }
      printf("\r\n");
    }
    printf("\r\n");
  }
}
///////////////////////////////// - // SAME SIZE OUTPUT 64b -> 64b // - ///////////////////////////////// - - - - -for(int64_t precA = PRECA_MAX; precA <= PRECA_MAX; precA++){ - for(int64_t precW = PRECW_MAX; precW <= PRECW_MAX; precW++){ - while(argv[0][0] != amo_cnt); - if(get_hartid()==0){ - printf("\r\n"); - printf("************\r\n"); - printf("*** A%dW%d ***\r\n", precA, precW); - printf("************\r\n"); - - printf("\r\n"); - printf("Filling the input and filter tensors...\r\n"); - } - ATOMIC_OP(amo_cnt,0,add,w); - - start_timer(); - init_tensor(i, I_MAX, I_MAX, C_in, precA); - init_tensor(f, F_MAX, F_MAX, C_in * C_out, precW); - stop_timer(); - int64_t init_time = get_timer(); - printf(" done\r\n"); - - for(int64_t F = F_MAX ; F <= F_MAX ; F += 2){ - start_timer(); - int64_t input_channels = C_in; // channel size is fixed for simplicity - int64_t output_channels = C_out; - int8_t filter[output_channels * F * F * input_channels]; - - for(int k = 0; k < output_channels ; k++) - for(int z = 0; z < input_channels ; z++) - for(int y = 0 ; y < F ; y++) - for(int x = 0 ; x < F ; x++) - filter[x + F * (y + F * (z + k * input_channels))] = f[x + F_MAX * (y + F_MAX * (z + k * input_channels))]; - - #ifdef VERIF - printf("Computing the expected output for this kernel size...\r\n"); - //Compute the expected output - int16_t golden_o[(I_MAX - F + 1) * (I_MAX - F + 1) * C_out]; - - for(int z = 0; z < output_channels ; z++) - for(int y = 0 ; y < (I_MAX - F + 1) ; y++) - for(int x = 0 ; x < (I_MAX - F + 1) ; x++) - { - golden_o[x + (I_MAX - F + 1) * (y + z * (I_MAX - F + 1))] = 0; - } - - iconv2d_tensor_naive(golden_o, i, f, (I_MAX - F + 1), (I_MAX - F + 1), input_channels, F, output_channels); - - printf(" done\r\n"); - #endif - - - // FILTER TRANSPOSITION INTO NHWC format - - NCHW_to_NHWC_8b(f, f_nhwc, output_channels, input_channels, F, F); - - stop_timer(); - int64_t filter_timer = get_timer(); - - //printf("\nfilter %dx%d \r\n", F, F); - - for(int size = I_START ; 
size <= I_MAX ; size*=2){ - while(argv[0][0] != amo_cnt); - if (get_hartid()==0){ - printf("\n"); - printf("----------------------------------------------------------------\r\n"); - printf("Calculating convolution between \r\n"); - printf("Input of [1 x %d x %d x %d] and Filters of [%d x %d x %d x %d] \r\n", C_in, size, size, C_out, C_in, F, F); - printf("Activation precision of %d and Weights precision of %d \r\n", precA, precW); - printf("Result (16b) is an output of [1 x %d x %d x %d] \r\n", C_out, size - F + 1, size - F + 1); - printf("----------------------------------------------------------------\r\n"); - printf("\r\n"); - - #ifdef VERIF - printf("Formatting data and expected outputs...\r\n"); - #else - printf("Formatting data...\r\n"); - #endif - } - ATOMIC_OP(amo_cnt,0,add,w); - - int64_t width = size; - int64_t height = size; - - int8_t input [width * height * input_channels]; - int8_t i_nhwc [width * height * input_channels]; - int16_t output[width * height * output_channels]; - int16_t golden_output[(width - F + 1) * (height- F + 1) * output_channels]; - - //////////////////////////////////////////////// - // INPUT, FILTERS AND EXPECTED OUTPUT SLICING // - //////////////////////////////////////////////// - start_timer(); - for(int z = 0; z < input_channels ; z++) - for(int y = 0 ; y < height ; y++) - for(int x = 0 ; x < width ; x++) - input[x + width * (y + z * height)] = i[x + I_MAX * (y + z * I_MAX)]; - - for(int z = 0; z < output_channels ; z++) - for(int y = 0 ; y < (height - F + 1) ; y++) - for(int x = 0 ; x < (width - F + 1) ; x++) - { - output[x + (width - F + 1) * (y + z * (height - F + 1))] = 0; - #ifdef VERIF - golden_output[x + (width - F + 1) * (y + z * (height - F + 1))] = golden_o[x + (I_MAX - F + 1) * (y + z * (I_MAX - F + 1))]; - #endif - } - - NCHW_to_NHWC_8b(input, i_nhwc, 1, input_channels, height, width); - stop_timer(); - int64_t slicing_timer = get_timer(); - /////////////////////////// - // FONCTION TO BE TESTED // - 
/////////////////////////// - - printf(" done\r\n"); - //print_tensor(f, F, F, input_channels); - //print_tensor(input, height, width, input_channels); - printf("Computing results...\r\n"); - - #ifdef MULTIRUN - start_timer(); - ulppack_conv2d(output, input, f, height, width, input_channels, F, output_channels, precA, precW); - stop_timer(); - int64_t run1 = get_timer(); - start_timer(); - ulppack_conv2d(output, input, f, height, width, input_channels, F, output_channels, precA, precW); - stop_timer(); - int64_t run2 = get_timer(); - start_timer(); - ulppack_conv2d(output, input, f, height, width, input_channels, F, output_channels, precA, precW); - stop_timer(); - int64_t run3 = get_timer(); - start_timer(); - ulppack_conv2d(output, input, f, height, width, input_channels, F, output_channels, precA, precW); - stop_timer(); - int64_t run4 = get_timer(); - - #else - for (int core=0;core<4;core++){ - reset_L2_metrics(core); - init_L2_metrics(core); - } - - start_timer(); - - ulppack_conv2d(output, input, f, height, width, input_channels, F, output_channels, precA, precW); - - stop_timer(); - for (int core=0;core<4;core++){ - stop_L2_metrics(core); - } - #endif - printf(" done\r\n"); - - ////////////////// - // VERIFICATION // - ////////////////// - - int16_t golden_output_nhwc[(I_MAX - F + 1) * (I_MAX - F + 1) * C_out]; - - NCHW_to_NHWC_16b(golden_output, golden_output_nhwc, 1, C_out, I_MAX - F + 1, I_MAX - F + 1); - - #ifdef VERIF - printf("Verifying results...\r\n"); - int error = verify_tensor(output, golden_output, (height - F + 1), (width - F + 1), output_channels); - if (error == 0) - printf(" done\r\n"); - else - printf(" ERROR\r\n"); - #else - //printf("-- Change macro to add verification step -- \r\n"); - int error = 0; - #endif - - ///////////// - // METRICS // - ///////////// - - int64_t runtime = get_timer(); - float performance = 2.0 * C_out * C_in * F * F * (size - F + 1) * (size - F + 1) / runtime; - float utilization = 100 * performance / (256 / 
(precA * precW)) * NR_LANES; - - if (error != 0){ - printf("Fail.\r\n"); - printf("OUT NHWC\r\n"); - print_tensor_16_(output, (height - F + 1), (width - F + 1), output_channels); - printf("EXPECTED OUT NHWC\r\n"); - print_tensor_16_(golden_output_nhwc, (height - F + 1), (width - F + 1), output_channels); - printf("EXPECTED OUT\r\n"); - print_tensor_16_(golden_output, (height - F + 1), (width - F + 1), output_channels); - } - else { - while(argv[0][0] != amo_cnt); - //if (get_hartid()==0){ - printf("Passed.\r\n"); - #ifdef MULTIRUN - printf("The execution of Run 1 took %d cycles. \r\n", run1); - printf("The execution of Run 2 took %d cycles. \r\n", run2); - printf("The execution of Run 3 took %d cycles. \r\n", run3); - printf("The execution of Run 4 took %d cycles. \r\n", run4); - #else - printf("The execution took %d cycles.\r\n", runtime); - #endif - //printf("The initialization took %d cycles. \r\n", init_time); - //printf("The filter init took %d cycles. \r\n", filter_timer); - //printf("The slicing took %d cycles.\r\n",slicing_timer); - //printf("The performance is %f OP/cycle, the utilization is %f % \n", performance, utilization); - //} - for (int core=0;core<4;core++){ - if(get_hartid()==core){ - print_L2_metrics(core); - } - } - #ifdef PERF - printf("The execution of bit-serial packing took %d cycles.\r\n", runtime_bp); - printf("The execution of conv2d took %d cycles.\r\n", runtime - runtime_bp); - #endif - } - ATOMIC_OP(amo_cnt,1,add,w); - //start_timer(); - // - //ulppack_conv2d(output, input, f, height, width, input_channels, F, output_channels, precA, precW); - // - //stop_timer(); - // - //printf(" done\r\n"); - // int64_t runtime2 = get_timer(); - //printf("The execution took %d cycles.\r\n", runtime2); - } +void initialization(int64_t precA, int64_t precW, int64_t F, int64_t input_channels, int64_t output_channels, int8_t *filter, int size, int64_t width, int64_t height, int8_t *input, int8_t *i_nhwc, int16_t *output, int16_t *golden_output, int16_t 
*golden_o){ + printf("===============\r\n"); + printf("= ULPCONV2D16 =\r\n"); + printf("===============\r\n"); + + printf("\r\n"); + printf("************\r\n"); + printf("*** A%dW%d ***\r\n", precA, precW); + printf("************\r\n"); + + + // ==== Init tensors ==== + printf("\r\n"); + printf("Filling the input and filter tensors... \r\n"); + + init_tensor(i, I_MAX, I_MAX, C_IN, precA); + init_tensor(f, F_MAX, F_MAX, C_IN * C_OUT, precW); + + printf(" done\r\n"); + + //for(int k = 0; k < output_channels ; k++) + // for(int z = 0; z < input_channels ; z++) + // for(int y = 0 ; y < F ; y++) + // for(int x = 0 ; x < F ; x++) + // filter[x + F * (y + F * (z + k * input_channels))] = f[x + F_MAX * (y + F_MAX * (z + k * input_channels))]; + + #ifdef VERIF + // ==== Expected output ==== + printf("Computing the expected output for this kernel size... \r\n"); + for(int z = 0; z < output_channels ; z++) + for(int y = 0 ; y < (I_MAX - F + 1) ; y++) + for(int x = 0 ; x < (I_MAX - F + 1) ; x++) + golden_o[x + (I_MAX - F + 1) * (y + z * (I_MAX - F + 1))] = 0; + + iconv2d_tensor_naive(golden_o, i, f, (I_MAX - F + 1), (I_MAX - F + 1), input_channels, F, output_channels); + printf(" done\r\n"); + #endif + + // ==== Transpose filter ==== + NCHW_to_NHWC_8b(f, f_nhwc, output_channels, input_channels, F, F); + + // ==== Information ==== + printf("\r\n"); + printf("----------------------------------------------------------------\r\n"); + printf("Calculating convolution between \r\n"); + printf("Input of [1 x %d x %d x %d] and Filters of [%d x %d x %d x %d] \r\n", C_IN, size, size, C_OUT, C_IN, F, F); + printf("Activation precision of %d and Weights precision of %d \r\n", precA, precW); + printf("Result (16b) is an output of [1 x %d x %d x %d] \r\n", C_OUT, size - F + 1, size - F + 1); + printf("----------------------------------------------------------------\r\n"); + printf("\r\n"); + + #ifdef VERIF + printf("Formatting data and expected outputs...\r\n"); + #else + printf("Formatting 
data...\r\n"); + #endif + + // ==== Tensors slicing ==== + //for(int z = 0; z < input_channels; z++) + // for(int y = 0; y < height; y++) + // for(int x = 0; x < width; x++) + // input[x + width * (y + z * height)] = i[x + I_MAX * (y + z * I_MAX)]; + + for(int z = 0; z < output_channels ; z++) + for(int y = 0 ; y < (height - F + 1) ; y++) + for(int x = 0 ; x < (width - F + 1) ; x++) + { + o[x + (width - F + 1) * (y + z * (height - F + 1))] = 0; + // #ifdef VERIF + // golden_output[x + (width - F + 1) * (y + z * (height - F + 1))] = golden_o[x + (I_MAX - F + 1) * (y + z * (I_MAX - F + 1))]; + // #endif } - } - } -} -/* - -// Author : Theo Dupuis -// GR2M - 2022 -// Polytechnique Montreal -// Modified : Elisabeth Humblet, 2024 - -#include -#include -#include - -#include "ibsconv2d_tensor32.h" -#include "runtime.h" -#include "cache_metrics.h" - -//#ifndef SPIKE -//#include "printf.h" -//#endif -#include "util.h" - -#define UART_BASE 0xFFF0C2C000 - -#define UART_INTERRUPT_ENABLE UART_BASE + 1 -#define UART_LINE_CONTROL UART_BASE + 3 -#define UART_MODEM_CONTROL UART_BASE + 4 -#define UART_LINE_STATUS UART_BASE + 5 -#define UART_MODEM_STATUS UART_BASE + 6 -#define UART_DLAB_LSB UART_BASE + 0 -#define UART_DLAB_MSB UART_BASE + 1 + NCHW_to_NHWC_8b(i, i_nhwc, 1, input_channels, height, width); + printf(" done\r\n"); -void write_reg_u8(uintptr_t addr, uint8_t value) -{ - volatile uint8_t *loc_addr = (volatile uint8_t *)addr; - *loc_addr = value; + // ==== INITIALIZATION DONE ==== + ATOMIC_OP(init_done, 1, add, w); } -void init_uart(uint32_t freq, uint32_t baud) -{ - uint32_t divisor = freq / (baud << 4); - - write_reg_u8(UART_INTERRUPT_ENABLE, 0x00); // Disable all interrupts - write_reg_u8(UART_LINE_CONTROL, 0x80); // Enable DLAB (set baud rate divisor) - write_reg_u8(UART_DLAB_LSB, divisor); // divisor (lo byte) - write_reg_u8(UART_DLAB_MSB, (divisor >> 8) & 0xFF); // divisor (hi byte) - write_reg_u8(UART_LINE_CONTROL, 0x03); // 8 bits, no parity, one stop bit - 
/*
 * Dispatch the ULPPACK convolution one output channel at a time.
 *
 * For each output channel c, the output and filter pointers are advanced to
 * that channel and a kernel is selected from the filter size F and the
 * precisions precA / precW. When OUTPUT_MULTICORE is defined, channel c is
 * processed only by the hart whose get_hartid() equals c. When the loop is
 * over, conv_done is incremented once by the calling hart.
 *
 * o              Output tensor; channel c starts at c * (H_in-F+1) * (W_in-F+1).
 * i              Input tensor, passed unchanged to every kernel call.
 * f              Filter tensor; channel c starts at c * F * F * C_in.
 * H_in, W_in     Input height and width.
 * C_in           Number of input channels.
 * F              Filter height and width.
 * C_out          Number of output channels.
 * precA, precW   Activation / weight precisions used for kernel selection.
 *
 * NOTE(review): with OUTPUT_MULTICORE, a channel index that matches no hart
 * id is never computed -- confirm C_out never exceeds the number of harts.
 * NOTE(review): every kernel is handed the full C_out although o_ and f_
 * already point at a single channel -- confirm the kernels only process one
 * output channel per call.
 * NOTE(review): combinations not listed below (e.g. F other than 3/7, or a
 * precision above 3 in the non-VMACSR build) fall through without running
 * any kernel and without reporting it.
 */
void ulppack_conv2d_msparq(int16_t *o, int8_t *i, int8_t *f, int64_t H_in, int64_t W_in, int64_t C_in, int64_t F, int64_t C_out, int64_t precA, int64_t precW){
  int8_t *i_;
  int16_t *o_;
  int8_t *f_;

  for(int64_t c = 0; c < C_out; c++){
    #ifdef OUTPUT_MULTICORE
    // One output channel per hart: only the hart whose id equals c works on it.
    if(get_hartid() == c){
    #endif
      // Per-channel views: output plane c, whole input, filter set c.
      o_ = o + c * (H_in - F + 1) *(W_in - F + 1);
      i_ = i;
      f_ = f + c * F * F * C_in;
      #ifdef VMACSR
      // VMACSR build: only 7x7 filters are dispatched.
      if (F == 7){
        // vec8 kernels when one precision is below 2 and the other at most 2;
        // vec16 kernel otherwise.
        if((precA <= 2 && precW < 2) || (precA < 2 && precW <= 2)){
          #ifdef INPUT_MULTICORE
          ulppack_conv2d_vec8_7x7_tiling(o_, i_, f_, H_in, W_in, C_in, F, C_out);
          #else
          ulppack_conv2d_vec8_7x7(o_, i_, f_, H_in, W_in, C_in, F, C_out);
          #endif
        } else {
          ulppack_conv2d_vec16_7x7(o_, i_, f_, H_in, W_in, C_in, F, C_out);
        }
      }
      #else
      // Non-VMACSR build: 3x3 kernel, or a 7x7 kernel picked by precision.
      if(F == 3){
        ulppack_conv2d_vec_3x3(o_, i_, f_, H_in, W_in, C_in, F, C_out);
      }else
      if (F == 7){
        if (precA <= 1 && precW <= 1){
          ulppack_conv2d_vec_7x7_A1W1(o_, i_, f_, H_in, W_in, C_in, F, C_out);
        }else if (precA <= 2 && precW <= 2){
          ulppack_conv2d_vec_7x7_A2W2(o_, i_, f_, H_in, W_in, C_in, F, C_out);
        }else if (precA <= 3 && precW <= 3){
          ulppack_conv2d_vec_7x7_A3W3(o_, i_, f_, H_in, W_in, C_in, F, C_out);
        }
      }
      #endif
    #ifdef OUTPUT_MULTICORE
    }
    #endif
  }
  // Signal that this hart has finished its share of the convolution.
  ATOMIC_OP(conv_done, 1, add, w);
}
hart=0;hart<4;hart++){ - - for(int precA = PRECA ; precA <= PRECA ; precA ++){ - for(int precW = PRECW ; precW <= PRECW ; precW ++){ - for(int size = I_START ; size <= I_MAX ; size *= 2){ - - printf("\r\n"); - printf("----------------------------------------------------------------\r\n"); - printf("Calculating convolution between \r\n"); - printf("Input of [1 x %d x %d x %d] and Filters of [%d x %d x %d x %d] \r\n", C_in, size, size, C_out, C_in, F, F); - printf("Activation precision of %d and Weights precision of %d \r\n", precA, precW); - printf("Result (16b) is an output of [1 x %d x %d x %d] \r\n", C_out, size - F + 1, size - F + 1); - printf("----------------------------------------------------------------\r\n"); - printf("\r\n"); - - //printf("Core %d\r\n",hart); - printf("Computing results...\r\n"); - for (int core=0;core<4;core++){ - reset_L2_metrics(core); - init_L2_metrics(core); - } - start_timer(); - - ulppack_conv2d(o, o, f, size, size, C_in, F, C_out, precA, precW); - - stop_timer(); - for (int core=0;core<4;core++){ - stop_L2_metrics(core); - } - printf(" done\r\n"); - - ///////////// - // METRICS // - ///////////// - - int64_t runtime = get_timer(); - - float performance = (2.0 * C_out * C_in * F * F * (size - F + 1) * (size - F + 1) )/ runtime; - float utilization = 100 * performance / (4 * 2 * NR_LANES); - - // printf("Passed.\r\n"); - if (get_hartid()==0){ - printf("The execution took %d cycles.\r\n", runtime); - // printf("The performance is %f OP/cycle, the utilization is %f \r\n", performance, utilization); - } - for (int core=0;core<4;core++){ - if(get_hartid()==core){ - print_L2_metrics(core); - } - - } - }} -//} -} -*/ \ No newline at end of file +int main(int argc, char** argv){ + init_uart(50000000, 115200); + + // ===== INIT DEFINES ===== + int64_t precA = PRECA_MAX; + int64_t precW = PRECW_MAX; + + int64_t F = F_MAX; + int64_t input_channels = C_IN; + int64_t output_channels = C_OUT; + int8_t filter[output_channels * F * F * 
input_channels]; + + int size = I_START; + int64_t width = size; + int64_t height = size; + + int8_t input [width * height * input_channels]; + int8_t i_nhwc [width * height * input_channels]; + int16_t output [width * height * output_channels]; + int16_t golden_output [(width - F + 1) * (width - F + 1) * output_channels]; + int16_t golden_o[(I_MAX - F + 1) * (I_MAX - F + 1) * C_OUT]; + + + // ===== INITIALIZATION ===== + + if (argv[0][0] == 0){ + initialization(precA, precW, F, input_channels, output_channels, filter, size, width, height, input, i_nhwc, output, golden_output, golden_o); + } + while(init_done == 0); + + printf("Computing results...\r\n"); + + // ===== COMPUTING RESULTS ===== + + // --- Reset L2 metrics --- + reset_L2_metrics(argv[0][0]); + init_L2_metrics(argv[0][0]); + + // --- Timer and convolution --- + start_timer(); + ulppack_conv2d_msparq(o, i, f, height, width, input_channels, F, output_channels, precA, precW); + stop_timer(); + + // --- Wait for end of convolution --- + while(conv_done == 0); + + // --- Stop L2 metrics + stop_L2_metrics(argv[0][0]); + printf(" done\r\n"); + + if(argv[0][0] == 0){ + // ===== VERIFICATION ===== + int16_t golden_output_nhwc[(I_MAX - F + 1) * (I_MAX - F + 1) * C_OUT]; + NCHW_to_NHWC_16b(golden_o, golden_output_nhwc, 1, C_OUT, I_MAX - F + 1, I_MAX - F + 1); + + #ifdef VERIF + printf("Verifying results...\r\n"); + int error = verify_tensor(o, golden_o, (height - F + 1), (width - F + 1), output_channels); + if (error == 0) + printf(" done\r\n"); + else + printf(" ERROR\r\n"); + #else + int error = 0; + #endif + + // ===== METRICS ===== + int64_t runtime = get_timer(); + + if(error != 0){ + printf("Fail.\r\n"); + printf("Output NHWC\r\n"); + print_tensor_16_(o, (height - F + 1), (width - F + 1), output_channels); + printf("=========================================\r\n"); + printf("Expected output\r\n"); + print_tensor_16_(golden_o, (height - F + 1), (width - F + 1), output_channels); + } else { + 
printf("Passed.\r\n"); + printf("The execution took %d cycles.\r\n", runtime); + print_L2_metrics(argv[0][0]); + } + } +} \ No newline at end of file