diff --git a/apps/ULPPACK/main.c b/apps/ULPPACK/main.c
index 564de4da3..0cb32a6d6 100644
--- a/apps/ULPPACK/main.c
+++ b/apps/ULPPACK/main.c
@@ -1,22 +1,22 @@
-// Author : Theo Dupuis
-// GR2M - 2022
-// Polytechnique Montreal
+// Author: Elisabeth Humblet
+// Based on the work by: Theo Dupuis
+// GR2M - 2024
+// Polytechnique Montréal
 
-#include <stdint.h>
+#include <stdint.h> 
 #include <stdio.h>
-#include <string.h>
+#include <string.h> 
 
 #include "ibsconv2d_tensor32.h"
 #include "runtime.h"
 #include "cache_metrics.h"
 
-//#ifndef SPIKE
-//#include "printf.h"
-//#endif
-
-
 #include "util.h"
 
+// =============================
+// ====== UART FUNCTIONS =======
+// =============================
+
 #define UART_BASE 0xFFF0C2C000
 
 #define UART_INTERRUPT_ENABLE UART_BASE + 1
@@ -45,54 +45,64 @@ void init_uart(uint32_t freq, uint32_t baud)
     write_reg_u8(UART_MODEM_CONTROL, 0x20);    // Autoflow mode
 }
 
+// =============================
+// ==== MACROS DEFINITIONS =====
+// =============================
 
-
-
-
+// ---- Architecture ----
 #define NR_LANES 4
-//#define MULTIRUN
-
-// use to check if the results are correct
-// since we compute the expectude results with
-// scalar code and rolled loops, it had a significant
-// amout of time on simulation
 
+// ---- Debug ----
 //#define VERIF
 
-#define PRECA_MAX	2
-#define PRECW_MAX	2
+// ---- Precisions ----
+#define PRECA_MAX 1
+#define PRECW_MAX 1
+
+// ---- Tensors ----
+#define F_MAX     7             // Max size of the kernel
+#define C_IN      16            // Number of input channels
+#define C_OUT     1             // Number of output channels
+#define I_MAX     16            // Max H_in x W_in input size
+#define I_START   16            // Start input size
 
-#define F_MAX 		7		// Max size of the kernel F x F
-#define C_in 		16		// Number of input input_channels 
-#define C_out		4		// Number of output_channels (or output input_channels C_out)
-#define I_MAX 		64		// Max H_in x W_in input size
-#define I_START		64		// Start input size
+int8_t i     [I_MAX * I_MAX * C_IN];
+int8_t f     [F_MAX * F_MAX * C_IN * C_OUT];
+int8_t f_nhwc[F_MAX * F_MAX * C_IN * C_OUT];
+int16_t o    [(I_MAX - F_MAX + 1)*(I_MAX - F_MAX + 1) * C_OUT];
 
-int8_t i[I_MAX * I_MAX * C_in];
+// =============================
+// === MULTICORE DEFINITIONS ===
+// =============================
 
-int8_t f     [F_MAX * F_MAX * C_in * C_out];
-int8_t f_nhwc[F_MAX * F_MAX * C_in * C_out];
+volatile static uint32_t init_done = 0;
+volatile static uint32_t conv_done = 0;
 
+//#define INPUT_MULTICORE
+#define OUTPUT_MULTICORE
 
-//////////////////////////////////////
-//       utilities functions        //
-//////////////////////////////////////
+// =============================
+// ==== UTILITIES FUNCTIONS ====
+// =============================
 
 void iconv2d_tensor_naive(int16_t *o, int8_t *i, int8_t *f, int64_t R, int64_t C, int64_t W, int64_t F, int64_t K) {
 
-//treat pointers as 3D arrays
-int8_t (*i_)[R+F-1][C+F-1] = (int8_t (*)[R+F-1][C+F-1])i;
-int8_t (*f_)[W][F][F] = (int8_t (*)[W][F][F])f;
-int16_t (*o_)[R][C] = (int16_t (*)[R][C])o;
- 
-for(int k = 0 ; k < K ; k++) 
-	for(int ch = 0 ; ch < W ; ch++)
-		for(int r = 0 ; r < R ; r++)
-			for(int c = 0 ; c < C ; c++)
-				for(int fh = 0 ; fh < F ; fh++)
-					for(int fw = 0 ; fw < F ; fw++) {
-						o_[k][r][c] += i_[ch][r+fh][c+fw]*f_[k][ch][fh][fw];
-					}
+    //treat pointers as 3D arrays
+    int8_t (*i_)[R+F-1][C+F-1] = (int8_t (*)[R+F-1][C+F-1])i;
+    int8_t (*f_)[W][F][F] = (int8_t (*)[W][F][F])f;
+    int16_t (*o_)[R][C] = (int16_t (*)[R][C])o;
+    
+    for(int k = 0 ; k < K ; k++) 
+    	for(int ch = 0 ; ch < W ; ch++)
+    		for(int r = 0 ; r < R ; r++)
+    			for(int c = 0 ; c < C ; c++)
+    				for(int fh = 0 ; fh < F ; fh++)
+    					for(int fw = 0 ; fw < F ; fw++) {
+    						o_[k][r][c] += i_[ch][r+fh][c+fw]*f_[k][ch][fh][fw];
+    						if(get_hartid()==0){
+    						printf("k  %d, ch %d, r %d, c %d, fh %d, fw %d\r\n", k, ch, r, c, fh, fw);
+    						}
+    					}
 }
 
 void NCHW_to_NHWC_8b(int8_t * NCHW_format, int8_t * NHWC_format, int64_t N, int64_t C, int64_t H, int64_t W){
@@ -134,13 +144,9 @@ void init_tensor(int8_t *tensor, int64_t R, int64_t C, int64_t D, int precision)
             }
 }
 
-
-
-
-//////////////////////////////////////
-// Verification and debug fonctions //
-//////////////////////////////////////
-
+// =============================
+// === VERIF/DEBUG FUNCTIONS ===
+// =============================
 
 int verify_tensor(int16_t *tensor1, int16_t *tensor2, int64_t height, int64_t width, int64_t channels) {
   for (int h = 0; h < height; ++h)   //depth
@@ -156,416 +162,236 @@ return 0;
 }
 
 void print_tensor(uint8_t *tensor, uint64_t num_rows, uint64_t num_columns, uint64_t num_depth) {
-  printf("0x%8X\n", (uint64_t)tensor);
-  for (uint64_t k = 0; k < num_depth; ++k) {
-  	for (uint64_t i = 0; i < num_rows; ++i) {
-    	for (uint64_t j = 0; j < num_columns; ++j) {
-      printf("%10u ", tensor[(i+k*num_rows) * num_columns  + j ]);
-    	}
-    	printf("\n");
-  	 }
-  	 printf("\n");
-	}
+    printf("0x%8X\n", (uint64_t)tensor);
+    for (uint64_t k = 0; k < num_depth; ++k) {
+  	    for (uint64_t i = 0; i < num_rows; ++i) {
+        	for (uint64_t j = 0; j < num_columns; ++j) {
+                printf("%10u ", tensor[(i+k*num_rows) * num_columns  + j ]);
+        	}
+        	printf("\r\n");
+  	    }
+  	printf("\r\n");
+    }
 }
 
 void print_tensor_16_(uint16_t *tensor, uint64_t num_rows, uint64_t num_columns, uint64_t num_depth) {
-  printf("0x%8X\n", (uint64_t)tensor);
-  for (uint64_t k = 0; k < num_depth; ++k) {
-  	for (uint64_t i = 0; i < num_rows; ++i) {
-    	for (uint64_t j = 0; j < num_columns; ++j) {
-      printf("%10u ", tensor[(i+k*num_rows) * num_columns  + j ]);
-    	}
-    	printf("\n");
-  	 }
-  	 printf("\n");
+    printf("0x%8X\n", (uint64_t)tensor);
+    for (uint64_t k = 0; k < num_depth; ++k) {
+  	    for (uint64_t i = 0; i < num_rows; ++i) {
+        	for (uint64_t j = 0; j < num_columns; ++j) {
+                printf("%10u ", tensor[(i+k*num_rows) * num_columns  + j ]);
+        	}
+        	printf("\r\n");
+  	    }
+  	    printf("\r\n");
 	}
 }
 
-
-
-
-int main(int argc, char** argv) {
-init_uart(50000000, 115200);
-
-volatile static uint32_t amo_cnt = 0;
-
-while(argv[0][0] != amo_cnt);
-if(get_hartid()==0){
-printf("==============\r\n");
-printf("= ULPCONV2D16 =\r\n");
-printf("==============\r\n");
-}
-ATOMIC_OP(amo_cnt,0, add,w);
-	/////////////////////////////////
-	// SAME SIZE OUTPUT 64b -> 64b //
-	/////////////////////////////////
-     
-
-	
-
-for(int64_t precA = PRECA_MAX; precA <= PRECA_MAX; precA++){ 
-	for(int64_t precW = PRECW_MAX; precW <= PRECW_MAX; precW++){
-		while(argv[0][0] != amo_cnt);
-		if(get_hartid()==0){
-			printf("\r\n");
-			printf("************\r\n");
-			printf("*** A%dW%d ***\r\n", precA, precW);
-			printf("************\r\n");
-
-			printf("\r\n");
-			printf("Filling the input and filter tensors...\r\n");
-		}
-		ATOMIC_OP(amo_cnt,0,add,w);
-
-		start_timer();
-		init_tensor(i, I_MAX, I_MAX, C_in, precA);
-		init_tensor(f, F_MAX, F_MAX, C_in * C_out, precW);
-		stop_timer();
-		int64_t init_time = get_timer();
-			printf("                                                            done\r\n");
-
-		for(int64_t F = F_MAX ; F <= F_MAX ; F += 2){
-			start_timer();
-			int64_t input_channels = C_in; // channel size is fixed for simplicity
-			int64_t output_channels = C_out;
-			int8_t filter[output_channels * F * F * input_channels];
-			
-			for(int k = 0; k < output_channels ; k++)
-				for(int z = 0; z < input_channels ; z++)
-					for(int y = 0 ; y < F ; y++)
-						for(int x = 0 ; x < F ; x++)
-				  			filter[x + F * (y + F * (z + k * input_channels))] = f[x + F_MAX * (y + F_MAX * (z + k * input_channels))];
-			
-			#ifdef VERIF
-			printf("Computing the expected output for this kernel size...\r\n");
-			//Compute the expected output
-			int16_t golden_o[(I_MAX - F + 1) * (I_MAX - F + 1) * C_out]; 
-			
-			for(int z = 0; z < output_channels ; z++)
-				for(int y = 0 ; y < (I_MAX - F + 1) ; y++)
-					for(int x = 0 ; x < (I_MAX - F + 1) ; x++)
-					{
-						golden_o[x + (I_MAX - F + 1) * (y + z * (I_MAX - F + 1))] = 0;
-					}
-			
-			iconv2d_tensor_naive(golden_o, i, f, (I_MAX - F + 1), (I_MAX - F + 1), input_channels, F, output_channels);	
-			
-			printf("                                                            done\r\n");
-			#endif	
-			
-			
-			// FILTER TRANSPOSITION INTO NHWC format
-			
-			NCHW_to_NHWC_8b(f, f_nhwc, output_channels, input_channels, F, F);
-
-			stop_timer();
-			int64_t filter_timer = get_timer();
-
-			//printf("\nfilter %dx%d \r\n", F, F);
-
-			for(int size = I_START ; size <= I_MAX ; size*=2){
-				while(argv[0][0] != amo_cnt);
-				if (get_hartid()==0){
-				printf("\n");
-				printf("----------------------------------------------------------------\r\n");
-				printf("Calculating convolution between \r\n");
-				printf("Input of [1 x %d x %d x %d] and Filters of [%d x %d x %d x %d]  \r\n", C_in, size,  size, C_out, C_in, F, F);
-				printf("Activation precision of %d and Weights precision of %d  \r\n", precA, precW);
-				printf("Result (16b) is an output of [1 x %d x %d x %d] \r\n", C_out, size - F + 1, size - F + 1);
-				printf("----------------------------------------------------------------\r\n");
-				printf("\r\n");
-				
-				#ifdef VERIF
-				printf("Formatting data and expected outputs...\r\n");
-				#else
-				printf("Formatting data...\r\n");
-				#endif
-				}
-				ATOMIC_OP(amo_cnt,0,add,w);
-
-				int64_t width = size;
-				int64_t height = size;  
-				
-				int8_t input  [width * height * input_channels];
-				int8_t i_nhwc [width * height * input_channels];
-				int16_t output[width * height * output_channels];
-				int16_t golden_output[(width - F + 1) * (height- F + 1) * output_channels];
-
-				////////////////////////////////////////////////
-				// INPUT, FILTERS AND EXPECTED OUTPUT SLICING //
-				////////////////////////////////////////////////
-				start_timer();		
-				for(int z = 0; z < input_channels ; z++)
-					for(int y = 0 ; y < height ; y++)
-						for(int x = 0 ; x < width ; x++)
-				  			input[x + width * (y + z * height)] = i[x + I_MAX * (y + z * I_MAX)];
-				  			
-				for(int z = 0; z < output_channels ; z++)
-					for(int y = 0 ; y < (height - F + 1) ; y++)
-						for(int x = 0 ; x < (width - F + 1) ; x++)
-						{
-							output[x + (width - F + 1) * (y + z * (height - F + 1))] = 0;
-							#ifdef VERIF
-							golden_output[x + (width - F + 1) * (y + z * (height - F + 1))] = golden_o[x + (I_MAX - F + 1) * (y + z * (I_MAX - F + 1))];
-							#endif
-						}
-						
-				NCHW_to_NHWC_8b(input, i_nhwc, 1, input_channels, height, width);
-				stop_timer();
-				int64_t slicing_timer = get_timer();
-				///////////////////////////
-				// FONCTION TO BE TESTED //
-				///////////////////////////
-				
-				printf("                                                            done\r\n");
-				//print_tensor(f, F, F, input_channels);
-				//print_tensor(input, height, width, input_channels);
-				printf("Computing results...\r\n");
-
-				#ifdef MULTIRUN
-					start_timer();
-					ulppack_conv2d(output, input, f, height, width, input_channels, F, output_channels, precA, precW);
-					stop_timer();
-					int64_t run1 = get_timer();
-					start_timer();
-					ulppack_conv2d(output, input, f, height, width, input_channels, F, output_channels, precA, precW);
-					stop_timer();
-					int64_t run2 = get_timer();
-					start_timer();
-					ulppack_conv2d(output, input, f, height, width, input_channels, F, output_channels, precA, precW);
-					stop_timer();
-					int64_t run3 = get_timer();
-					start_timer();
-					ulppack_conv2d(output, input, f, height, width, input_channels, F, output_channels, precA, precW);
-					stop_timer();
-					int64_t run4 = get_timer();
-				
-				#else
-				for (int core=0;core<4;core++){
-					reset_L2_metrics(core);
-					init_L2_metrics(core);
-				}
-				
-				start_timer();
-				
-				ulppack_conv2d(output, input, f, height, width, input_channels, F, output_channels, precA, precW);
-			
-				stop_timer();
-				for (int core=0;core<4;core++){
-					stop_L2_metrics(core);
-				}
-				#endif
-				printf("                                                            done\r\n");                                
-				
-				//////////////////
-				// VERIFICATION //
-				//////////////////
-				
-				int16_t golden_output_nhwc[(I_MAX - F + 1) * (I_MAX - F + 1) * C_out];
-				
-				NCHW_to_NHWC_16b(golden_output, golden_output_nhwc, 1, C_out, I_MAX - F + 1, I_MAX - F + 1);
-				
-				#ifdef VERIF	
-				printf("Verifying results...\r\n");
-				int error = verify_tensor(output, golden_output, (height - F + 1), (width - F + 1), output_channels);
-				if (error == 0)
-					printf("                                                            done\r\n");
-				else
-					printf("   ERROR\r\n");
-				#else
-				//printf("-- Change macro to add verification step -- \r\n");
-				int error = 0;
-				#endif
-				
-				/////////////
-				// METRICS //
-				/////////////
-				
-				int64_t runtime = get_timer();
-				float performance = 2.0 * C_out * C_in * F * F * (size - F + 1) * (size - F + 1) / runtime;
-				float utilization = 100 * performance / (256 / (precA * precW)) * NR_LANES; 
-				
-				if (error != 0){
-					 printf("Fail.\r\n");
-					 printf("OUT NHWC\r\n");
-					 print_tensor_16_(output, (height - F + 1), (width - F + 1), output_channels);
-					 printf("EXPECTED OUT NHWC\r\n");
-				    print_tensor_16_(golden_output_nhwc, (height - F + 1), (width - F + 1), output_channels);
-					 printf("EXPECTED OUT\r\n");
-				    print_tensor_16_(golden_output, (height - F + 1), (width - F + 1), output_channels);
-				}
-				else {
-					while(argv[0][0] != amo_cnt);
-					//if (get_hartid()==0){
-					 printf("Passed.\r\n");					
-					 #ifdef MULTIRUN
-				     printf("The execution of Run 1 took %d cycles. \r\n", run1);
-					 printf("The execution of Run 2 took %d cycles. \r\n", run2);
-					 printf("The execution of Run 3 took %d cycles. \r\n", run3);
-					 printf("The execution of Run 4 took %d cycles. \r\n", run4);
-					 #else 	
-					 printf("The execution took %d cycles.\r\n", runtime);
-					 #endif
-					 //printf("The initialization took %d cycles. \r\n", init_time);
-					 //printf("The filter init took %d cycles. \r\n", filter_timer);
-					 //printf("The slicing took %d cycles.\r\n",slicing_timer);
-				  	 //printf("The performance is %f OP/cycle, the utilization is %f % \n", performance, utilization);
-					//}
-					for (int core=0;core<4;core++){
-						if(get_hartid()==core){
-						print_L2_metrics(core);
-						}						
-					}
-				  	 #ifdef PERF
-				  	 	printf("The execution of bit-serial packing took %d cycles.\r\n", runtime_bp);
-				  	 	printf("The execution of conv2d took %d cycles.\r\n", runtime - runtime_bp);
-				  	 #endif
-				  }
-					ATOMIC_OP(amo_cnt,1,add,w);
-				//start_timer();
-				//
-				//ulppack_conv2d(output, input, f, height, width, input_channels, F, output_channels, precA, precW);
-			//
-				//stop_timer();
-				//
-				//printf("                                                            done\r\n");  
-				//	int64_t runtime2 = get_timer();
-				//printf("The execution took %d cycles.\r\n", runtime2);
-			}
+void initialization(int64_t precA, int64_t precW, int64_t F, int64_t input_channels, int64_t output_channels, int8_t *filter, int size, int64_t width, int64_t height, int8_t *input, int8_t *i_nhwc, int16_t *output, int16_t *golden_output, int16_t *golden_o){
+    printf("===============\r\n");
+    printf("= ULPCONV2D16 =\r\n");
+    printf("===============\r\n");
+
+    printf("\r\n");
+    printf("************\r\n");
+    printf("*** A%dW%d ***\r\n", precA, precW);
+    printf("************\r\n");
+
+
+    // ==== Init tensors ====
+    printf("\r\n");
+    printf("Filling the input and filter tensors... \r\n");
+
+    init_tensor(i, I_MAX, I_MAX, C_IN, precA);
+    init_tensor(f, F_MAX, F_MAX, C_IN * C_OUT, precW);
+
+    printf("                                              done\r\n");
+
+    //for(int k = 0; k < output_channels ; k++)
+	//	for(int z = 0; z < input_channels ; z++)
+	//		for(int y = 0 ; y < F ; y++)
+	//			for(int x = 0 ; x < F ; x++)
+	//	  			filter[x + F * (y + F * (z + k * input_channels))] = f[x + F_MAX * (y + F_MAX * (z + k * input_channels))];
+
+    #ifdef VERIF
+        // ==== Expected output ====
+        printf("Computing the expected  output for this kernel size... \r\n");
+        for(int z = 0; z < output_channels ; z++)
+			for(int y = 0 ; y < (I_MAX - F + 1) ; y++)
+				for(int x = 0 ; x < (I_MAX - F + 1) ; x++)
+					golden_o[x + (I_MAX - F + 1) * (y + z * (I_MAX - F + 1))] = 0;
+        
+        iconv2d_tensor_naive(golden_o, i, f, (I_MAX - F + 1), (I_MAX - F + 1), input_channels, F, output_channels);
+        printf("                                              done\r\n");        
+    #endif
+
+    // ==== Transpose filter ====
+    NCHW_to_NHWC_8b(f, f_nhwc, output_channels, input_channels, F, F);
+
+    // ==== Information ====
+    printf("\r\n");
+    printf("----------------------------------------------------------------\r\n");
+	printf("Calculating convolution between \r\n");
+	printf("Input of [1 x %d x %d x %d] and Filters of [%d x %d x %d x %d]  \r\n", C_IN, size,  size, C_OUT, C_IN, F, F);
+	printf("Activation precision of %d and Weights precision of %d  \r\n", precA, precW);
+	printf("Result (16b) is an output of [1 x %d x %d x %d] \r\n", C_OUT, size - F + 1, size - F + 1);
+	printf("----------------------------------------------------------------\r\n");
+	printf("\r\n");
+
+    #ifdef VERIF
+    printf("Formatting data and expected outputs...\r\n");
+    #else
+    printf("Formatting data...\r\n");
+    #endif
+
+    // ==== Tensors slicing ====
+    //for(int z = 0; z < input_channels; z++)
+    //    for(int y = 0; y < height; y++)
+    //        for(int x = 0; x < width; x++)
+    //            input[x + width * (y + z * height)] = i[x + I_MAX * (y + z * I_MAX)];
+
+    for(int z = 0; z < output_channels ; z++)
+		for(int y = 0 ; y < (height - F + 1) ; y++)
+			for(int x = 0 ; x < (width - F + 1) ; x++)
+			{
+				o[x + (width - F + 1) * (y + z * (height - F + 1))] = 0;
+    //			#ifdef VERIF
+    //			golden_output[x + (width - F + 1) * (y + z * (height - F + 1))] = golden_o[x + (I_MAX - F + 1) * (y + z * (I_MAX - F + 1))];
+    //			#endif
 			}
-		}
-	}
-}
-/*
-
-// Author : Theo Dupuis
-// GR2M - 2022
-// Polytechnique Montreal
-// Modified : Elisabeth Humblet, 2024
-
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "ibsconv2d_tensor32.h"
-#include "runtime.h"
-#include "cache_metrics.h"
-
-//#ifndef SPIKE
-//#include "printf.h"
-//#endif
 
-#include "util.h"
-
-#define UART_BASE 0xFFF0C2C000
-
-#define UART_INTERRUPT_ENABLE UART_BASE + 1
-#define UART_LINE_CONTROL UART_BASE + 3
-#define UART_MODEM_CONTROL UART_BASE + 4
-#define UART_LINE_STATUS UART_BASE + 5
-#define UART_MODEM_STATUS UART_BASE + 6
-#define UART_DLAB_LSB UART_BASE + 0
-#define UART_DLAB_MSB UART_BASE + 1
+    NCHW_to_NHWC_8b(i, i_nhwc, 1, input_channels, height, width);
+    printf("                                              done\r\n");
 
-void write_reg_u8(uintptr_t addr, uint8_t value)
-{
-    volatile uint8_t *loc_addr = (volatile uint8_t *)addr;
-    *loc_addr = value;
+    // ==== INITIALIZATION DONE ====
+    ATOMIC_OP(init_done, 1, add, w);
 }
 
-void init_uart(uint32_t freq, uint32_t baud)
-{
-    uint32_t divisor = freq / (baud << 4);
-
-    write_reg_u8(UART_INTERRUPT_ENABLE, 0x00); // Disable all interrupts
-    write_reg_u8(UART_LINE_CONTROL, 0x80);     // Enable DLAB (set baud rate divisor)
-    write_reg_u8(UART_DLAB_LSB, divisor);         // divisor (lo byte)
-    write_reg_u8(UART_DLAB_MSB, (divisor >> 8) & 0xFF);  // divisor (hi byte)
-    write_reg_u8(UART_LINE_CONTROL, 0x03);     // 8 bits, no parity, one stop bit
-    write_reg_u8(UART_MODEM_CONTROL, 0x20);    // Autoflow mode
+void ulppack_conv2d_msparq(int16_t *o, int8_t *i, int8_t *f, int64_t H_in, int64_t W_in, int64_t C_in, int64_t F, int64_t C_out, int64_t precA, int64_t precW){
+    int8_t *i_;
+    int16_t *o_;
+    int8_t *f_;
+
+    for(int64_t c = 0; c < C_out; c++){
+        #ifdef OUTPUT_MULTICORE
+        if(get_hartid() == c){
+        #endif
+            o_ = o + c * (H_in - F + 1) *(W_in - F + 1);
+            i_ = i;
+            f_ = f + c * F * F * C_in;
+            #ifdef VMACSR
+                if (F == 7){
+                    if((precA <= 2 && precW < 2) || (precA < 2 && precW <= 2)){
+                        #ifdef INPUT_MULTICORE
+                            ulppack_conv2d_vec8_7x7_tiling(o_, i_, f_, H_in, W_in, C_in, F, C_out);
+                        #else 
+                            ulppack_conv2d_vec8_7x7(o_, i_, f_, H_in, W_in, C_in, F, C_out);
+                        #endif
+                    } else {
+                        ulppack_conv2d_vec16_7x7(o_, i_, f_, H_in, W_in, C_in, F, C_out);
+                    }
+                }
+            #else
+                if(F == 3){
+			    	ulppack_conv2d_vec_3x3(o_, i_, f_, H_in, W_in, C_in, F, C_out);
+			    }else 
+			    if (F == 7){
+			    	if (precA <= 1 && precW <= 1){
+			    		ulppack_conv2d_vec_7x7_A1W1(o_, i_, f_, H_in, W_in, C_in, F, C_out);
+			    	}else if (precA <= 2 && precW <= 2){
+			     		ulppack_conv2d_vec_7x7_A2W2(o_, i_, f_, H_in, W_in, C_in, F, C_out);
+			    	}else if (precA <= 3 && precW <= 3){
+			     		ulppack_conv2d_vec_7x7_A3W3(o_, i_, f_, H_in, W_in, C_in, F, C_out);
+			    	}
+			    }
+            #endif
+        #ifdef OUTPUT_MULTICORE
+        }
+        #endif
+    }
+    ATOMIC_OP(conv_done, 1, add, w);
 }
 
-
-
-
-
-#define NR_LANES 4
-
-#define PRECA	2
-#define PRECW	2
-
-#define F 			7		// Max size of the kernel F x F
-#define C_in 		32		// Number of input input_channels 
-#define C_out		4		// Number of output_channels (or output input_channels C_out)
-#define I_MAX 		64		// Max H_in x W_in input size
-#define I_START		64		// Start input size
-
-//int8_t i[I_MAX * I_MAX * C_in];
-
-int8_t f[F * F * C_in * C_out];
-
-int16_t o[I_MAX * I_MAX * C_out];
-
-int main() {
-
-init_uart(50000000, 115200);
-
-printf("====================\r\n");
-printf("= ULPPACK CONV2D16 =\r\n");
-printf("====================\r\n");
-printf("Multicore acceleration\r\n");
-//for(int hart=0;hart<4;hart++){
-
-	for(int precA = PRECA ; precA <= PRECA ; precA ++){
-	for(int precW = PRECW ; precW <= PRECW ; precW ++){
-			for(int size = I_START ; size <= I_MAX ; size *= 2){
-
-				printf("\r\n");
-				printf("----------------------------------------------------------------\r\n");
-				printf("Calculating convolution between \r\n");
-				printf("Input of [1 x %d x %d x %d] and Filters of [%d x %d x %d x %d]  \r\n", C_in, size,  size, C_out, C_in, F, F);
-				printf("Activation precision of %d and Weights precision of %d  \r\n", precA, precW);
-				printf("Result (16b) is an output of [1 x %d x %d x %d] \r\n", C_out, size - F + 1, size - F + 1);
-				printf("----------------------------------------------------------------\r\n");
-				printf("\r\n");
-
-				//printf("Core %d\r\n",hart);
-				printf("Computing results...\r\n");
-				for (int core=0;core<4;core++){
-					reset_L2_metrics(core);
-					init_L2_metrics(core);
-				}
-				start_timer();
-				
-				ulppack_conv2d(o, o, f, size, size, C_in, F, C_out, precA, precW);
-			
-				stop_timer();
-				for (int core=0;core<4;core++){
-					stop_L2_metrics(core);
-				}
-				printf("                                                            done\r\n");                                
-				
-				/////////////
-				// METRICS //
-				/////////////
-
-				int64_t runtime = get_timer();
-				
-				float performance = (2.0 * C_out * C_in * F * F * (size - F + 1) * (size - F + 1) )/ runtime;
-				float utilization = 100 * performance / (4 * 2 * NR_LANES); 
-
-				//	printf("Passed.\r\n");		
-				if (get_hartid()==0){			 	
-					printf("The execution took %d cycles.\r\n", runtime);
-				//  	 printf("The performance is %f OP/cycle, the utilization is %f  \r\n", performance, utilization);
-				}
-				for (int core=0;core<4;core++){
-					if(get_hartid()==core){
-					print_L2_metrics(core);
-				}
-
-			}
-	}}
-//}
-}
-*/
\ No newline at end of file
+int main(int argc, char** argv){
+    init_uart(50000000, 115200);
+
+    // ===== INIT DEFINES =====
+    int64_t precA = PRECA_MAX;
+    int64_t precW = PRECW_MAX;
+
+    int64_t F = F_MAX;
+    int64_t input_channels = C_IN;
+    int64_t output_channels = C_OUT;
+    int8_t filter[output_channels * F * F * input_channels];
+
+    int size = I_START;
+    int64_t width = size;
+    int64_t height = size;
+
+    int8_t input          [width * height * input_channels];
+    int8_t i_nhwc         [width * height * input_channels];
+    int16_t output        [width * height * output_channels];
+    int16_t golden_output [(width - F + 1) * (width - F + 1) * output_channels];
+    int16_t golden_o[(I_MAX - F + 1) * (I_MAX - F + 1) * C_OUT];
+
+
+    // ===== INITIALIZATION =====
+
+    if (argv[0][0] == 0){
+        initialization(precA, precW, F, input_channels, output_channels, filter, size, width, height, input, i_nhwc, output, golden_output, golden_o);
+    }
+    while(init_done == 0);
+
+    printf("Computing results...\r\n");
+
+    // ===== COMPUTING RESULTS =====
+
+    // --- Reset L2 metrics ---
+    reset_L2_metrics(argv[0][0]);
+    init_L2_metrics(argv[0][0]);
+
+    // --- Timer and convolution ---
+    start_timer();
+    ulppack_conv2d_msparq(o, i, f, height, width, input_channels, F, output_channels, precA, precW);
+    stop_timer();
+
+    // --- Wait for end of convolution ---
+    while(conv_done == 0);
+
+    // --- Stop L2 metrics
+    stop_L2_metrics(argv[0][0]);
+    printf("                                              done\r\n");
+
+    if(argv[0][0] == 0){
+        // ===== VERIFICATION =====
+        int16_t golden_output_nhwc[(I_MAX - F + 1) * (I_MAX - F + 1) * C_OUT];
+        NCHW_to_NHWC_16b(golden_o, golden_output_nhwc, 1, C_OUT, I_MAX - F + 1, I_MAX - F + 1);
+
+        #ifdef VERIF
+            printf("Verifying results...\r\n");
+            int error = verify_tensor(o, golden_o, (height - F + 1), (width - F + 1), output_channels);
+            if (error == 0)
+                printf("                                              done\r\n");
+            else 
+                printf("    ERROR\r\n");
+        #else
+            int error = 0;
+        #endif
+
+        // ===== METRICS =====
+        int64_t runtime = get_timer();
+
+        if(error != 0){
+            printf("Fail.\r\n");
+            printf("Output NHWC\r\n");
+            print_tensor_16_(o, (height - F + 1), (width - F + 1), output_channels);
+            printf("=========================================\r\n");
+            printf("Expected output\r\n");
+            print_tensor_16_(golden_o, (height - F + 1), (width - F + 1), output_channels);
+        } else {
+            printf("Passed.\r\n");
+            printf("The execution took %d cycles.\r\n", runtime);
+            print_L2_metrics(argv[0][0]);
+        }
+    }
+}
\ No newline at end of file