diff --git a/apps/bitpack/main.c b/apps/bitpack/main.c
deleted file mode 100644
index bbefde54b..000000000
--- a/apps/bitpack/main.c
+++ /dev/null
@@ -1,115 +0,0 @@
-#include <stdio.h>
-#include <stdint.h>
-#include <time.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <riscv_vector.h>
-
-#include "util.h"
-#include "printf.h"
-#include "runtime.h"
-
-#define RISC_V_ARA 
-
-void init_matrix(uint64_t* matrix, int num_rows, int num_columns, uint64_t MAX_VAL) {
-    for (int i = 0; i < num_rows; ++i) {
-        for (int j = 0; j < num_columns; ++j) {
-            matrix[i * num_columns + j] = (rand() % (MAX_VAL - 0 + 1)) + 0;
-        }
-    }
-}
-
-void transpose_matrix(uint64_t* matrix, int num_rows, int num_columns){
-    uint64_t temp;
-    for (int i = 0; i < num_rows; ++i) {
-        for (int j = i; j < num_columns; ++j) {
-            temp = matrix[i * num_columns + j];
-            matrix[i * num_columns + j] = matrix[j * num_columns + i];
-            matrix[j * num_columns + i] = temp;
-        }
-    }
-}
-
-// Naive implementation of bit packing
-void bitpack_naive(uint64_t* matrix, uint64_t* packed_data, int DATA_WIDTH, int dlen, int bitprec){
-    // Packed data pointer
-    uint64_t p_ptr = 0;
-    for (int i=0; i<dlen; i+=DATA_WIDTH){
-        for (int el=0; el<DATA_WIDTH; el++){
-            for (int bit_pos=0; bit_pos<bitprec; bit_pos++){
-                int bit_idx = p_ptr+bit_pos;
-                uint64_t data = (matrix[i+el] >> bit_pos) & 0x1;
-                packed_data[bit_idx] <<= 1;
-                packed_data[bit_idx] = (packed_data[bit_idx] | data);
-            }
-        }
-        p_ptr += bitprec;
-    }
-}
-
-void bitpack_init() {
-  asm volatile("vmv.v.i v0,  0");
-  asm volatile("vmv.v.i v1,  0");
-  asm volatile("vmv.v.i v2,  0");
-  asm volatile("vmv.v.i v3,  0");
-  asm volatile("vmv.v.i v4,  0");
-}
-
-// Vectorized implementation of bit packing
-void bitpack_vectorized(uint64_t* matrix, uint64_t* packed_data, int DATA_WIDTH, int dlen, int bitprec){
-    // Initialize vector RF
-    bitpack_init();
-    size_t vl=0;
-    uint64_t *p_ptr = matrix;
-    // Load shift values into v0
-    asm volatile("vle64.v v0, (%[A])" : : [A] "r" (vshift));
-    // Load mask values into v1
-    asm volatile("vle64.v v1, (%[A])" : : [A] "r" (vmask));
-    for (size_t c_n_count=dlen; c_n_count; c_n_count -= vl){
-        vl = vsetvl_e64m1(c_n_count);
-        asm volatile("vle64.v v2, (%[A])" : : [A] "r" (matrix));
-        asm volatile("vand.vv v3, v2, v1" ::);
-        for (int bit_pos=0; bit_pos<bitprec; bit_pos++){
-
-            int bit_idx = p_ptr+bit_pos;
-            uint64_t data = (matrix[i+el] >> bit_pos) & 0x1;
-            packed_data[bit_idx] <<= 1;
-            packed_data[bit_idx] = (packed_data[bit_idx] | data);
-        }
-        p_ptr += bitprec;
-    }
-}
-
-int main(){
-    printf("bitpack init!\n");
-    const uint64_t BITPREC=3;
-    const uint64_t MAT_SIZE_W = 8;
-    const uint64_t MAT_SIZE_H = 8;
-    const uint64_t DATA_WIDTH = 8;
-    assert((MAT_SIZE_W%DATA_WIDTH)==0 && "Matrix size must be multiple of data type");
-    const uint64_t PACKED_MAT_SIZE = (MAT_SIZE_H*MAT_SIZE_W)/(DATA_WIDTH) * BITPREC;
-    uint64_t tensor[MAT_SIZE_H*MAT_SIZE_W];
-    uint64_t packed_data[PACKED_MAT_SIZE];
-    uint64_t max_val = (1<<BITPREC) - 1;
-    srand(0);
-
-    printf("\n");
-    printf("------------------------------------------------------------\n");
-    printf("Bitpacking of a tensor [%dx%dx%dx%d] with %d precision\n", 1, 1, MAT_SIZE_H, MAT_SIZE_W, BITPREC);
-    printf("------------------------------------------------------------\n");
-    printf("\n");
-    init_matrix(tensor, MAT_SIZE_H, MAT_SIZE_W, max_val);
-    init_matrix(packed_data, 1, PACKED_MAT_SIZE, 0);
-    // // transpose_matrix(tensor, MAT_SIZE_W, MAT_SIZE_H);
-
-    // print_matrix(tensor, MAT_SIZE_H, MAT_SIZE_W, PRINT_INT);
-    start_timer();
-    bitpack_naive(tensor, packed_data, DATA_WIDTH, MAT_SIZE_H*MAT_SIZE_W, BITPREC);
-    stop_timer();
-    // Metrics
-    int64_t runtime = get_timer();
-    printf("bitpack_naive took %d cycles.\n", runtime);
-
-    // print_matrix(packed_data, 1, PACKED_MAT_SIZE, PRINT_BIN);
-    return 0;
-}
diff --git a/apps/bitpack/util.c b/apps/bitpack/util.c
deleted file mode 100644
index 057623b31..000000000
--- a/apps/bitpack/util.c
+++ /dev/null
@@ -1,39 +0,0 @@
-#include "util.h"
-#include "printf.h"
-
-// Assumes little endian
-void __printb(void *value, size_t size)
-{
-        unsigned char uc;
-        unsigned char bits[CHAR_BIT + 1];
-
-        bits[CHAR_BIT] = '\0';
-        for_endian(size) {
-                uc = ((unsigned char *) value)[i];
-                memset(bits, '0', CHAR_BIT);
-                for (int j = 0; uc && j < CHAR_BIT; ++j) {
-                        if (uc & MSB_MASK)
-                                bits[j] = '1';
-                        uc <<= 1;
-                }
-                printf("%s", bits);
-        }
-        printf("\n");
-}
-
-void print_matrix(uint64_t *mat, int num_rows, int num_columns, int bin_print_format)
-{
-    for (int i=0; i<num_rows; i++)
-    {
-        for(int j=0; j<num_columns; j++)
-        {
-            if (bin_print_format==PRINT_BIN) {
-                printb(mat[i * num_columns + j]);
-            }else{
-                printf("%llu ", mat[i * num_columns + j]);
-            }
-        }
-        printf("\n");
-    }
-    printf("\n");
-}
diff --git a/apps/bitpack/util.h b/apps/bitpack/util.h
deleted file mode 100644
index 8ef12d0e1..000000000
--- a/apps/bitpack/util.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-#include <limits.h>
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define for_endian(size) for (int i = 0; i < size; ++i)
-#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define for_endian(size) for (int i = size - 1; i >= 0; --i)
-#else
-#error "Endianness not detected"
-#endif
-
-#define printb(value)                                   \
-({                                                      \
-        typeof(value) _v = value;                       \
-        __printb((typeof(_v) *) &_v, sizeof(_v));       \
-})
-
-#define MSB_MASK 1 << (CHAR_BIT - 1)
-
-#define PRINT_INT 0
-#define PRINT_BIN 1
-
-
-void  __printb(void *value, size_t size);
-void print_matrix(uint64_t *mat, int num_rows, int num_columns, int bin_print_format);
-const uint64_t vshift[64] = {  0,  1,  2,  3,  4,  5,  6,  7, 
-                               8,  9, 10, 11, 12, 13, 14, 15,
-                              16, 17, 18, 19, 20, 21, 22, 23,
-                              24, 25, 26, 27, 28, 29, 30, 31,
-                              32, 33, 34, 35, 36, 37, 38, 39,
-                              40, 41, 42, 43, 44, 45, 46, 47,
-                              48, 49, 50, 51, 52, 53, 54, 55,
-                              56, 57, 58, 59, 60, 61, 62, 63};
-const uint64_t vmask[64] = { 1, 1, 1, 1, 1, 1, 1, 1,
-                             1, 1, 1, 1, 1, 1, 1, 1,
-                             1, 1, 1, 1, 1, 1, 1, 1,
-                             1, 1, 1, 1, 1, 1, 1, 1,
-                             1, 1, 1, 1, 1, 1, 1, 1,
-                             1, 1, 1, 1, 1, 1, 1, 1,
-                             1, 1, 1, 1, 1, 1, 1, 1,
-                             1, 1, 1, 1, 1, 1, 1, 1};
diff --git a/apps/bitserial_conv/kernel/.vscode/c_cpp_properties.json b/apps/bitserial_conv/kernel/.vscode/c_cpp_properties.json
deleted file mode 100644
index df8182dfe..000000000
--- a/apps/bitserial_conv/kernel/.vscode/c_cpp_properties.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "configurations": [
-    {
-      "name": "macos-gcc-arm64",
-      "includePath": [
-        "${workspaceFolder}/**"
-      ],
-      "compilerPath": "/usr/bin/clang",
-      "cStandard": "${default}",
-      "cppStandard": "${default}",
-      "intelliSenseMode": "macos-gcc-arm64",
-      "compilerArgs": []
-    }
-  ],
-  "version": 4
-}
\ No newline at end of file
diff --git a/apps/bitserial_conv/kernel/.vscode/launch.json b/apps/bitserial_conv/kernel/.vscode/launch.json
deleted file mode 100644
index b65adf0e5..000000000
--- a/apps/bitserial_conv/kernel/.vscode/launch.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
-  "version": "0.2.0",
-  "configurations": [
-    {
-      "name": "C/C++ Runner: Debug Session",
-      "type": "lldb",
-      "request": "launch",
-      "args": [
-        ""
-      ],
-      "stopAtEntry": true,
-      "cwd": "/Users/hossein/MyRepos/ara/apps/bitserial_conv/kernel",
-      "environment": [],
-      "program": "/Users/hossein/MyRepos/ara/apps/bitserial_conv/kernel/build/Debug/outDebug",
-      "internalConsoleOptions": "openOnSessionStart",
-      "MIMode": "gdb",
-      "externalConsole": false
-    }
-  ]
-}
\ No newline at end of file
diff --git a/apps/bitserial_conv/kernel/.vscode/settings.json b/apps/bitserial_conv/kernel/.vscode/settings.json
deleted file mode 100644
index ce53bf0a9..000000000
--- a/apps/bitserial_conv/kernel/.vscode/settings.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
-  "C_Cpp_Runner.cCompilerPath": "/usr/bin/clang",
-  "C_Cpp_Runner.cppCompilerPath": "/usr/bin/clang++",
-  "C_Cpp_Runner.debuggerPath": "/usr/bin/lldb",
-  "C_Cpp_Runner.cStandard": "",
-  "C_Cpp_Runner.cppStandard": "",
-  "C_Cpp_Runner.msvcBatchPath": "",
-  "C_Cpp_Runner.warnings": [
-    "-Wall",
-    "-Wextra",
-    "-Wpedantic"
-  ],
-  "C_Cpp_Runner.enableWarnings": true,
-  "C_Cpp_Runner.warningsAsError": false,
-  "C_Cpp_Runner.compilerArgs": [],
-  "C_Cpp_Runner.linkerArgs": [],
-  "C_Cpp_Runner.includePaths": [],
-  "C_Cpp_Runner.includeSearch": [
-    "*",
-    "**/*"
-  ],
-  "C_Cpp_Runner.excludeSearch": [
-    "**/build",
-    "**/build/**",
-    "**/.*",
-    "**/.*/**",
-    "**/.vscode",
-    "**/.vscode/**"
-  ]
-}
\ No newline at end of file
diff --git a/apps/bitserial_conv/kernel/conv2d_bitserial.c b/apps/bitserial_conv/kernel/conv2d_bitserial.c
deleted file mode 100644
index 235b63f9b..000000000
--- a/apps/bitserial_conv/kernel/conv2d_bitserial.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal.
-// Solderpad Hardware License, Version 0.51, see LICENSE for details.
-// SPDX-License-Identifier: SHL-0.51
-//
-// Author: MohammadHossein AskariHemmat <m.h.askari.hemmat@gmail.com>
-
-#include "conv2d_bitserial.h"
-
-int im2col_get_pixel(int *im, int height, int width, int channels,
-                        int row, int col, int channel, int pad)
-{
-    row -= pad;
-    col -= pad;
-
-    if (row < 0 || col < 0 ||
-        row >= height || col >= width) return 0;
-    return im[col + width*(row + height*channel)];
-}
-
-//From Berkeley Vision's Caffe!
-//https://github.com/BVLC/caffe/blob/master/LICENSE
-void im2col_cpu(int* data_im,
-     int channels,  int height,  int width,
-     int ksize,  int stride, int pad, int* data_col) 
-{
-    int c,h,w;
-    int height_col = (height + 2*pad - ksize) / stride + 1;
-    int width_col = (width + 2*pad - ksize) / stride + 1;
-
-    int channels_col = channels * ksize * ksize;
-    for (c = 0; c < channels_col; ++c) {
-        int w_offset = c % ksize;
-        int h_offset = (c / ksize) % ksize;
-        int c_im = c / ksize / ksize;
-        for (h = 0; h < height_col; ++h) {
-            for (w = 0; w < width_col; ++w) {
-                int im_row = h_offset + h * stride;
-                int im_col = w_offset + w * stride;
-                int col_index = (c * height_col + h) * width_col + w;
-                data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
-                        im_row, im_col, c_im, pad);
-            }
-        }
-    }
-}
-
-void fill_cpu(int N, float ALPHA, float *X, int INCX)
-{
-    int i;
-    for(i = 0; i < N; ++i) X[i*INCX] = ALPHA;
-}
-
-void gemm(int M, int N, int K, float ALPHA, 
-        float *A, int lda, 
-        float *B, int ldb,
-        float *C, int ldc)
-{
-    int i,j,k;
-    for(i = 0; i < M; ++i){
-        for(k = 0; k < K; ++k){
-            register float A_PART = ALPHA*A[i*lda+k];
-            for(j = 0; j < N; ++j){
-                C[i*ldc+j] += A_PART*B[k*ldb+j];
-            }
-        }
-    }
-}
-
-
-void conv2d(convolutional_layer l, network net)
-{
-    int i, j;
-
-    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
-
-    int m = l.n/l.groups;
-    int k = l.size*l.size*l.c/l.groups;
-    int n = l.out_w*l.out_h;
-    for(i = 0; i < l.batch; ++i){
-        for(j = 0; j < l.groups; ++j){
-            float *a = l.weights + j*l.nweights/l.groups;
-            float *b = net.workspace;
-            float *c = l.output + (i*l.groups + j)*n*m;
-            float *im =  net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w;
-
-            if (l.size == 1) {
-                b = im;
-            } else {
-                im2col_cpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
-            }
-            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
-        }
-    }
-}
diff --git a/apps/bitserial_conv/kernel/conv2d_bitserial.h b/apps/bitserial_conv/kernel/conv2d_bitserial.h
deleted file mode 100644
index 42d934a17..000000000
--- a/apps/bitserial_conv/kernel/conv2d_bitserial.h
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal.
-// Solderpad Hardware License, Version 0.51, see LICENSE for details.
-// SPDX-License-Identifier: SHL-0.51
-//
-// Author: MohammadHossein AskariHemmat <m.h.askari.hemmat@gmail.com>
-
-#ifndef BITSERIAL_MATMUL_H
-#define BITSERIAL_MATMUL_H
-
-#include <stdint.h>
-
-void bitserial_matmul_init();
-void bitserial_matmul_64(uint64_t* c, uint64_t* a, uint64_t* b, int aprec, int bprec);
-int im2col_get_pixel(int *im, int height, int width, int channels, int row, int col, int channel, int pad);
-void im2col_cpu(int* data_im, int channels,  int height,  int width, int ksize,  int stride, int pad, int* data_col);
-void fill_cpu(int N, float ALPHA, float *X, int INCX);
-void gemm(int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc);
-void conv2d(convolutional_layer l, network net);
-
-#endif
-
-
-#ifndef CONVOLUTIONAL_LAYER_H
-#define CONVOLUTIONAL_LAYER_H
-
-#include "layer.h"
-
-typedef layer convolutional_layer;
-convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, int batch_normalize, int binary, int xnor, int adam);
-
-#endif
diff --git a/apps/bitserial_conv/kernel/conv2d_bitserial.py b/apps/bitserial_conv/kernel/conv2d_bitserial.py
deleted file mode 100644
index a6641a6a0..000000000
--- a/apps/bitserial_conv/kernel/conv2d_bitserial.py
+++ /dev/null
@@ -1,310 +0,0 @@
-import math
-import numpy as np
-import torch
-import torch.nn as nn
-from texttable import Texttable
-
-class SimpleConv(nn.Module):
-    def __init__(self, in_ch, out_ch, kernel_size, stride, padding, groups, dilation, weights):
-        super(SimpleConv, self).__init__()
-        self.conv1 = nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, stride=stride,
-                     padding=padding, groups=groups, bias=False, dilation=dilation)
-        self.conv1.weight.data = torch.from_numpy(weights)
-    def forward(self, x):
-        out = self.conv1(x)
-        return out
-
-def check_result(output, golden):
-    if (output.shape != golden.shape): 
-        print("Output size does not match golden output")
-        return False
-    else:
-        cnt = 0
-        for val,gold in zip(output.flatten(), golden.flatten()):
-            if val!=gold:
-                print("output[{}]:{} does not macth golden[{}]:{}".format(cnt, val, cnt, gold))
-                return False
-            cnt += 1
-    return True
-
-def bitpack(matrix, DATA_WIDTH, dlen, prec):
-    if DATA_WIDTH==64:
-        packed_data = np.zeros(math.ceil(len(matrix)/DATA_WIDTH)*prec).astype(np.int64)
-    elif DATA_WIDTH==32:
-        packed_data = np.zeros(math.ceil(len(matrix)/DATA_WIDTH)*prec).astype(np.int32)
-    elif DATA_WIDTH==16:
-        packed_data = np.zeros(math.ceil(len(matrix)/DATA_WIDTH)*prec).astype(np.int16)
-    elif DATA_WIDTH==8:
-        packed_data = np.zeros(math.ceil(len(matrix)/DATA_WIDTH)*prec).astype(np.int8)
-    elif DATA_WIDTH==4:
-        packed_data = np.zeros(math.ceil(len(matrix)/DATA_WIDTH)*prec).astype(np.int8)
-    else:
-        print("Unsupported element length: {} bits".format(DATA_WIDTH))
-        return None
-    p_ptr = 0
-    # make sure input is integer
-    matrix = [int(val) for val in matrix]
-    for i in range(0,dlen, DATA_WIDTH):
-        for el in range(0, DATA_WIDTH):
-            for bit_pos in range(0, prec):
-                bit_idx = p_ptr+bit_pos
-                if i+el >=dlen:
-                    break
-                data = (matrix[i+el] >> bit_pos) & 0x1
-                packed_data[bit_idx] <<= 1
-                packed_data[bit_idx] = (packed_data[bit_idx] | data)
-        p_ptr += prec
-
-    if DATA_WIDTH==64:
-        return packed_data.astype(np.uint64)
-    elif DATA_WIDTH==32:
-        return packed_data.astype(np.uint32)
-    elif DATA_WIDTH==16:
-        return packed_data.astype(np.uint16)
-    elif DATA_WIDTH==8:
-        return packed_data.astype(np.uint8)
-    elif DATA_WIDTH==4:
-        return packed_data.astype(np.uint8)
-
-def im2col_get_pixel(im,  height,  width, row, col, channel, pad):
-    global count
-    row -= pad
-    col -= pad
-    if (row < 0 or col < 0 or row >= height or col >= width):
-         return 0
-    return im[col + width*(row + height*channel)]
-
-def im2col(data_im, channels, height, width, ksize, stride, pad):
-    c,h,w = 0,0,0
-    height_col = int((height + 2*pad - ksize) / stride + 1)
-    width_col = int((width + 2*pad - ksize) / stride + 1)
-    channels_col = int(channels * ksize * ksize)
-    output = np.zeros((1,channels_col*height_col*width_col))
-    for c in  range(channels_col):
-        w_offset = int(c % ksize)
-        h_offset = int((c / ksize) % ksize)
-        c_im = int(c / ksize / ksize)
-        for h in range(height_col):
-            for w in range(width_col):
-                im_row = h_offset + h * stride
-                im_col = w_offset + w * stride
-                col_index = (c * height_col + h) * width_col + w
-                output[0,col_index] = im2col_get_pixel(data_im, height, width, im_row, im_col, c_im, pad)
-    return output
-
-def popcnt(a):
-    return bin(a).count("1")
-
-def bitserial_gemm(inputs, weights, packed_row_len, wprec, aprec, oshape, wshape, DATA_WIDTH):
-    '''
-        Example: 
-            - activation:[16x9]
-            - weight: [1x9]
-            - DATA_WIDTH: 4
-            - aprec: 2
-            - wprec: 2
-        Weight: 
-            [0. 0. 2. 1. 2. 3. 3. 2. 0.]
-        Activation:
-            [0. 0. 0. 0. 0. 3. 0. 3. 3.]
-            [0. 0. 0. 0. 3. 1. 3. 3. 3.]
-            [0. 0. 0. 3. 1. 0. 3. 3. 3.]
-            [0. 0. 0. 1. 0. 0. 3. 3. 0.]
-            [0. 0. 3. 0. 3. 3. 0. 1. 3.]
-            [0. 3. 1. 3. 3. 3. 1. 3. 1.]
-            [3. 1. 0. 3. 3. 3. 3. 1. 2.]
-            [1. 0. 0. 3. 3. 0. 1. 2. 0.]
-            [0. 3. 3. 0. 1. 3. 0. 0. 3.]
-            [3. 3. 3. 1. 3. 1. 0. 3. 2.]
-            [3. 3. 3. 3. 1. 2. 3. 2. 0.]
-            [3. 3. 0. 1. 2. 0. 2. 0. 0.]
-            [0. 1. 3. 0. 0. 3. 0. 0. 0.]
-            [1. 3. 1. 0. 3. 2. 0. 0. 0.]
-            [3. 1. 2. 3. 2. 0. 0. 0. 0.]
-            [1. 2. 0. 2. 0. 0. 0. 0. 0.]
-        - First output element:
-            - Using GEMM:
-                [0. 0. 2. 1. 2. 3. 3. 2. 0.]
-                [0. 0. 0. 0. 3. 1. 3. 3. 3.]
-                = 6+3+9+6
-            - Using bit serial:
-                +---+---------------+-----------------+------------------+
-                |idx|    INPUT      |   WEIGHT        | Partial Result   |
-                +---+---------------+-----------------+------------------+
-                | 0 | [   0]: 0000  |   [   0]: 0001  | 0                |
-                | 1 | [   1]: 0000  |   [   1]: 0010  |                  |
-                +---+---------------+-----------------+------------------+
-                | 2 | [   2]: 0101  |   [   2]: 0110  | 1+4+2+8 = 15     |
-                | 3 | [   3]: 0101  |   [   3]: 1111  |                  |
-                +---+---------------+-----------------+------------------+
-                | 4 | [   4]: 1000  |   [   4]: 0000  | 0                |
-                | 5 | [   5]: 1000  |   [   5]: 0000  |                  |
-                +---+---------------+-----------------+------------------+
-                | 6 | [   6]: 0111  |   [   0]: 0001  | 5                |
-                | 7 | [   7]: 0101  |   [   1]: 0010  |                  |
-                +---+---------------+-----------------+------------------+
-                | 8 | [   8]: 1100  |   [   2]: 0110  | 1+4+2+8=15       |
-                | 9 | [   9]: 1100  |   [   3]: 1111  |                  |
-                +---+---------------+-----------------+------------------+
-                |10 | [  10]: 0110  |   [   4]: 0000  | 0                |
-                |11 | [  11]: 0100  |   [   5]: 0000  |                  |
-                +---+---------------+-----------------+------------------+
-    '''
-    
-    out_ch,ow,oh = oshape
-    _, in_ch, ksize, ksize = wshape
-    output = np.zeros(out_ch*ow*oh)
-    # import ipdb as pdb; pdb.set_trace()   
-    optr = 0
-    weights = weights.reshape(out_ch, wprec*math.ceil(in_ch*ksize*ksize/DATA_WIDTH))
-    num_mul = 0
-    num_add = 0
-    num_popcnt = 0
-    num_shifts = 0
-    num_mem_ops = 0
-    for oc in range(out_ch):
-        weight_oc = weights[oc]
-        for i in range(0, len(inputs), aprec*packed_row_len):
-            # for j in range(0, len(weights), wprec*packed_row_len):
-                o_acc = 0
-                for k in range(0, packed_row_len):
-                    for ap in range(0, aprec):
-                        for wp in range(0, wprec):
-                            # import ipdb as pdb; pdb.set_trace()
-                            aidx = ap+k*aprec + i
-                            widx = wp+k*wprec
-                            # print("i:{}, oc:{}, k:{}, ap:{}, wp:{},  a[{}] w[{}]".format(i,oc,k,ap,wp,aidx,widx))
-                            # print("{:04b}".format(inputs[i*k+ap]))
-                            # print("{:04b}".format(weights[j*k+wp]))
-                            o_acc += popcnt((inputs[aidx]&weight_oc[widx])) << (ap+wp)
-                            num_add += 2
-                            num_popcnt += 1
-                            num_shifts += 1
-                            num_mem_ops += 2
-                # print("[{:4d}]: {:4d}".format(optr, o_acc))
-                # import ipdb as pdb; pdb.set_trace()
-                # print("{}\n".format(o_acc))
-                output[optr] = o_acc
-                optr += 1
-        # import ipdb as pdb; pdb.set_trace()
-        # pass
-    return output, [num_mul,num_add,num_popcnt,num_shifts,num_mem_ops]
-
-def print_packed_data(packed_data):
-    for id, val in enumerate(packed_data):
-        if DATA_WIDTH==64:
-            pos_val = val if val>0 else (np.uint64(val)&np.uint64(0xffffffffffffffff))
-            print('[{0:4d}]: {1:064b}'.format(id, pos_val))
-        elif DATA_WIDTH==32:
-            pos_val = val if val>0 else (np.uint32(val)&np.uint32(0xffffffff))
-            print('[{0:4d}]: {1:032b}'.format(id, pos_val))
-        elif DATA_WIDTH==16:
-            pos_val = val if val>0 else (np.uint16(val)&np.uint16(0xffff))
-            print('[{0:4d}]: {1:016b}'.format(id, pos_val))
-        elif DATA_WIDTH==8:
-            pos_val = val if val>0 else (np.uint8(val)&np.uint8(0xff))
-            print('[{0:4d}]: {1:08b}'.format(id, pos_val))
-        elif DATA_WIDTH==4:
-            pos_val = val if val>0 else (np.uint8(val)&np.uint8(0xff))
-            print('[{0:4d}]: {1:04b}'.format(id, pos_val))
-
-if __name__ == '__main__':
-    np.random.seed(seed=0)
-    batch = 1
-    iw = 28
-    ih = 28
-    in_ch = 32
-    out_ch = 64
-    ksize  = 3
-    stride = 1
-    padding = 1
-    groups = 1
-    dilation = 1
-    prec = 4
-    DATA_WIDTH = 8
-    VLEN = 1
-    aprec = prec
-    wprec = prec
-
-    # Initialize input and weight
-    max_int = (2**prec) - 1 
-    inputs = np.random.randint(max_int+1, size=(batch, in_ch, iw, ih)).astype(np.float32)
-    weights = np.random.randint(max_int+1, size=(out_ch, in_ch, ksize, ksize)).astype(np.float32)
-    torch_conv = SimpleConv(in_ch, out_ch, ksize, stride, padding, groups, dilation, weights)
-
-    golden_output = torch_conv(torch.from_numpy(inputs))
-    #================================================================
-    # Computing Convolution with im2col method with normal GEMM:
-    #================================================================
-    inputs_flatten = inputs.flatten()
-    inputs_im2col = im2col(inputs_flatten, in_ch, ih, iw, ksize, stride, padding)
-    inputs_im2col = inputs_im2col.reshape(in_ch*ksize*ksize,iw*ih)
-    weights_mat = weights.reshape(out_ch,in_ch*ksize*ksize)
-    output_im2col = np.dot(weights_mat, inputs_im2col)
-    num_mul = np.prod(output_im2col.shape) * weights_mat.shape[1]
-    num_add = np.prod(output_im2col.shape) * weights_mat.shape[1]
-    im2col_stats = [num_mul, num_add, 0, 0, "?"]
-    
-    ow = int((iw + 2*padding - ksize) / stride + 1)
-    oh = int((ih + 2*padding - ksize) / stride + 1)
-    output = output_im2col.reshape(1, out_ch, ow, oh)
-    if check_result(output, golden_output):
-        print("im2col conv matches pytorch conv")
-    else:
-        print("im2col conv does not match pytorch conv")
-
-    #================================================================
-    # Computing Convolution with im2col method with bitserial GEMM:
-    #================================================================
-    # import ipdb as pdb; pdb.set_trace()
-    # inputs_im2col = inputs_im2col.reshape(ksize*ksize,16).transpose()
-    inputs_im2col = inputs_im2col.reshape(in_ch*ksize*ksize,iw*ih).transpose()
-    if (in_ch*ksize*ksize)%DATA_WIDTH != 0:
-        i_org_shape = inputs_im2col.shape
-        inputs_im2col_fixed = np.zeros((i_org_shape[0], i_org_shape[1]+DATA_WIDTH-((in_ch*ksize*ksize)%DATA_WIDTH)))
-        for id, row in enumerate(inputs_im2col):
-            inputs_im2col_fixed[id][0:i_org_shape[1]] = row
-        w_org_shape = weights_mat.shape
-        weights_mat_fixed = np.zeros((w_org_shape[0], w_org_shape[1]+DATA_WIDTH-((in_ch*ksize*ksize)%DATA_WIDTH)))
-        for id, row in enumerate(weights_mat):
-            weights_mat_fixed[id][0:w_org_shape[1]] = row
-        inputs_im2col = inputs_im2col_fixed
-        weights_mat = weights_mat_fixed
-    # import ipdb as pdb; pdb.set_trace()
-    bit_packed_inputs = bitpack(inputs_im2col.flatten(), DATA_WIDTH, len(inputs_im2col.flatten()), aprec)
-    bit_packed_weights = bitpack(weights_mat.flatten(), DATA_WIDTH, len(weights_mat.flatten()), wprec)
-
-    # import ipdb as pdb; pdb.set_trace()
-    packed_row_len = int(inputs_im2col.shape[1]/DATA_WIDTH)
-
-    # print_packed_data(bit_packed_inputs)
-    # print_packed_data(bit_packed_weights)
-    # print("\n")
-    # # import ipdb as pdb; pdb.set_trace()
-    # print(inputs_im2col)
-    # print(weights_mat)
-    # print(golden_output)
-    # import ipdb as pdb; pdb.set_trace()
-    output_bitserial, bit_serial_stats = bitserial_gemm(bit_packed_inputs, bit_packed_weights, packed_row_len, wprec, aprec, [out_ch,ow,oh], [out_ch, in_ch, ksize, ksize], DATA_WIDTH)
-    output = output_bitserial.reshape(1, out_ch, ow, oh)
-    t = Texttable(max_width=160)
-    t.add_row(['Type', 'Num Mul', 'Num Add', 'Num Popcnt', 'Num Shifts', 'Num MemOps'])
-    t.add_row(["Bit-Serial", *bit_serial_stats])
-    t.add_row(["Im2Col GEMM", *im2col_stats])
-
-    if check_result(output, golden_output):
-        print("bitserial conv matches pytorch conv")
-    else:
-        print("bitserial conv does not match pytorch conv")
-    #================================================================
-    # Printing Results:
-    #================================================================
-    print("Computation cost for:")
-    print("\t Input Shape: {}x{}x{}".format(in_ch, iw, ih))
-    print("\t Weight Shape: {}x{}x{}x{}".format(out_ch, in_ch, ksize, ksize))
-    print("\t Data Width: {}".format(DATA_WIDTH))
-    print("\t Vector Length: {}".format(VLEN))
-    print("\t aprec: {}".format(aprec))
-    print("\t wprec: {}".format(wprec))
-    print(t.draw())
-
diff --git a/apps/bitserial_conv/kernel/darknet.h b/apps/bitserial_conv/kernel/darknet.h
deleted file mode 100644
index af82eaf00..000000000
--- a/apps/bitserial_conv/kernel/darknet.h
+++ /dev/null
@@ -1,271 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-
-typedef struct{
-    int *leaf;
-    int n;
-    int *parent;
-    int *child;
-    int *group;
-    char **name;
-
-    int groups;
-    int *group_size;
-    int *group_offset;
-} tree;
-tree *read_tree(char *filename);
-
-struct layer;
-typedef struct layer layer;
-
-typedef enum {
-    CONVOLUTIONAL,
-} LAYER_TYPE;
-
-struct layer{
-    LAYER_TYPE type;
-    int batch;
-    int forced;
-    int flipped;
-    int inputs;
-    int outputs;
-    int nweights;
-    int nbiases;
-    int extra;
-    int truths;
-    int h,w,c;
-    int out_h, out_w, out_c;
-    int n;
-    int max_boxes;
-    int groups;
-    int size;
-    int side;
-    int stride;
-    int reverse;
-    int flatten;
-    int spatial;
-    int pad;
-    int sqrt;
-    int flip;
-    int index;
-    int steps;
-    int hidden;
-    int truth;
-    float smooth;
-    float dot;
-    float angle;
-    float jitter;
-    float saturation;
-    float exposure;
-    float shift;
-    float ratio;
-    float learning_rate_scale;
-    float clip;
-    int noloss;
-    int softmax;
-    int classes;
-    int coords;
-    int background;
-    int rescore;
-    int objectness;
-    int joint;
-    int noadjust;
-    int reorg;
-    int log;
-    int tanh;
-    int *mask;
-    int total;
-
-    float alpha;
-    float beta;
-    float kappa;
-
-    float coord_scale;
-    float object_scale;
-    float noobject_scale;
-    float mask_scale;
-    float class_scale;
-    int bias_match;
-    int random;
-    float ignore_thresh;
-    float truth_thresh;
-    float thresh;
-    float focus;
-    int classfix;
-    int absolute;
-
-    int onlyforward;
-    int stopbackward;
-    int dontload;
-    int dontsave;
-    int dontloadscales;
-    int numload;
-
-    float temperature;
-    float probability;
-    float scale;
-
-    char  * cweights;
-    int   * indexes;
-    int   * input_layers;
-    int   * input_sizes;
-    int   * map;
-    int   * counts;
-    float ** sums;
-    float * rand;
-    float * cost;
-    float * state;
-    float * prev_state;
-    float * forgot_state;
-    float * forgot_delta;
-    float * state_delta;
-    float * combine_cpu;
-    float * combine_delta_cpu;
-
-    float * concat;
-    float * concat_delta;
-
-    float * biases;
-
-    float * weights;
-
-    float * delta;
-    float * output;
-    float * loss;
-    float * squared;
-    float * norms;
-
-    float * x;
-
-    float * m;
-    float * v;
-    
-    float * bias_m;
-    float * bias_v;
-    float * scale_m;
-    float * scale_v;
-
-
-    float *z_cpu;
-    float *r_cpu;
-    float *h_cpu;
-    float * prev_state_cpu;
-
-    float *temp_cpu;
-    float *temp2_cpu;
-    float *temp3_cpu;
-
-    float *dh_cpu;
-    float *hh_cpu;
-    float *prev_cell_cpu;
-    float *cell_cpu;
-    float *f_cpu;
-    float *i_cpu;
-    float *g_cpu;
-    float *o_cpu;
-    float *c_cpu;
-    float *dc_cpu; 
-
-    float * binary_input;
-
-    struct layer *input_layer;
-    struct layer *self_layer;
-    struct layer *output_layer;
-
-    struct layer *reset_layer;
-    struct layer *update_layer;
-    struct layer *state_layer;
-
-    struct layer *input_gate_layer;
-    struct layer *state_gate_layer;
-    struct layer *input_save_layer;
-    struct layer *state_save_layer;
-    struct layer *input_state_layer;
-    struct layer *state_state_layer;
-
-    struct layer *input_z_layer;
-    struct layer *state_z_layer;
-
-    struct layer *input_r_layer;
-    struct layer *state_r_layer;
-
-    struct layer *input_h_layer;
-    struct layer *state_h_layer;
-	
-    struct layer *wz;
-    struct layer *uz;
-    struct layer *wr;
-    struct layer *ur;
-    struct layer *wh;
-    struct layer *uh;
-    struct layer *uo;
-    struct layer *wo;
-    struct layer *uf;
-    struct layer *wf;
-    struct layer *ui;
-    struct layer *wi;
-    struct layer *ug;
-    struct layer *wg;
-
-    size_t workspace_size;
-};
-
-typedef struct network{
-    int n;
-    int batch;
-    size_t *seen;
-    int *t;
-    float epoch;
-    int subdivisions;
-    layer *layers;
-    float *output;
-
-    float learning_rate;
-    float momentum;
-    float decay;
-    float gamma;
-    float scale;
-    float power;
-    int time_steps;
-    int step;
-    int max_batches;
-    float *scales;
-    int   *steps;
-    int num_steps;
-    int burn_in;
-
-    int adam;
-    float B1;
-    float B2;
-    float eps;
-
-    int inputs;
-    int outputs;
-    int truths;
-    int notruth;
-    int h, w, c;
-    int max_crop;
-    int min_crop;
-    float max_ratio;
-    float min_ratio;
-    int center;
-    float angle;
-    float aspect;
-    float exposure;
-    float saturation;
-    float hue;
-    int random;
-
-    int gpu_index;
-    tree *hierarchy;
-
-    float *input;
-    float *truth;
-    float *delta;
-    float *workspace;
-    int train;
-    int index;
-    float *cost;
-    float clip;
-
-} network;
diff --git a/apps/bitserial_conv/kernel/im2col.c b/apps/bitserial_conv/kernel/im2col.c
deleted file mode 100644
index 63101cbea..000000000
--- a/apps/bitserial_conv/kernel/im2col.c
+++ /dev/null
@@ -1,36 +0,0 @@
-int im2col_get_pixel(int *im, int height, int width, int channels,
-                        int row, int col, int channel, int pad)
-{
-    row -= pad;
-    col -= pad;
-
-    if (row < 0 || col < 0 ||
-        row >= height || col >= width) return 0;
-    return im[col + width*(row + height*channel)];
-}
-
-//From Berkeley Vision's Caffe!
-//https://github.com/BVLC/caffe/blob/master/LICENSE
-void im2col_cpu(int* data_im,
-     int channels,  int height,  int width,
-     int ksize,  int stride, int pad, int* data_col) 
-{
-    int c,h,w;
-    int height_col = (height + 2*pad - ksize) / stride + 1;
-    int width_col = (width + 2*pad - ksize) / stride + 1;
-
-    int channels_col = channels * ksize * ksize;
-    for (c = 0; c < channels_col; ++c) {
-        int w_offset = c % ksize;
-        int h_offset = (c / ksize) % ksize;
-        int c_im = c / ksize / ksize;
-        for (h = 0; h < height_col; ++h) {
-            for (w = 0; w < width_col; ++w) {
-                int im_row = h_offset + h * stride;
-                int im_col = w_offset + w * stride;
-                int col_index = (c * height_col + h) * width_col + w;
-                data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
-                        im_row, im_col, c_im, pad);
-            }
-        }
-    }
diff --git a/apps/bitserial_conv/main.c b/apps/bitserial_conv/main.c
deleted file mode 100644
index eb5d3fc91..000000000
--- a/apps/bitserial_conv/main.c
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2020 ETH Zurich and University of Bologna.
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Author: Matheus Cavalcante, ETH Zurich
-
-#include <stdint.h>
-#include <string.h>
-
-#include "printf.h"
-
-int main() {
-  printf("Ariane says Hello!\n");
-
-  return 0;
-}
diff --git a/apps/bitserial_matmul/a.out b/apps/bitserial_matmul/a.out
deleted file mode 100755
index 471d8245b..000000000
Binary files a/apps/bitserial_matmul/a.out and /dev/null differ
diff --git a/apps/bitserial_matmul/kernel/bitserial_matmul.c b/apps/bitserial_matmul/kernel/bitserial_matmul.c
deleted file mode 100644
index 41da0a4d7..000000000
--- a/apps/bitserial_matmul/kernel/bitserial_matmul.c
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal.
-// Solderpad Hardware License, Version 0.51, see LICENSE for details.
-// SPDX-License-Identifier: SHL-0.51
-//
-// Author: MohammadHossein AskariHemmat <m.h.askari.hemmat@gmail.com>
-
-#include "bitserial_matmul.h"
-
-void bitserial_matmul_init() {
-  asm volatile("vmv.v.i v0,  0");
-  asm volatile("vmv.v.i v4,  0");
-  asm volatile("vmv.v.i v8,  0");
-  asm volatile("vmv.v.i v12, 0");
-  asm volatile("vmv.v.i v16, 0");
-}
-
-// given the input arrays with bit precision prec in bitpack format, this function
-// calculates the Matmul of the two Matrix and returns the resulting Matrix.
-// NOTE: a and b must have the same length. 
-// NOTE: a and b must be in bit packed format, the retuning matrix is in normal format.
-// c[64x64] = a[64]*b[64]
-void bitserial_matmul_64(uint64_t* c, uint64_t* a, uint64_t* b, int aprec, int bprec) {
-    uint64_t vl=0;
-    // Original pointers
-    const uint64_t *a_ = a;
-    const uint64_t *b_ = b;
-    uint64_t elen = 64;
-    asm volatile("vsetvli %0, %1, e64, m1, ta, ma \n" : "+r" (vl) : "r" (elen));
-    // start with a fresh temp registers
-    bitserial_matmul_init();
-    for (int row=0; row<64; row++){
-        // The following loop will compute one row of a 64 element output
-        for (int i=0; i<aprec; i++){
-            for (int j=0; j<bprec; j++){
-                // load a
-                asm volatile("vle64.v v0, (%[A])" : : [A] "r" (a));
-                // load b
-                asm volatile("vle64.v v1, (%[A])" : : [A] "r" (b));
-                // broadcast one row to the vector
-                asm volatile("vrgather.vx v4, v1, %0" : : "r" (row));
-                // v4 = v0 & v4
-                asm volatile("vand.vv v4, v0, v1" ::);
-                // v8 = vpopcnt(v4) 
-                __asm__ volatile(".byte 0x57, 0x04, 0x22, 0x06\n" ::);
-                // partial sum in v12
-                asm volatile("vadd.vv v12, v12, v8" ::);
-                a += vl;
-                b += vl;
-            }
-        }
-        // Done with the first row of b. Roll back a to the begining
-        a = a_;
-        b = b_;
-        // And save result (first row) to output
-        asm volatile("vse64.v v12, (%0);" ::"r"(c));
-        c += vl;
-        // Reset result vector register
-        asm volatile("vmv.v.i v12, 0");
-    }
-}
\ No newline at end of file
diff --git a/apps/bitserial_matmul/kernel/bitserial_matmul.h b/apps/bitserial_matmul/kernel/bitserial_matmul.h
deleted file mode 100644
index 31600ec1c..000000000
--- a/apps/bitserial_matmul/kernel/bitserial_matmul.h
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal.
-// Solderpad Hardware License, Version 0.51, see LICENSE for details.
-// SPDX-License-Identifier: SHL-0.51
-//
-// Author: MohammadHossein AskariHemmat <m.h.askari.hemmat@gmail.com>
-
-#ifndef BITSERIAL_MATMUL_H
-#define BITSERIAL_MATMUL_H
-
-#include <stdint.h>
-
-void bitserial_matmul_init();
-void bitserial_matmul_64(uint64_t* c, uint64_t* a, uint64_t* b, int aprec, int bprec);
-
-#endif
diff --git a/apps/bitserial_matmul/main.c b/apps/bitserial_matmul/main.c
deleted file mode 100644
index ec75485b0..000000000
--- a/apps/bitserial_matmul/main.c
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal.
-// Solderpad Hardware License, Version 0.51, see LICENSE for details.
-// SPDX-License-Identifier: SHL-0.51
-//
-// Author: MohammadHossein AskariHemmat <m.h.askari.hemmat@gmail.com>
-
-#include <stdint.h>
-#include <string.h>
-
-#include "printf.h"
-#include "runtime.h"
-#include "kernel/bitserial_matmul.h"
-
-// Define Matrix dimensions:
-// C = AB with A=[MxN], B=[NxP], C=[MxP]
-#define M 64
-#define N 64
-#define P 64
-
-uint64_t a[M * N] __attribute__((aligned(32 * NR_LANES), section(".l2")));
-uint64_t b[N * P] __attribute__((aligned(32 * NR_LANES), section(".l2")));
-uint64_t c[M * P] __attribute__((aligned(32 * NR_LANES), section(".l2")));
-
-// Initialize the matrices
-void init_matrix(uint64_t *matrix, uint64_t num_rows, uint64_t num_columns) {
-  for (uint64_t i = 0; i < num_rows; ++i) {
-    for (uint64_t j = 0; j < num_columns; ++j) {
-      matrix[i * num_columns + j] = i;
-    }
-  }
-}
-
-
-int main(void) {
-    const int s = 64;
-    const int prec = 2;
-    printf("\n");
-    printf("------------------------------------------------------------\n");
-    printf("Calculating a bit-serialized (%d x %d) x (%d x %d) matrix multiplication with 2 bit precision...\n", s,
-           s, s, s);
-    printf("------------------------------------------------------------\n");
-    printf("\n");
-    // Initialize Matrices
-    printf("Initializing matrices...\n");
-    // init_matrix(a, s, s);
-    // init_matrix(b, s, s);
-    // Matrices are initialized --> Start calculating
-    printf("Calculating bitserial_matmul_64...\n");
-    start_timer();
-    bitserial_matmul_64(c, a, b, 1, prec);
-    stop_timer();
-    // Metrics
-    int64_t runtime = get_timer();
-    float performance = 2.0 * s * s / runtime;
-    float utilization = 100 * performance / (2.0 * NR_LANES);
-
-    printf("The execution took %d cycles.\n", runtime);
-    printf("The performance is %f OP/cycle (%f%% utilization).\n", performance,
-           utilization);
-
-}
diff --git a/apps/bitserial_matmul/vpopcnt b/apps/bitserial_matmul/vpopcnt
deleted file mode 100755
index 471d8245b..000000000
Binary files a/apps/bitserial_matmul/vpopcnt and /dev/null differ
diff --git a/apps/bitserial_pack/data.S b/apps/bitserial_pack/data.S
deleted file mode 100644
index e69de29bb..000000000
diff --git a/apps/bitserial_pack/kernel/bitserial_pack.c b/apps/bitserial_pack/kernel/bitserial_pack.c
deleted file mode 100644
index fe8b72e43..000000000
--- a/apps/bitserial_pack/kernel/bitserial_pack.c
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal.
-// Solderpad Hardware License, Version 0.51, see LICENSE for details.
-// SPDX-License-Identifier: SHL-0.51
-//
-// Author: MohammadHossein AskariHemmat <m.h.askari.hemmat@gmail.com>
-
-#include "bitserial_pack.h"
-
-void bitserial_pack_init() {
-  asm volatile("vmv.v.i v0,  0");
-  asm volatile("vmv.v.i v1,  0");
-  asm volatile("vmv.v.i v2,  0");
-}
-// Assuming an array of 2bit precision data is packed into 8bit chars, this function,
-// de-reference these chars into 64b words. Each call to VBPACK will consume 4 words
-// (4 lanes) of 64 bits.
-void bitserial_pack_64(uint64_t* mat_i, uint64_t* out, uint64_t len_mat_i) {
-    uint64_t vl=0;
-    uint64_t complete_words = len_mat_i/4;
-    uint64_t residuals = len_mat_i%4;
-    uint64_t elm_cnt = 0;
-    // Make sure SEW is set to 64
-    asm volatile("vsetvli zero, %0, e64, m1, ta, ma" ::"r"(4));
-    // start with a fresh temp registers
-    bitserial_pack_init();
-    for (uint64_t n=0; n<4; n++){
-        for (uint64_t i=0; i<4; i++){
-            asm volatile("vle64.v v1, (%[A])" : : [A] "r" (mat_i));
-            asm volatile(".byte 0x57, 0x81, 0x20, 0x0E\n" ::);
-        }
-        asm volatile("vse64.v v2, (%0)" : "+&r"(out));
-        out+= 4;
-    }
-}
\ No newline at end of file
diff --git a/apps/bitserial_pack/kernel/bitserial_pack.h b/apps/bitserial_pack/kernel/bitserial_pack.h
deleted file mode 100644
index 52009c7c1..000000000
--- a/apps/bitserial_pack/kernel/bitserial_pack.h
+++ /dev/null
@@ -1,18 +0,0 @@
-// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal.
-// Solderpad Hardware License, Version 0.51, see LICENSE for details.
-// SPDX-License-Identifier: SHL-0.51
-//
-// Author: MohammadHossein AskariHemmat <m.h.askari.hemmat@gmail.com>
-
-#ifndef BITSERIAL_PACK_H
-#define BITSERIAL_PACK_H
-
-#include <stdint.h>
-
-void bitserial_pack_init();
-// 2 bit precision packing
-void bitserial_pack_64(uint64_t* mat_i, uint64_t* out, uint64_t len_mat_i);
-
-#endif
-
-
diff --git a/apps/bitserial_pack/main.c b/apps/bitserial_pack/main.c
deleted file mode 100644
index 8cd1436a4..000000000
--- a/apps/bitserial_pack/main.c
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal.
-// Solderpad Hardware License, Version 0.51, see LICENSE for details.
-// SPDX-License-Identifier: SHL-0.51
-//
-// Author: MohammadHossein AskariHemmat <m.h.askari.hemmat@gmail.com>
-
-#include <stdint.h>
-#include <string.h>
-
-#include "printf.h"
-#include "runtime.h"
-#include <inttypes.h>
-#include "kernel/bitserial_pack.h"
-
-// Define Matrix dimensions:
-// C = AB with A=[MxN], B=[NxP], C=[MxP]
-#define M 64
-#define N 64
-#define P 64
-
-uint64_t a[M * N] __attribute__((aligned(32 * NR_LANES), section(".l2")));
-// uint64_t b[N * P] __attribute__((aligned(32 * NR_LANES), section(".l2")));
-uint64_t c[M * P] __attribute__((aligned(32 * NR_LANES), section(".l2")));
-
-// Initialize the matrices
-void init_matrix(uint64_t *matrix, uint64_t num_rows, uint64_t num_columns) {
-  for (uint64_t i = 0; i < num_rows; ++i) {
-    for (uint64_t j = 0; j < num_columns; ++j) {
-      matrix[i * num_columns + j] = 0x0202020202020202;
-    }
-  }
-}
-
-
-int main(void) {
-    const int s = 4;
-    // Initialize Matrices
-    printf("Initializing matrices...\n");
-    init_matrix(a, s, s);
-    // init_matrix(b, s, s);
-    // Matrices are initialized --> Start calculating
-    for (int i=0; i<s*s; i++){
-        printf("[%4d]:  0x%" PRIx64 "\n", i, a[i]);
-    }
-    printf("Calculating bitserial_pack_64...\n");
-    start_timer();
-    bitserial_pack_64(a, c, s*s);
-    stop_timer();
-    int64_t runtime = get_timer();
-    printf("Results...\n");
-    for (int i=0; i<s*s; i++){
-        printf("[%4d]:  0x%" PRIx64 "\n", i, c[i]);
-    }
-
-    printf("The execution took %d cycles.\n", runtime);
-    // printf("The performance is %f OP/cycle (%f%% utilization).\n", performance,
-    //        utilization);
-
-}
\ No newline at end of file