diff --git a/apps/bitpack/main.c b/apps/bitpack/main.c deleted file mode 100644 index bbefde54b..000000000 --- a/apps/bitpack/main.c +++ /dev/null @@ -1,115 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "util.h" -#include "printf.h" -#include "runtime.h" - -#define RISC_V_ARA - -void init_matrix(uint64_t* matrix, int num_rows, int num_columns, uint64_t MAX_VAL) { - for (int i = 0; i < num_rows; ++i) { - for (int j = 0; j < num_columns; ++j) { - matrix[i * num_columns + j] = (rand() % (MAX_VAL - 0 + 1)) + 0; - } - } -} - -void transpose_matrix(uint64_t* matrix, int num_rows, int num_columns){ - uint64_t temp; - for (int i = 0; i < num_rows; ++i) { - for (int j = i; j < num_columns; ++j) { - temp = matrix[i * num_columns + j]; - matrix[i * num_columns + j] = matrix[j * num_columns + i]; - matrix[j * num_columns + i] = temp; - } - } -} - -// Naive implementation of bit packing -void bitpack_naive(uint64_t* matrix, uint64_t* packed_data, int DATA_WIDTH, int dlen, int bitprec){ - // Packed data pointer - uint64_t p_ptr = 0; - for (int i=0; i> bit_pos) & 0x1; - packed_data[bit_idx] <<= 1; - packed_data[bit_idx] = (packed_data[bit_idx] | data); - } - } - p_ptr += bitprec; - } -} - -void bitpack_init() { - asm volatile("vmv.v.i v0, 0"); - asm volatile("vmv.v.i v1, 0"); - asm volatile("vmv.v.i v2, 0"); - asm volatile("vmv.v.i v3, 0"); - asm volatile("vmv.v.i v4, 0"); -} - -// Vectorized implementation of bit packing -void bitpack_vectorized(uint64_t* matrix, uint64_t* packed_data, int DATA_WIDTH, int dlen, int bitprec){ - // Initialize vector RF - bitpack_init(); - size_t vl=0; - uint64_t *p_ptr = matrix; - // Load shift values into v0 - asm volatile("vle64.v v0, (%[A])" : : [A] "r" (vshift)); - // Load mask values into v1 - asm volatile("vle64.v v1, (%[A])" : : [A] "r" (vmask)); - for (size_t c_n_count=dlen; c_n_count; c_n_count -= vl){ - vl = vsetvl_e64m1(c_n_count); - asm volatile("vle64.v v2, (%[A])" : : [A] "r" (matrix)); - asm volatile("vand.vv v3, v2, v1" ::); - for (int bit_pos=0; bit_pos> bit_pos) & 0x1; - packed_data[bit_idx] <<= 1; - packed_data[bit_idx] = (packed_data[bit_idx] | data); - } - p_ptr += bitprec; - } -} - -int main(){ - printf("bitpack init!\n"); - const uint64_t BITPREC=3; - const uint64_t MAT_SIZE_W = 8; - const uint64_t MAT_SIZE_H = 8; - const uint64_t DATA_WIDTH = 8; - assert((MAT_SIZE_W%DATA_WIDTH)==0 && "Matrix size must be multiple of data type"); - const uint64_t PACKED_MAT_SIZE = (MAT_SIZE_H*MAT_SIZE_W)/(DATA_WIDTH) * BITPREC; - uint64_t tensor[MAT_SIZE_H*MAT_SIZE_W]; - uint64_t packed_data[PACKED_MAT_SIZE]; - uint64_t max_val = (1< -#include -#include -#include - -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -#define for_endian(size) for (int i = 0; i < size; ++i) -#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define for_endian(size) for (int i = size - 1; i >= 0; --i) -#else -#error "Endianness not detected" -#endif - -#define printb(value) \ -({ \ - typeof(value) _v = value; \ - __printb((typeof(_v) *) &_v, sizeof(_v)); \ -}) - -#define MSB_MASK 1 << (CHAR_BIT - 1) - -#define PRINT_INT 0 -#define PRINT_BIN 1 - - -void __printb(void *value, size_t size); -void print_matrix(uint64_t *mat, int num_rows, int num_columns, int bin_print_format); -const uint64_t vshift[64] = { 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, - 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, - 56, 57, 58, 59, 60, 61, 62, 63}; -const uint64_t vmask[64] = { 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1}; diff --git a/apps/bitserial_conv/kernel/.vscode/c_cpp_properties.json b/apps/bitserial_conv/kernel/.vscode/c_cpp_properties.json deleted file mode 100644 index df8182dfe..000000000 --- a/apps/bitserial_conv/kernel/.vscode/c_cpp_properties.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "configurations": [ - { - "name": "macos-gcc-arm64", - "includePath": [ - "${workspaceFolder}/**" - ], - "compilerPath": "/usr/bin/clang", - "cStandard": "${default}", - "cppStandard": "${default}", - "intelliSenseMode": "macos-gcc-arm64", - "compilerArgs": [] - } - ], - "version": 4 -} \ No newline at end of file diff --git a/apps/bitserial_conv/kernel/.vscode/launch.json b/apps/bitserial_conv/kernel/.vscode/launch.json deleted file mode 100644 index b65adf0e5..000000000 --- a/apps/bitserial_conv/kernel/.vscode/launch.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "version": "0.2.0", - "configurations": [ - { - "name": "C/C++ Runner: Debug Session", - "type": "lldb", - "request": "launch", - "args": [ - "" - ], - "stopAtEntry": true, - "cwd": "/Users/hossein/MyRepos/ara/apps/bitserial_conv/kernel", - "environment": [], - "program": "/Users/hossein/MyRepos/ara/apps/bitserial_conv/kernel/build/Debug/outDebug", - "internalConsoleOptions": "openOnSessionStart", - "MIMode": "gdb", - "externalConsole": false - } - ] -} \ No newline at end of file diff --git a/apps/bitserial_conv/kernel/.vscode/settings.json b/apps/bitserial_conv/kernel/.vscode/settings.json deleted file mode 100644 index ce53bf0a9..000000000 --- a/apps/bitserial_conv/kernel/.vscode/settings.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "C_Cpp_Runner.cCompilerPath": "/usr/bin/clang", - "C_Cpp_Runner.cppCompilerPath": "/usr/bin/clang++", - "C_Cpp_Runner.debuggerPath": "/usr/bin/lldb", - "C_Cpp_Runner.cStandard": "", - "C_Cpp_Runner.cppStandard": "", - "C_Cpp_Runner.msvcBatchPath": "", - "C_Cpp_Runner.warnings": [ - "-Wall", - "-Wextra", - "-Wpedantic" - ], - "C_Cpp_Runner.enableWarnings": true, - "C_Cpp_Runner.warningsAsError": false, - "C_Cpp_Runner.compilerArgs": [], - "C_Cpp_Runner.linkerArgs": [], - "C_Cpp_Runner.includePaths": [], - "C_Cpp_Runner.includeSearch": [ - "*", - "**/*" - ], - "C_Cpp_Runner.excludeSearch": [ - "**/build", - "**/build/**", - "**/.*", - "**/.*/**", - "**/.vscode", - "**/.vscode/**" - ] -} \ No newline at end of file diff --git a/apps/bitserial_conv/kernel/conv2d_bitserial.c b/apps/bitserial_conv/kernel/conv2d_bitserial.c deleted file mode 100644 index 235b63f9b..000000000 --- a/apps/bitserial_conv/kernel/conv2d_bitserial.c +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 -// -// Author: MohammadHossein AskariHemmat - -#include "conv2d_bitserial.h" - -int im2col_get_pixel(int *im, int height, int width, int channels, - int row, int col, int channel, int pad) -{ - row -= pad; - col -= pad; - - if (row < 0 || col < 0 || - row >= height || col >= width) return 0; - return im[col + width*(row + height*channel)]; -} - -//From Berkeley Vision's Caffe! -//https://github.com/BVLC/caffe/blob/master/LICENSE -void im2col_cpu(int* data_im, - int channels, int height, int width, - int ksize, int stride, int pad, int* data_col) -{ - int c,h,w; - int height_col = (height + 2*pad - ksize) / stride + 1; - int width_col = (width + 2*pad - ksize) / stride + 1; - - int channels_col = channels * ksize * ksize; - for (c = 0; c < channels_col; ++c) { - int w_offset = c % ksize; - int h_offset = (c / ksize) % ksize; - int c_im = c / ksize / ksize; - for (h = 0; h < height_col; ++h) { - for (w = 0; w < width_col; ++w) { - int im_row = h_offset + h * stride; - int im_col = w_offset + w * stride; - int col_index = (c * height_col + h) * width_col + w; - data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, - im_row, im_col, c_im, pad); - } - } - } -} - -void fill_cpu(int N, float ALPHA, float *X, int INCX) -{ - int i; - for(i = 0; i < N; ++i) X[i*INCX] = ALPHA; -} - -void gemm(int M, int N, int K, float ALPHA, - float *A, int lda, - float *B, int ldb, - float *C, int ldc) -{ - int i,j,k; - for(i = 0; i < M; ++i){ - for(k = 0; k < K; ++k){ - register float A_PART = ALPHA*A[i*lda+k]; - for(j = 0; j < N; ++j){ - C[i*ldc+j] += A_PART*B[k*ldb+j]; - } - } - } -} - - -void conv2d(convolutional_layer l, network net) -{ - int i, j; - - fill_cpu(l.outputs*l.batch, 0, l.output, 1); - - int m = l.n/l.groups; - int k = l.size*l.size*l.c/l.groups; - int n = l.out_w*l.out_h; - for(i = 0; i < l.batch; ++i){ - for(j = 0; j < l.groups; ++j){ - float *a = l.weights + j*l.nweights/l.groups; - float *b = net.workspace; - float *c = l.output + (i*l.groups + j)*n*m; - float *im = net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w; - - if (l.size == 1) { - b = im; - } else { - im2col_cpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b); - } - gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); - } - } -} diff --git a/apps/bitserial_conv/kernel/conv2d_bitserial.h b/apps/bitserial_conv/kernel/conv2d_bitserial.h deleted file mode 100644 index 42d934a17..000000000 --- a/apps/bitserial_conv/kernel/conv2d_bitserial.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 -// -// Author: MohammadHossein AskariHemmat - -#ifndef BITSERIAL_MATMUL_H -#define BITSERIAL_MATMUL_H - -#include - -void bitserial_matmul_init(); -void bitserial_matmul_64(uint64_t* c, uint64_t* a, uint64_t* b, int aprec, int bprec); -int im2col_get_pixel(int *im, int height, int width, int channels, int row, int col, int channel, int pad); -void im2col_cpu(int* data_im, int channels, int height, int width, int ksize, int stride, int pad, int* data_col); -void fill_cpu(int N, float ALPHA, float *X, int INCX); -void gemm(int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc); -void conv2d(convolutional_layer l, network net); - -#endif - - -#ifndef CONVOLUTIONAL_LAYER_H -#define CONVOLUTIONAL_LAYER_H - -#include "layer.h" - -typedef layer convolutional_layer; -convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, int batch_normalize, int binary, int xnor, int adam); - -#endif diff --git a/apps/bitserial_conv/kernel/conv2d_bitserial.py b/apps/bitserial_conv/kernel/conv2d_bitserial.py deleted file mode 100644 index a6641a6a0..000000000 --- a/apps/bitserial_conv/kernel/conv2d_bitserial.py +++ /dev/null @@ -1,310 +0,0 @@ -import math -import numpy as np -import torch -import torch.nn as nn -from texttable import Texttable - -class SimpleConv(nn.Module): - def __init__(self, in_ch, out_ch, kernel_size, stride, padding, groups, dilation, weights): - super(SimpleConv, self).__init__() - self.conv1 = nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, stride=stride, - padding=padding, groups=groups, bias=False, dilation=dilation) - self.conv1.weight.data = torch.from_numpy(weights) - def forward(self, x): - out = self.conv1(x) - return out - -def check_result(output, golden): - if (output.shape != golden.shape): - print("Output size does not match golden output") - return False - else: - cnt = 0 - for val,gold in zip(output.flatten(), golden.flatten()): - if val!=gold: - print("output[{}]:{} does not macth golden[{}]:{}".format(cnt, val, cnt, gold)) - return False - cnt += 1 - return True - -def bitpack(matrix, DATA_WIDTH, dlen, prec): - if DATA_WIDTH==64: - packed_data = np.zeros(math.ceil(len(matrix)/DATA_WIDTH)*prec).astype(np.int64) - elif DATA_WIDTH==32: - packed_data = np.zeros(math.ceil(len(matrix)/DATA_WIDTH)*prec).astype(np.int32) - elif DATA_WIDTH==16: - packed_data = np.zeros(math.ceil(len(matrix)/DATA_WIDTH)*prec).astype(np.int16) - elif DATA_WIDTH==8: - packed_data = np.zeros(math.ceil(len(matrix)/DATA_WIDTH)*prec).astype(np.int8) - elif DATA_WIDTH==4: - packed_data = np.zeros(math.ceil(len(matrix)/DATA_WIDTH)*prec).astype(np.int8) - else: - print("Unsupported element length: {} bits".format(DATA_WIDTH)) - return None - p_ptr = 0 - # make sure input is integer - matrix = [int(val) for val in matrix] - for i in range(0,dlen, DATA_WIDTH): - for el in range(0, DATA_WIDTH): - for bit_pos in range(0, prec): - bit_idx = p_ptr+bit_pos - if i+el >=dlen: - break - data = (matrix[i+el] >> bit_pos) & 0x1 - packed_data[bit_idx] <<= 1 - packed_data[bit_idx] = (packed_data[bit_idx] | data) - p_ptr += prec - - if DATA_WIDTH==64: - return packed_data.astype(np.uint64) - elif DATA_WIDTH==32: - return packed_data.astype(np.uint32) - elif DATA_WIDTH==16: - return packed_data.astype(np.uint16) - elif DATA_WIDTH==8: - return packed_data.astype(np.uint8) - elif DATA_WIDTH==4: - return packed_data.astype(np.uint8) - -def im2col_get_pixel(im, height, width, row, col, channel, pad): - global count - row -= pad - col -= pad - if (row < 0 or col < 0 or row >= height or col >= width): - return 0 - return im[col + width*(row + height*channel)] - -def im2col(data_im, channels, height, width, ksize, stride, pad): - c,h,w = 0,0,0 - height_col = int((height + 2*pad - ksize) / stride + 1) - width_col = int((width + 2*pad - ksize) / stride + 1) - channels_col = int(channels * ksize * ksize) - output = np.zeros((1,channels_col*height_col*width_col)) - for c in range(channels_col): - w_offset = int(c % ksize) - h_offset = int((c / ksize) % ksize) - c_im = int(c / ksize / ksize) - for h in range(height_col): - for w in range(width_col): - im_row = h_offset + h * stride - im_col = w_offset + w * stride - col_index = (c * height_col + h) * width_col + w - output[0,col_index] = im2col_get_pixel(data_im, height, width, im_row, im_col, c_im, pad) - return output - -def popcnt(a): - return bin(a).count("1") - -def bitserial_gemm(inputs, weights, packed_row_len, wprec, aprec, oshape, wshape, DATA_WIDTH): - ''' - Example: - - activation:[16x9] - - weight: [1x9] - - DATA_WIDTH: 4 - - aprec: 2 - - wprec: 2 - Weight: - [0. 0. 2. 1. 2. 3. 3. 2. 0.] - Activation: - [0. 0. 0. 0. 0. 3. 0. 3. 3.] - [0. 0. 0. 0. 3. 1. 3. 3. 3.] - [0. 0. 0. 3. 1. 0. 3. 3. 3.] - [0. 0. 0. 1. 0. 0. 3. 3. 0.] - [0. 0. 3. 0. 3. 3. 0. 1. 3.] - [0. 3. 1. 3. 3. 3. 1. 3. 1.] - [3. 1. 0. 3. 3. 3. 3. 1. 2.] - [1. 0. 0. 3. 3. 0. 1. 2. 0.] - [0. 3. 3. 0. 1. 3. 0. 0. 3.] - [3. 3. 3. 1. 3. 1. 0. 3. 2.] - [3. 3. 3. 3. 1. 2. 3. 2. 0.] - [3. 3. 0. 1. 2. 0. 2. 0. 0.] - [0. 1. 3. 0. 0. 3. 0. 0. 0.] - [1. 3. 1. 0. 3. 2. 0. 0. 0.] - [3. 1. 2. 3. 2. 0. 0. 0. 0.] - [1. 2. 0. 2. 0. 0. 0. 0. 0.] - - First output element: - - Using GEMM: - [0. 0. 2. 1. 2. 3. 3. 2. 0.] - [0. 0. 0. 0. 3. 1. 3. 3. 3.] - = 6+3+9+6 - - Using bit serial: - +---+---------------+-----------------+------------------+ - |idx| INPUT | WEIGHT | Partial Result | - +---+---------------+-----------------+------------------+ - | 0 | [ 0]: 0000 | [ 0]: 0001 | 0 | - | 1 | [ 1]: 0000 | [ 1]: 0010 | | - +---+---------------+-----------------+------------------+ - | 2 | [ 2]: 0101 | [ 2]: 0110 | 1+4+2+8 = 15 | - | 3 | [ 3]: 0101 | [ 3]: 1111 | | - +---+---------------+-----------------+------------------+ - | 4 | [ 4]: 1000 | [ 4]: 0000 | 0 | - | 5 | [ 5]: 1000 | [ 5]: 0000 | | - +---+---------------+-----------------+------------------+ - | 6 | [ 6]: 0111 | [ 0]: 0001 | 5 | - | 7 | [ 7]: 0101 | [ 1]: 0010 | | - +---+---------------+-----------------+------------------+ - | 8 | [ 8]: 1100 | [ 2]: 0110 | 1+4+2+8=15 | - | 9 | [ 9]: 1100 | [ 3]: 1111 | | - +---+---------------+-----------------+------------------+ - |10 | [ 10]: 0110 | [ 4]: 0000 | 0 | - |11 | [ 11]: 0100 | [ 5]: 0000 | | - +---+---------------+-----------------+------------------+ - ''' - - out_ch,ow,oh = oshape - _, in_ch, ksize, ksize = wshape - output = np.zeros(out_ch*ow*oh) - # import ipdb as pdb; pdb.set_trace() - optr = 0 - weights = weights.reshape(out_ch, wprec*math.ceil(in_ch*ksize*ksize/DATA_WIDTH)) - num_mul = 0 - num_add = 0 - num_popcnt = 0 - num_shifts = 0 - num_mem_ops = 0 - for oc in range(out_ch): - weight_oc = weights[oc] - for i in range(0, len(inputs), aprec*packed_row_len): - # for j in range(0, len(weights), wprec*packed_row_len): - o_acc = 0 - for k in range(0, packed_row_len): - for ap in range(0, aprec): - for wp in range(0, wprec): - # import ipdb as pdb; pdb.set_trace() - aidx = ap+k*aprec + i - widx = wp+k*wprec - # print("i:{}, oc:{}, k:{}, ap:{}, wp:{}, a[{}] w[{}]".format(i,oc,k,ap,wp,aidx,widx)) - # print("{:04b}".format(inputs[i*k+ap])) - # print("{:04b}".format(weights[j*k+wp])) - o_acc += popcnt((inputs[aidx]&weight_oc[widx])) << (ap+wp) - num_add += 2 - num_popcnt += 1 - num_shifts += 1 - num_mem_ops += 2 - # print("[{:4d}]: {:4d}".format(optr, o_acc)) - # import ipdb as pdb; pdb.set_trace() - # print("{}\n".format(o_acc)) - output[optr] = o_acc - optr += 1 - # import ipdb as pdb; pdb.set_trace() - # pass - return output, [num_mul,num_add,num_popcnt,num_shifts,num_mem_ops] - -def print_packed_data(packed_data): - for id, val in enumerate(packed_data): - if DATA_WIDTH==64: - pos_val = val if val>0 else (np.uint64(val)&np.uint64(0xffffffffffffffff)) - print('[{0:4d}]: {1:064b}'.format(id, pos_val)) - elif DATA_WIDTH==32: - pos_val = val if val>0 else (np.uint32(val)&np.uint32(0xffffffff)) - print('[{0:4d}]: {1:032b}'.format(id, pos_val)) - elif DATA_WIDTH==16: - pos_val = val if val>0 else (np.uint16(val)&np.uint16(0xffff)) - print('[{0:4d}]: {1:016b}'.format(id, pos_val)) - elif DATA_WIDTH==8: - pos_val = val if val>0 else (np.uint8(val)&np.uint8(0xff)) - print('[{0:4d}]: {1:08b}'.format(id, pos_val)) - elif DATA_WIDTH==4: - pos_val = val if val>0 else (np.uint8(val)&np.uint8(0xff)) - print('[{0:4d}]: {1:04b}'.format(id, pos_val)) - -if __name__ == '__main__': - np.random.seed(seed=0) - batch = 1 - iw = 28 - ih = 28 - in_ch = 32 - out_ch = 64 - ksize = 3 - stride = 1 - padding = 1 - groups = 1 - dilation = 1 - prec = 4 - DATA_WIDTH = 8 - VLEN = 1 - aprec = prec - wprec = prec - - # Initialize input and weight - max_int = (2**prec) - 1 - inputs = np.random.randint(max_int+1, size=(batch, in_ch, iw, ih)).astype(np.float32) - weights = np.random.randint(max_int+1, size=(out_ch, in_ch, ksize, ksize)).astype(np.float32) - torch_conv = SimpleConv(in_ch, out_ch, ksize, stride, padding, groups, dilation, weights) - - golden_output = torch_conv(torch.from_numpy(inputs)) - #================================================================ - # Computing Convolution with im2col method with normal GEMM: - #================================================================ - inputs_flatten = inputs.flatten() - inputs_im2col = im2col(inputs_flatten, in_ch, ih, iw, ksize, stride, padding) - inputs_im2col = inputs_im2col.reshape(in_ch*ksize*ksize,iw*ih) - weights_mat = weights.reshape(out_ch,in_ch*ksize*ksize) - output_im2col = np.dot(weights_mat, inputs_im2col) - num_mul = np.prod(output_im2col.shape) * weights_mat.shape[1] - num_add = np.prod(output_im2col.shape) * weights_mat.shape[1] - im2col_stats = [num_mul, num_add, 0, 0, "?"] - - ow = int((iw + 2*padding - ksize) / stride + 1) - oh = int((ih + 2*padding - ksize) / stride + 1) - output = output_im2col.reshape(1, out_ch, ow, oh) - if check_result(output, golden_output): - print("im2col conv matches pytorch conv") - else: - print("im2col conv does not match pytorch conv") - - #================================================================ - # Computing Convolution with im2col method with bitserial GEMM: - #================================================================ - # import ipdb as pdb; pdb.set_trace() - # inputs_im2col = inputs_im2col.reshape(ksize*ksize,16).transpose() - inputs_im2col = inputs_im2col.reshape(in_ch*ksize*ksize,iw*ih).transpose() - if (in_ch*ksize*ksize)%DATA_WIDTH != 0: - i_org_shape = inputs_im2col.shape - inputs_im2col_fixed = np.zeros((i_org_shape[0], i_org_shape[1]+DATA_WIDTH-((in_ch*ksize*ksize)%DATA_WIDTH))) - for id, row in enumerate(inputs_im2col): - inputs_im2col_fixed[id][0:i_org_shape[1]] = row - w_org_shape = weights_mat.shape - weights_mat_fixed = np.zeros((w_org_shape[0], w_org_shape[1]+DATA_WIDTH-((in_ch*ksize*ksize)%DATA_WIDTH))) - for id, row in enumerate(weights_mat): - weights_mat_fixed[id][0:w_org_shape[1]] = row - inputs_im2col = inputs_im2col_fixed - weights_mat = weights_mat_fixed - # import ipdb as pdb; pdb.set_trace() - bit_packed_inputs = bitpack(inputs_im2col.flatten(), DATA_WIDTH, len(inputs_im2col.flatten()), aprec) - bit_packed_weights = bitpack(weights_mat.flatten(), DATA_WIDTH, len(weights_mat.flatten()), wprec) - - # import ipdb as pdb; pdb.set_trace() - packed_row_len = int(inputs_im2col.shape[1]/DATA_WIDTH) - - # print_packed_data(bit_packed_inputs) - # print_packed_data(bit_packed_weights) - # print("\n") - # # import ipdb as pdb; pdb.set_trace() - # print(inputs_im2col) - # print(weights_mat) - # print(golden_output) - # import ipdb as pdb; pdb.set_trace() - output_bitserial, bit_serial_stats = bitserial_gemm(bit_packed_inputs, bit_packed_weights, packed_row_len, wprec, aprec, [out_ch,ow,oh], [out_ch, in_ch, ksize, ksize], DATA_WIDTH) - output = output_bitserial.reshape(1, out_ch, ow, oh) - t = Texttable(max_width=160) - t.add_row(['Type', 'Num Mul', 'Num Add', 'Num Popcnt', 'Num Shifts', 'Num MemOps']) - t.add_row(["Bit-Serial", *bit_serial_stats]) - t.add_row(["Im2Col GEMM", *im2col_stats]) - - if check_result(output, golden_output): - print("bitserial conv matches pytorch conv") - else: - print("bitserial conv does not match pytorch conv") - #================================================================ - # Printing Results: - #================================================================ - print("Computation cost for:") - print("\t Input Shape: {}x{}x{}".format(in_ch, iw, ih)) - print("\t Weight Shape: {}x{}x{}x{}".format(out_ch, in_ch, ksize, ksize)) - print("\t Data Width: {}".format(DATA_WIDTH)) - print("\t Vector Length: {}".format(VLEN)) - print("\t aprec: {}".format(aprec)) - print("\t wprec: {}".format(wprec)) - print(t.draw()) - diff --git a/apps/bitserial_conv/kernel/darknet.h b/apps/bitserial_conv/kernel/darknet.h deleted file mode 100644 index af82eaf00..000000000 --- a/apps/bitserial_conv/kernel/darknet.h +++ /dev/null @@ -1,271 +0,0 @@ -#include -#include - -typedef struct{ - int *leaf; - int n; - int *parent; - int *child; - int *group; - char **name; - - int groups; - int *group_size; - int *group_offset; -} tree; -tree *read_tree(char *filename); - -struct layer; -typedef struct layer layer; - -typedef enum { - CONVOLUTIONAL, -} LAYER_TYPE; - -struct layer{ - LAYER_TYPE type; - int batch; - int forced; - int flipped; - int inputs; - int outputs; - int nweights; - int nbiases; - int extra; - int truths; - int h,w,c; - int out_h, out_w, out_c; - int n; - int max_boxes; - int groups; - int size; - int side; - int stride; - int reverse; - int flatten; - int spatial; - int pad; - int sqrt; - int flip; - int index; - int steps; - int hidden; - int truth; - float smooth; - float dot; - float angle; - float jitter; - float saturation; - float exposure; - float shift; - float ratio; - float learning_rate_scale; - float clip; - int noloss; - int softmax; - int classes; - int coords; - int background; - int rescore; - int objectness; - int joint; - int noadjust; - int reorg; - int log; - int tanh; - int *mask; - int total; - - float alpha; - float beta; - float kappa; - - float coord_scale; - float object_scale; - float noobject_scale; - float mask_scale; - float class_scale; - int bias_match; - int random; - float ignore_thresh; - float truth_thresh; - float thresh; - float focus; - int classfix; - int absolute; - - int onlyforward; - int stopbackward; - int dontload; - int dontsave; - int dontloadscales; - int numload; - - float temperature; - float probability; - float scale; - - char * cweights; - int * indexes; - int * input_layers; - int * input_sizes; - int * map; - int * counts; - float ** sums; - float * rand; - float * cost; - float * state; - float * prev_state; - float * forgot_state; - float * forgot_delta; - float * state_delta; - float * combine_cpu; - float * combine_delta_cpu; - - float * concat; - float * concat_delta; - - float * biases; - - float * weights; - - float * delta; - float * output; - float * loss; - float * squared; - float * norms; - - float * x; - - float * m; - float * v; - - float * bias_m; - float * bias_v; - float * scale_m; - float * scale_v; - - - float *z_cpu; - float *r_cpu; - float *h_cpu; - float * prev_state_cpu; - - float *temp_cpu; - float *temp2_cpu; - float *temp3_cpu; - - float *dh_cpu; - float *hh_cpu; - float *prev_cell_cpu; - float *cell_cpu; - float *f_cpu; - float *i_cpu; - float *g_cpu; - float *o_cpu; - float *c_cpu; - float *dc_cpu; - - float * binary_input; - - struct layer *input_layer; - struct layer *self_layer; - struct layer *output_layer; - - struct layer *reset_layer; - struct layer *update_layer; - struct layer *state_layer; - - struct layer *input_gate_layer; - struct layer *state_gate_layer; - struct layer *input_save_layer; - struct layer *state_save_layer; - struct layer *input_state_layer; - struct layer *state_state_layer; - - struct layer *input_z_layer; - struct layer *state_z_layer; - - struct layer *input_r_layer; - struct layer *state_r_layer; - - struct layer *input_h_layer; - struct layer *state_h_layer; - - struct layer *wz; - struct layer *uz; - struct layer *wr; - struct layer *ur; - struct layer *wh; - struct layer *uh; - struct layer *uo; - struct layer *wo; - struct layer *uf; - struct layer *wf; - struct layer *ui; - struct layer *wi; - struct layer *ug; - struct layer *wg; - - size_t workspace_size; -}; - -typedef struct network{ - int n; - int batch; - size_t *seen; - int *t; - float epoch; - int subdivisions; - layer *layers; - float *output; - - float learning_rate; - float momentum; - float decay; - float gamma; - float scale; - float power; - int time_steps; - int step; - int max_batches; - float *scales; - int *steps; - int num_steps; - int burn_in; - - int adam; - float B1; - float B2; - float eps; - - int inputs; - int outputs; - int truths; - int notruth; - int h, w, c; - int max_crop; - int min_crop; - float max_ratio; - float min_ratio; - int center; - float angle; - float aspect; - float exposure; - float saturation; - float hue; - int random; - - int gpu_index; - tree *hierarchy; - - float *input; - float *truth; - float *delta; - float *workspace; - int train; - int index; - float *cost; - float clip; - -} network; diff --git a/apps/bitserial_conv/kernel/im2col.c b/apps/bitserial_conv/kernel/im2col.c deleted file mode 100644 index 63101cbea..000000000 --- a/apps/bitserial_conv/kernel/im2col.c +++ /dev/null @@ -1,36 +0,0 @@ -int im2col_get_pixel(int *im, int height, int width, int channels, - int row, int col, int channel, int pad) -{ - row -= pad; - col -= pad; - - if (row < 0 || col < 0 || - row >= height || col >= width) return 0; - return im[col + width*(row + height*channel)]; -} - -//From Berkeley Vision's Caffe! -//https://github.com/BVLC/caffe/blob/master/LICENSE -void im2col_cpu(int* data_im, - int channels, int height, int width, - int ksize, int stride, int pad, int* data_col) -{ - int c,h,w; - int height_col = (height + 2*pad - ksize) / stride + 1; - int width_col = (width + 2*pad - ksize) / stride + 1; - - int channels_col = channels * ksize * ksize; - for (c = 0; c < channels_col; ++c) { - int w_offset = c % ksize; - int h_offset = (c / ksize) % ksize; - int c_im = c / ksize / ksize; - for (h = 0; h < height_col; ++h) { - for (w = 0; w < width_col; ++w) { - int im_row = h_offset + h * stride; - int im_col = w_offset + w * stride; - int col_index = (c * height_col + h) * width_col + w; - data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, - im_row, im_col, c_im, pad); - } - } - } diff --git a/apps/bitserial_conv/main.c b/apps/bitserial_conv/main.c deleted file mode 100644 index eb5d3fc91..000000000 --- a/apps/bitserial_conv/main.c +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Author: Matheus Cavalcante, ETH Zurich - -#include -#include - -#include "printf.h" - -int main() { - printf("Ariane says Hello!\n"); - - return 0; -} diff --git a/apps/bitserial_matmul/a.out b/apps/bitserial_matmul/a.out deleted file mode 100755 index 471d8245b..000000000 Binary files a/apps/bitserial_matmul/a.out and /dev/null differ diff --git a/apps/bitserial_matmul/kernel/bitserial_matmul.c b/apps/bitserial_matmul/kernel/bitserial_matmul.c deleted file mode 100644 index 41da0a4d7..000000000 --- a/apps/bitserial_matmul/kernel/bitserial_matmul.c +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 -// -// Author: MohammadHossein AskariHemmat - -#include "bitserial_matmul.h" - -void bitserial_matmul_init() { - asm volatile("vmv.v.i v0, 0"); - asm volatile("vmv.v.i v4, 0"); - asm volatile("vmv.v.i v8, 0"); - asm volatile("vmv.v.i v12, 0"); - asm volatile("vmv.v.i v16, 0"); -} - -// given the input arrays with bit precision prec in bitpack format, this function -// calculates the Matmul of the two Matrix and returns the resulting Matrix. -// NOTE: a and b must have the same length. -// NOTE: a and b must be in bit packed format, the retuning matrix is in normal format. -// c[64x64] = a[64]*b[64] -void bitserial_matmul_64(uint64_t* c, uint64_t* a, uint64_t* b, int aprec, int bprec) { - uint64_t vl=0; - // Original pointers - const uint64_t *a_ = a; - const uint64_t *b_ = b; - uint64_t elen = 64; - asm volatile("vsetvli %0, %1, e64, m1, ta, ma \n" : "+r" (vl) : "r" (elen)); - // start with a fresh temp registers - bitserial_matmul_init(); - for (int row=0; row<64; row++){ - // The following loop will compute one row of a 64 element output - for (int i=0; i - -#ifndef BITSERIAL_MATMUL_H -#define BITSERIAL_MATMUL_H - -#include - -void bitserial_matmul_init(); -void bitserial_matmul_64(uint64_t* c, uint64_t* a, uint64_t* b, int aprec, int bprec); - -#endif diff --git a/apps/bitserial_matmul/main.c b/apps/bitserial_matmul/main.c deleted file mode 100644 index ec75485b0..000000000 --- a/apps/bitserial_matmul/main.c +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 -// -// Author: MohammadHossein AskariHemmat - -#include -#include - -#include "printf.h" -#include "runtime.h" -#include "kernel/bitserial_matmul.h" - -// Define Matrix dimensions: -// C = AB with A=[MxN], B=[NxP], C=[MxP] -#define M 64 -#define N 64 -#define P 64 - -uint64_t a[M * N] __attribute__((aligned(32 * NR_LANES), section(".l2"))); -uint64_t b[N * P] __attribute__((aligned(32 * NR_LANES), section(".l2"))); -uint64_t c[M * P] __attribute__((aligned(32 * NR_LANES), section(".l2"))); - -// Initialize the matrices -void init_matrix(uint64_t *matrix, uint64_t num_rows, uint64_t num_columns) { - for (uint64_t i = 0; i < num_rows; ++i) { - for (uint64_t j = 0; j < num_columns; ++j) { - matrix[i * num_columns + j] = i; - } - } -} - - -int main(void) { - const int s = 64; - const int prec = 2; - printf("\n"); - printf("------------------------------------------------------------\n"); - printf("Calculating a bit-serialized (%d x %d) x (%d x %d) matrix multiplication with 2 bit precision...\n", s, - s, s, s); - printf("------------------------------------------------------------\n"); - printf("\n"); - // Initialize Matrices - printf("Initializing matrices...\n"); - // init_matrix(a, s, s); - // init_matrix(b, s, s); - // Matrices are initialized --> Start calculating - printf("Calculating bitserial_matmul_64...\n"); - start_timer(); - bitserial_matmul_64(c, a, b, 1, prec); - stop_timer(); - // Metrics - int64_t runtime = get_timer(); - float performance = 2.0 * s * s / runtime; - float utilization = 100 * performance / (2.0 * NR_LANES); - - printf("The execution took %d cycles.\n", runtime); - printf("The performance is %f OP/cycle (%f%% utilization).\n", performance, - utilization); - -} diff --git a/apps/bitserial_matmul/vpopcnt b/apps/bitserial_matmul/vpopcnt deleted file mode 100755 index 471d8245b..000000000 Binary files a/apps/bitserial_matmul/vpopcnt and /dev/null differ diff --git a/apps/bitserial_pack/data.S b/apps/bitserial_pack/data.S deleted file mode 100644 index e69de29bb..000000000 diff --git a/apps/bitserial_pack/kernel/bitserial_pack.c b/apps/bitserial_pack/kernel/bitserial_pack.c deleted file mode 100644 index fe8b72e43..000000000 --- a/apps/bitserial_pack/kernel/bitserial_pack.c +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 -// -// Author: MohammadHossein AskariHemmat - -#include "bitserial_pack.h" - -void bitserial_pack_init() { - asm volatile("vmv.v.i v0, 0"); - asm volatile("vmv.v.i v1, 0"); - asm volatile("vmv.v.i v2, 0"); -} -// Assuming an array of 2bit precision data is packed into 8bit chars, this function, -// de-reference these chars into 64b words. Each call to VBPACK will consume 4 words -// (4 lanes) of 64 bits. -void bitserial_pack_64(uint64_t* mat_i, uint64_t* out, uint64_t len_mat_i) { - uint64_t vl=0; - uint64_t complete_words = len_mat_i/4; - uint64_t residuals = len_mat_i%4; - uint64_t elm_cnt = 0; - // Make sure SEW is set to 64 - asm volatile("vsetvli zero, %0, e64, m1, ta, ma" ::"r"(4)); - // start with a fresh temp registers - bitserial_pack_init(); - for (uint64_t n=0; n<4; n++){ - for (uint64_t i=0; i<4; i++){ - asm volatile("vle64.v v1, (%[A])" : : [A] "r" (mat_i)); - asm volatile(".byte 0x57, 0x81, 0x20, 0x0E\n" ::); - } - asm volatile("vse64.v v2, (%0)" : "+&r"(out)); - out+= 4; - } -} \ No newline at end of file diff --git a/apps/bitserial_pack/kernel/bitserial_pack.h b/apps/bitserial_pack/kernel/bitserial_pack.h deleted file mode 100644 index 52009c7c1..000000000 --- a/apps/bitserial_pack/kernel/bitserial_pack.h +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 -// -// Author: MohammadHossein AskariHemmat - -#ifndef BITSERIAL_PACK_H -#define BITSERIAL_PACK_H - -#include - -void bitserial_pack_init(); -// 2 bit precision packing -void bitserial_pack_64(uint64_t* mat_i, uint64_t* out, uint64_t len_mat_i); - -#endif - - diff --git a/apps/bitserial_pack/main.c b/apps/bitserial_pack/main.c deleted file mode 100644 index 8cd1436a4..000000000 --- a/apps/bitserial_pack/main.c +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2022 ETH Zurich, University of Bologna and Polytechnique Montreal. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 -// -// Author: MohammadHossein AskariHemmat - -#include -#include - -#include "printf.h" -#include "runtime.h" -#include -#include "kernel/bitserial_pack.h" - -// Define Matrix dimensions: -// C = AB with A=[MxN], B=[NxP], C=[MxP] -#define M 64 -#define N 64 -#define P 64 - -uint64_t a[M * N] __attribute__((aligned(32 * NR_LANES), section(".l2"))); -// uint64_t b[N * P] __attribute__((aligned(32 * NR_LANES), section(".l2"))); -uint64_t c[M * P] __attribute__((aligned(32 * NR_LANES), section(".l2"))); - -// Initialize the matrices -void init_matrix(uint64_t *matrix, uint64_t num_rows, uint64_t num_columns) { - for (uint64_t i = 0; i < num_rows; ++i) { - for (uint64_t j = 0; j < num_columns; ++j) { - matrix[i * num_columns + j] = 0x0202020202020202; - } - } -} - - -int main(void) { - const int s = 4; - // Initialize Matrices - printf("Initializing matrices...\n"); - init_matrix(a, s, s); - // init_matrix(b, s, s); - // Matrices are initialized --> Start calculating - for (int i=0; i