pulp-platform · mbertuletti · Feb 13, 2023 · Mar 17, 2023 · Apr 4, 2023 · Apr 25, 2023
@@ -218,6 +218,7 @@ toolchain/riscv-opcodes/*:
 
 format:
 	$(ROOT_DIR)/scripts/run_clang_format.py --clang-format-executable=$(LLVM_INSTALL_DIR)/bin/clang-format -i -r $(ROOT_DIR)
+	find ./software/data -name '*.py' -exec autopep8 --in-place --aggressive {} +
 
 clean: clean-riscv-tests
 	rm -rf $(INSTALL_DIR)
@@ -14,3 +14,5 @@ pandas
 progressbar2
 tabulate
 sympy
+scipy
+pyflexfloat
@@ -22,8 +22,18 @@ ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py))
 BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
 ALL := $(APPS)
 
-ALL_GCC := $(filter-out matmul_f16 matmul_f32, $(ALL))
-ALL_LLVM := $(filter-out synth_i32 chest_q16 cfft_radix2_q16 cfft_radix4_q16, $(ALL))
+# Applications removed from the GCC build list (see ALL_GCC below).
+# NOTE(review): going by the names these are the floating-point (f16/f32)
+# kernels, presumably not buildable with the GCC toolchain -- confirm.
+FP_APPS := axpy_f16 axpy_f32
+FP_APPS += cfft_radix4_f16 chest_f16 cholesky_f16
+FP_APPS += cmatmul_f16 matmul_f16 matmul_f32
+FP_APPS += dotp_f16 dotp_f32
+FP_APPS += mimo_mmse_f32 mimo_mmse_f16 ofdm
+
+# Applications removed from the LLVM build list (see ALL_LLVM below).
+# NOTE(review): going by the names these are integer / fixed-point kernels,
+# presumably relying on GCC-only features -- confirm.
+I_APPS := synth_i32
+I_APPS += cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32
+I_APPS += cmatmul_q16 mimo_mmse_q16
+
+# Per-toolchain application lists: everything in ALL except the apps excluded
+# for that toolchain.
+ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
+ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
 
 # Make all applications
 all: $(ALL_GCC)

@@ -0,0 +1,71 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#include "data_axpy_f16.h"
+// Product of the NUM_CORES and BANKING_FACTOR macros (both defined outside
+// this file). Used below as the byte alignment of the kernel vectors.
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+
+// Vectors for kernel computation: LEN half-precision elements each, placed in
+// the ".l1_prio" section. main() fills them from l2_X / l2_Y via DMA and later
+// compares l1_Y against l2_out.
+// NOTE(review): LEN, l2_X, l2_Y and l2_out presumably come from
+// data_axpy_f16.h -- confirm.
+__fp16 l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+__fp16 l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+
+#include "baremetal/mempool_axpy_f16.h"
+#include "baremetal/mempool_checks.h"
+
+// Benchmark driver for the half-precision AXPY kernel; runs on every core.
+// Core 0 stages the operands with the DMA, all cores run the kernel between
+// two timer reads, core 0 prints the cycle count, and l1_Y is finally compared
+// against the reference vector l2_out.
+int main() {
+
+  uint32_t cid = mempool_get_core_id();
+  uint32_t n_cores = mempool_get_core_count();
+  uint32_t t_start = 0;
+  uint32_t t_stop = 0;
+  mempool_barrier_init(cid);
+
+  // Only core 0 programs the DMA; LEN two-byte elements per vector.
+  if (cid == 0) {
+    dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int16_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int16_t));
+  }
+  // Reinterpret the storage of A as a 32-bit word and keep its low 16 bits.
+  // NOTE(review): presumably A is the __fp16 scalar from the data header and
+  // the kernel expects its raw bit pattern -- confirm.
+  register volatile uint32_t a = (*(uint32_t *)&(A)) & 0x0000FFFF;
+  mempool_barrier(n_cores);
+
+  // Alternative kernels, kept for reference:
+  //
+  //  // SINGLE
+  //  t_start = mempool_get_timer();
+  //  axpy_f16s(A, l1_X, l1_Y, LEN);
+  //  t_stop = mempool_get_timer();
+  //
+  //  // PARALLEL
+  //  t_start = mempool_get_timer();
+  //  axpy_f16vecp_unrolled4(A, l1_X, l1_Y, LEN, n_cores);
+  //  t_stop = mempool_get_timer();
+
+  // PARALLEL, LOCAL ACCESSES
+  t_start = mempool_get_timer();
+  // axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
+  mempool_start_benchmark();
+  axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
+  mempool_stop_benchmark();
+  t_stop = mempool_get_timer();
+
+  // Every core must be done with the kernel before results are inspected.
+  mempool_barrier(n_cores);
+  // Check results
+  if (cid == 0) {
+    printf("\nKernel execution takes %d clock cycles\n", t_stop - t_start);
+  }
+  mempool_check_f16(l1_Y, l2_out, 100, 0.1f, 0);
+  mempool_barrier(n_cores);
+
+  return 0;
+}
@@ -0,0 +1,59 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#include "data_axpy_f32.h"
+// Product of the NUM_CORES and BANKING_FACTOR macros (both defined outside
+// this file). Used below as the byte alignment of the kernel vectors.
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+
+// Vectors for kernel computation: LEN single-precision elements each, placed
+// in the ".l1_prio" section. main() fills them from l2_X / l2_Y via DMA and
+// later compares l1_Y against l2_out.
+// NOTE(review): LEN, l2_X, l2_Y and l2_out presumably come from
+// data_axpy_f32.h -- confirm.
+float l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+float l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+
+#include "baremetal/mempool_axpy_f32.h"
+#include "baremetal/mempool_checks.h"
+
+// Benchmark driver for the single-precision AXPY kernel; runs on every core.
+// Core 0 stages the operands with the DMA, all cores run the kernel between
+// two timer reads, core 0 prints the cycle count, and l1_Y is finally compared
+// against the reference vector l2_out.
+int main() {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t time_init, time_end;
+  mempool_barrier_init(core_id);
+
+  time_init = 0;
+  time_end = 0;
+  // Only core 0 programs the DMA; the copy size is expressed in the element
+  // type of the destination arrays.
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(float));
+    dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(float));
+  }
+  float register volatile a = A;
+  // Nobody starts computing before the operands have been copied.
+  mempool_barrier(num_cores);
+
+  // PARALLEL
+  time_init = mempool_get_timer();
+  // axpy_f32p(a, l1_X, l1_Y, LEN, num_cores);
+  // axpy_f32p_unrolled4(a, l1_X, l1_Y, LEN, num_cores);
+  axpy_f32p_local_unrolled4(a, l1_X, l1_Y, LEN);
+  time_end = mempool_get_timer();
+
+  // Wait for every core to finish its share of the kernel before l1_Y is
+  // checked (same sequence as the f16 AXPY application). The timer has already
+  // been read, so the barrier does not affect the reported cycle count.
+  mempool_barrier(num_cores);
+  // Check results
+  if (core_id == 0) {
+    uint32_t clock_cycles = (time_end - time_init);
+    printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
+  }
+  mempool_check_f32(l1_Y, l2_out, 100, 0.1f, 0);
+  mempool_barrier(num_cores);
+
+  return 0;
+}
@@ -7,7 +7,7 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "baremetal/mempool_axpy_i32p.h"
+#include "baremetal/mempool_axpy_i32.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"

@@ -0,0 +1,182 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Mempool runtime libraries */
+#include "builtins_v2.h"
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+/* CFFT data libraries */
+#include "data_cfft_radix4_f16.h"
+
+/* CHOOSE ONE */
+//#define PARALLEL // Parallel FFT not "memory-aware".
+//#define FOLDED // Parallel FFT with "memory-aware" load/store.
+#define SCHEDULED // Folded FFTs arranged in rows and cols.
+
+// Bitreversal index from table.
+#define BITREVERSETABLE
+// Independent FFTs scheduled on one row (default 1).
+#define N_FFTs_ROW 1
+// Independent FFTs scheduled on columns (default 1).
+#define N_FFTs_COL 1
+#if (N_FFTs_COL > MAX_COL)
+#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
+#endif
+// Also the twiddles have "memory-aware" load/stores.
+#define FOLDED_TWIDDLES
+
+#include "baremetal/mempool_cfft_q16_bitreversal.h"
+#include "baremetal/mempool_checks.h"
+#include "baremetal/mempool_radix4_cfft_butterfly_f16.h"
+#include "baremetal/mempool_radix4_cfft_f16p.h"
+
+// L1 working buffers. Two alternative layouts are declared with the same
+// names; exactly one of the two groups must be enabled (see "CHOOSE ONE"
+// above), otherwise the arrays would be declared twice with different sizes.
+
+// Layout for SINGLE / PARALLEL: 2 * N_CSAMPLES halfwords per data buffer and
+// 2 * 3 * N_CSAMPLES / 4 halfwords per twiddle buffer, 4-byte aligned.
+// NOTE(review): the factor 2 presumably accounts for interleaved real and
+// imaginary parts -- confirm.
+#if (defined(SINGLE) || defined(PARALLEL))
+__fp16 l1_pSrc[2 * N_CSAMPLES]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_pDst[2 * N_CSAMPLES]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_src[2 * 3 * N_CSAMPLES / 4]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_dst[2 * 3 * N_CSAMPLES / 4]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+#endif
+
+// Layout for SCHEDULED / FOLDED: buffers are sized in multiples of N_BANKS
+// (N_FFTs_ROW * 8 * N_BANKS halfwords for data, 8 * N_BANKS for twiddles) and
+// aligned to 4 * N_BANKS bytes. The bit-reversal table keeps its
+// BITREVINDEXTABLE_LENGTH 16-bit entries.
+#if (defined(SCHEDULED) || defined(FOLDED))
+__fp16 l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+__fp16 l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_src[8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_dst[8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+#endif
+
+// Benchmark driver for the half-precision radix-4 complex FFT; runs on every
+// core. It (1) copies input samples, twiddles and the bit-reversal table from
+// the l2_* arrays into the l1_* buffers, (2) runs the FFT variant selected at
+// compile time (PARALLEL, FOLDED or SCHEDULED), and (3) compares the result
+// buffer against l2_pRes with mempool_check_f16.
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  // Buffer holding the final result; set by the selected computation branch.
+  __fp16 *pRes = (__fp16 *)0;
+  mempool_barrier_init(core_id);
+
+  /* INITIALIZATION */
+
+#if (defined(SINGLE) || defined(PARALLEL))
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
+    dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
+                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
+    dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
+                        BITREVINDEXTABLE_LENGTH * sizeof(int16_t));
+    printf("01: END INITIALIZATION\n");
+  }
+  mempool_barrier(num_cores);
+#endif
+
+#if (defined(SCHEDULED) || defined(FOLDED))
+
+  if (core_id == 0) {
+    // Replicate the same input vector for every scheduled FFT: column i is
+    // offset by 2 * N_CSAMPLES halfwords, row j by 8 * N_BANKS halfwords.
+    for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
+      for (uint32_t i = 0; i < N_FFTs_COL; i++) {
+        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS),
+                            l2_pSrc, N_CSAMPLES * sizeof(int32_t));
+      }
+    }
+    // The table holds BITREVINDEXTABLE_LENGTH 16-bit entries, so the copy size
+    // must use a 2-byte element; a 4-byte element would write past the end of
+    // l1_BitRevIndexTable.
+    dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
+                        BITREVINDEXTABLE_LENGTH * sizeof(int16_t));
+  }
+  mempool_barrier(num_cores);
+
+#ifdef FOLDED_TWIDDLES
+  // All cores cooperate: each copies a strided share of the twiddles, one
+  // 32-bit pair at a time, into three regions spaced N_BANKS pairs apart.
+  for (uint32_t j = 0; j < N_FFTs_COL; j++) {
+    uint32_t N_WORDS_COL = N_CSAMPLES >> 2;
+    for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
+      *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL)] =
+          *(v2h *)&l2_twiddleCoef_f16[2 * i];
+      *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] =
+          *(v2h *)&l2_twiddleCoef_f16[2 * (i * 2U)];
+      *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] =
+          *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)];
+    }
+  }
+#else
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
+                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
+  }
+#endif
+  mempool_barrier(num_cores);
+
+  if (core_id == 0) {
+    printf("01: END INITIALIZATION\n");
+  }
+  mempool_barrier(num_cores);
+#endif
+
+  /* COMPUTATION */
+
+#ifdef PARALLEL
+  mempool_start_benchmark();
+  mempool_radix4_cfft_f16p(l1_pSrc, N_CSAMPLES, l1_twiddleCoef_f16_src, 1,
+                           num_cores);
+  mempool_bitrevtable_q16p_xpulpimg((int16_t *)l1_pSrc, BITREVINDEXTABLE_LENGTH,
+                                    l1_BitRevIndexTable, num_cores);
+  mempool_stop_benchmark();
+  pRes = l1_pSrc;
+#endif
+
+#ifdef FOLDED
+  // Only the first N_CSAMPLES / 16 cores take part in the folded FFT.
+  if (core_id < (N_CSAMPLES / 16)) {
+    mempool_start_benchmark();
+    mempool_radix4_cfft_f16p_folded(l1_pSrc, l1_pDst, N_CSAMPLES,
+                                    l1_twiddleCoef_f16_src,
+                                    l1_twiddleCoef_f16_dst, (N_CSAMPLES / 16));
+    // The result buffer is chosen from the parity of LOG2 / 2.
+    pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
+    mempool_bitrevtable_q16p_xpulpimg((int16_t *)pRes, BITREVINDEXTABLE_LENGTH,
+                                      l1_BitRevIndexTable, (N_CSAMPLES / 16));
+    mempool_stop_benchmark();
+  }
+#endif
+
+#ifdef SCHEDULED
+  uint32_t CORES_USED = (N_CSAMPLES / 4) / BANKING_FACTOR;
+  // Only the cores assigned to one of the N_FFTs_COL column FFTs take part.
+  if (core_id < N_FFTs_COL * CORES_USED) {
+    mempool_start_benchmark();
+    mempool_radix4_cfft_f16p_scheduler(
+        l1_pSrc, l1_pDst, N_CSAMPLES, N_FFTs_ROW, N_FFTs_COL,
+        l1_twiddleCoef_f16_src, l1_twiddleCoef_f16_dst, l1_BitRevIndexTable,
+        BITREVINDEXTABLE_LENGTH, 1, CORES_USED);
+    mempool_log_partial_barrier(2, core_id, N_FFTs_COL * CORES_USED);
+    mempool_stop_benchmark();
+  }
+  // The result buffer is chosen from the parity of LOG2 / 2; the choice is
+  // inverted when BITREVERSETABLE is not defined.
+#ifdef BITREVERSETABLE
+  pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
+#else
+  pRes = ((LOG2 / 2) % 2) == 0 ? l1_pDst : l1_pSrc;
+#endif
+#endif
+
+  // Cores that skipped the computation wait here for the active ones.
+  mempool_barrier(num_cores);
+  if (core_id == 0) {
+    printf("02: END COMPUTATION\n");
+  }
+
+  mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.05f, 0);
+  mempool_barrier(num_cores);
+  return 0;
+}