From 13ef7599e18cc7599bbef69b94224b8d5ec93210 Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Wed, 16 Oct 2024 08:46:23 +0200 Subject: [PATCH] [software] Clean up folded MIMO-MMSE and solution of triangular system --- software/apps/baremetal/mimo_mmse_f16/main.c | 87 +++-- software/apps/baremetal/mimo_mmse_f32/main.c | 120 +++---- software/apps/baremetal/mimo_mmse_f8/main.c | 67 ++-- software/apps/baremetal/mimo_mmse_q16/main.c | 8 +- software/data/data_mimo_mmse_f32.h.tpl | 2 +- software/data/generate_mimo_mmse.py | 4 +- software/kernels/baremetal/mempool_checks.h | 51 +-- .../kernels/baremetal/mempool_cholesky_f16s.h | 179 ++++------ .../kernels/baremetal/mempool_cholesky_f32s.h | 109 +----- .../baremetal/mempool_linearsolver_f16s.h | 311 ++++-------------- .../baremetal/mempool_linearsolver_f32s.h | 193 ++--------- .../baremetal/mempool_linearsolver_q16s.h | 84 +---- .../baremetal/mempool_mimo_mmse_f16s.h | 110 +++---- .../baremetal/mempool_mimo_mmse_f32p.h | 55 +--- .../baremetal/mempool_mimo_mmse_f32s.h | 92 ++---- .../kernels/baremetal/mempool_mimo_mmse_f8s.h | 31 +- 16 files changed, 467 insertions(+), 1036 deletions(-) diff --git a/software/apps/baremetal/mimo_mmse_f16/main.c b/software/apps/baremetal/mimo_mmse_f16/main.c index 218f4d247..ffd879c91 100644 --- a/software/apps/baremetal/mimo_mmse_f16/main.c +++ b/software/apps/baremetal/mimo_mmse_f16/main.c @@ -18,9 +18,9 @@ #include "baremetal/mempool_mimo_mmse_f16s.h" #include "data_mimo_mmse_f16.h" -#define ZF (0) // When asserted use zero-forcing +#define ZF (0) // When asserted use zero-forcing +#define FOLD (0) // When asserted fold matrices in memory #define NUM_BANKS (BANKING_FACTOR * NUM_CORES) -//#define DOUBLE_BUFFERING /********************************************************** ********************************************************** @@ -35,13 +35,21 @@ #ifndef DOUBLE_BUFFERING -__fp16 l1_H[2 * N_TX * N_RX * N_ITR] +#if FOLD +#define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS)) +__fp16 l1_G[2 * N_TX * NUM_BANKS * NUM_ROW] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +__fp16 l1_L[2 * N_TX * NUM_BANKS * NUM_ROW] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +#else __fp16 l1_G[2 * N_TX * N_TX * N_ITR] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); __fp16 l1_L[2 * N_TX * N_TX * N_ITR] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +#endif +__fp16 l1_H[2 * N_TX * N_RX * N_ITR] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); __fp16 l1_S[2 * N_TX * N_ITR] __attribute__((aligned(sizeof(int32_t)), section(".l1"))); __fp16 l1_y[2 * N_RX * N_ITR] @@ -62,9 +70,9 @@ int main() { mempool_barrier_init(core_id); // Initialize barrier and synchronize #endif -#ifdef BANSHEE /* Initialize matrices */ if (core_id == 0) { +#ifdef BANSHEE for (uint32_t i = 0; i < N_RX * N_TX * N_ITR; i++) { (*(uint32_t *)&l1_H[2 * i]) = *(uint32_t *)&l2_H[2 * i]; } @@ -74,14 +82,14 @@ int main() { for (uint32_t i = 0; i < N_TX * N_ITR; i++) { (*(uint32_t *)&l1_S[2 * i]) = *(uint32_t *)&l2_S[2 * i]; } - } #else - /* Initialize matrices */ - if (core_id == 0) { dma_memcpy_blocking(l1_H, l2_H, 2 * N_TX * N_RX * N_ITR * sizeof(int16_t)); dma_memcpy_blocking(l1_y, l2_y, 2 * N_RX * N_ITR * sizeof(int16_t)); dma_memcpy_blocking(l1_S, l2_S, 2 * N_TX * N_ITR * sizeof(int16_t)); +#endif + printf("Data transferred\n"); } +#ifndef BANSHEE mempool_barrier(num_cores); #endif @@ -99,16 +107,16 @@ int main() { __fp16 *Ptry3 = y3 + itr * (2 * N_TX); __fp16 *Ptrx = l1_x + itr * (2 * N_TX); #ifdef VEC - mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF); + mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, FOLD, ZF); mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX); - mempool_cholesky_f16vecs(PtrG, PtrL, N_TX); + mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, 0); #else - mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, 0, ZF); - mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0); - mempool_cholesky_f16s(PtrG, PtrL, N_TX); + mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, FOLD, ZF); + mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX); + mempool_cholesky_f16s(PtrG, PtrL, N_TX, 0); #endif - mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX); - mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX); + mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD); + mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD); } mempool_stop_benchmark(); } @@ -116,35 +124,51 @@ int main() { #ifdef PARALLEL mempool_start_benchmark(); + uint32_t time_init = mempool_get_timer(); // Parallel subcarrier loop for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) { + __fp16 *PtrH = l1_H + itr * (2 * N_TX * N_RX); __fp16 *Ptry = l1_y + itr * (2 * N_RX); __fp16 *PtrS = l1_S + itr * (2 * N_TX); // Auxiliary vectors +#if FOLD + __fp16 *PtrG = l1_G + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) + + (itr / NUM_ROW) * (2 * N_TX); + __fp16 *PtrL = l1_L + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) + + (itr / NUM_ROW) * (2 * N_TX); + __fp16 *Ptry2 = + y2 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX); + __fp16 *Ptry3 = + y3 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX); + __fp16 *Ptrx = l1_x + itr * (2 * N_TX); +#else __fp16 *PtrG = l1_G + itr * (2 * N_TX * N_TX); __fp16 *PtrL = l1_L + itr * (2 * N_TX * N_TX); __fp16 *Ptry2 = y2 + itr * (2 * N_TX); __fp16 *Ptry3 = y3 + itr * (2 * N_TX); __fp16 *Ptrx = l1_x + itr * (2 * N_TX); +#endif + #ifdef VEC - mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF); + mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD); mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX); - mempool_cholesky_f16vecs(PtrG, PtrL, N_TX); + mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, FOLD); #else - mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, 0, ZF); - mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0); - mempool_cholesky_f16s(PtrG, PtrL, N_TX); + mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD); + mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX); + mempool_cholesky_f16s(PtrG, PtrL, N_TX, FOLD); #endif - mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX); - mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX); + mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD); + mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD); } mempool_barrier(num_cores); + uint32_t time_end = mempool_get_timer(); mempool_stop_benchmark(); #endif - // Check the result #ifdef BANSHEE + // Check the result if (core_id == 0) { for (uint32_t i = 0; i < 2 * N_TX * N_ITR; i++) { uint32_t x = (*(uint32_t *)&l1_x[i]) & 0x0000FFFF; @@ -152,6 +176,9 @@ int main() { } } #else + if (core_id == 0) { + printf("Runtime: %d\n", time_end - time_init); + } mempool_barrier(num_cores); #endif @@ -264,11 +291,11 @@ int main() { __fp16 *PtrL = L + itr * (2 * N_TX * N_TX); __fp16 *Ptry2 = y2 + itr * (2 * N_TX); __fp16 *Ptry3 = y3 + itr * (2 * N_TX); - mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF); + mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, 0, ZF); mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX); - mempool_cholesky_f16vecs(PtrG, PtrL, N_TX); - mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX); - mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX); + mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, 0); + mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, 0); + mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, 0); } #endif @@ -291,7 +318,7 @@ int main() { __fp16 *PtrS = cmpt_S + itr * (2 * N_TX); __fp16 *PtrG = G + itr * (2 * N_TX * N_TX); __fp16 *Ptry2 = y2 + itr * (2 * N_TX); - mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF); + mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, 0, ZF); mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX); } mempool_log_barrier(2, core_id); @@ -313,9 +340,9 @@ int main() { __fp16 *PtrL = L + itr * (2 * N_TX * N_TX); __fp16 *Ptry2 = y2 + itr * (2 * N_TX); __fp16 *Ptry3 = y3 + itr * (2 * N_TX); - mempool_cholesky_f16vecs(PtrG, PtrL, N_TX); - mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX); - mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX); + mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, 0); + mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, 0); + mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, 0); } #endif diff --git a/software/apps/baremetal/mimo_mmse_f32/main.c b/software/apps/baremetal/mimo_mmse_f32/main.c index aa95b1919..194c4c71c 100644 --- a/software/apps/baremetal/mimo_mmse_f32/main.c +++ b/software/apps/baremetal/mimo_mmse_f32/main.c @@ -21,25 +21,25 @@ #include "data_mimo_mmse_f32.h" -//#define SINGLE -//#define JACOBI -#define PARALLEL +#define SINGLE float l1_H[2 * N_TX * N_RX * N_ITR] - __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)), - section(".l1_prio"))); + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); float l1_G[2 * N_TX * N_TX * N_ITR] - __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)), - section(".l1_prio"))); + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); float l1_L[2 * N_TX * N_TX * N_ITR] - __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)), - section(".l1_prio"))); -float l1_Sigma[N_TX * N_ITR] __attribute__((section(".l1_prio"))); - -float l1_y[2 * N_RX * N_ITR] __attribute__((section(".l1_prio"))); -float y2[2 * N_TX * N_ITR] __attribute__((section(".l1_prio"))); -float y3[2 * N_TX * N_ITR] __attribute__((section(".l1_prio"))); -float l1_x[2 * N_TX * N_ITR] __attribute__((section(".l1_prio"))); + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +float l1_S[2 * N_TX * N_ITR] + __attribute__((aligned(sizeof(int32_t)), section(".l1"))); +float l1_y[2 * N_RX * N_ITR] + __attribute__((aligned(sizeof(int32_t)), section(".l1"))); + +float y2[2 * N_TX * N_ITR] + __attribute__((aligned(sizeof(int32_t)), section(".l1"))); +float y3[2 * N_TX * N_ITR] + __attribute__((aligned(sizeof(int32_t)), section(".l1"))); +float l1_x[2 * N_TX * N_ITR] + __attribute__((aligned(sizeof(int32_t)), section(".l1"))); // Driver program int main() { @@ -52,7 +52,7 @@ int main() { if (core_id == 0) { dma_memcpy_blocking(l1_H, l2_H, 2 * N_ITR * N_RX * N_TX * sizeof(int32_t)); dma_memcpy_blocking(l1_y, l2_y, 2 * N_ITR * N_RX * sizeof(int32_t)); - dma_memcpy_blocking(l1_Sigma, l2_Sigma, N_ITR * N_TX * sizeof(int32_t)); + dma_memcpy_blocking(l1_S, l2_S, 2 * N_ITR * N_TX * sizeof(int32_t)); } mempool_barrier(num_cores); @@ -60,25 +60,15 @@ int main() { /* Benchmark */ if (core_id == 0) { mempool_start_benchmark(); - mempool_hermitian_f32s(l1_H, l1_G, l1_Sigma, N_RX, N_TX, 0, 0); + mempool_hermitian_f32s(l1_H, l1_G, l1_S, N_RX, N_TX, 0, 0); mempool_MVP_conjtransp_f32s(l1_H, l1_y, y2, N_RX, N_TX, 0); - mempool_cholesky_f32s(l1_G, l1_L, N_TX); - mempool_Ltrisol_f32s(l1_L, y2, y3, N_TX); - mempool_Lttrisol_f32s(l1_L, y3, l1_x, N_TX); - mempool_stop_benchmark(); - } - mempool_barrier(num_cores); -#endif - #ifdef JACOBI - /* Benchmark */ - if (core_id == 0) { - mempool_start_benchmark(); - mempool_hermitian_f32s(l1_H, l1_G, l1_Sigma, N_RX, N_TX, 0, 0); - mempool_MVP_conjtransp_f32s(l1_H, l1_y, y2, N_RX, N_TX, 0); - mempool_stop_benchmark(); - mempool_start_benchmark(); mempool_jacobi_f32s(l1_G, y2, l1_x, N_TX, 0.005f, 20U); +#else + mempool_cholesky_f32s(l1_G, l1_L, N_TX, 0); + mempool_Ltrisol_f32s(l1_L, y2, y3, N_TX, 0, 0); + mempool_Ltrisol_f32s(l1_L, y3, l1_x, N_TX, 1, 0); +#endif mempool_stop_benchmark(); } mempool_barrier(num_cores); @@ -88,21 +78,35 @@ int main() { // Each iteration is assigned to a processor mempool_start_benchmark(); for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) { + // Inputs float *PtrH = l1_H + itr * (2 * N_TX * N_RX); - float *PtrSigma = l1_Sigma + itr * N_TX; + float *PtrS = l1_S + itr * (2 * N_TX); float *Ptry = l1_y + itr * (2 * N_RX); // Intermediate results and outputs +#if FOLD + __fp16 *PtrG = l1_G + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) + + (itr / NUM_ROW) * (2 * N_TX); + __fp16 *PtrL = l1_L + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) + + (itr / NUM_ROW) * (2 * N_TX); + __fp16 *Ptry2 = + y2 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX); + __fp16 *Ptry3 = + y3 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX); + __fp16 *Ptrx = l1_x + itr * (2 * N_TX); +#else float *PtrG = l1_G + itr * (2 * N_TX * N_TX); float *PtrL = l1_L + itr * (2 * N_TX * N_TX); float *Ptry2 = y2 + itr * (2 * N_TX); float *Ptry3 = y3 + itr * (2 * N_TX); float *Ptrx = l1_x + itr * (2 * N_TX); - mempool_hermitian_f32s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0); - mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0); - mempool_cholesky_f32s(PtrG, PtrL, N_TX); - mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX); - mempool_Lttrisol_f32s(PtrL, Ptry3, Ptrx, N_TX); +#endif + + mempool_hermitian_f32s(PtrH, PtrG, PtrS, N_RX, N_TX, 0, FOLD); + mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX); + mempool_cholesky_f32s(PtrG, PtrL, N_TX, FOLD); + mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD); + mempool_Ltrisol_f32s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD); } mempool_log_barrier(2, core_id); mempool_stop_benchmark(); @@ -118,9 +122,9 @@ int main() { for (uint32_t itr = pool_id; itr < N_ITR; itr += num_pools) { float *PtrH = l1_H + itr * (2 * N_TX * N_RX); float *PtrG = l1_G + itr * (2 * N_TX * N_TX); - float *PtrSigma = l1_Sigma + itr * N_TX; - mempool_hermitian_f32p(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0, - core_id % N_TX, N_TX); + float *PtrS = l1_S + itr * N_TX; + mempool_hermitian_f32p(PtrH, PtrG, PtrS, N_RX, N_TX, 0, 0, core_id % N_TX, + N_TX); } mempool_stop_benchmark(); mempool_start_benchmark(); @@ -135,37 +139,9 @@ int main() { float *Ptry3 = y3 + itr * (2 * N_TX); float *Ptrx = l1_x + itr * (2 * N_TX); mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0); - mempool_cholesky_f32s(PtrG, PtrL, N_TX); - mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX); - mempool_Lttrisol_f32s(PtrL, Ptry3, Ptrx, N_TX); - } - mempool_log_barrier(2, core_id); - mempool_stop_benchmark(); -#endif - -#if defined(FOLDED) && defined(__XDIVSQRT) - mempool_start_benchmark(); - for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) { - // Inputs - float *PtrH = l1_H + itr * (2 * N_TX * N_RX); - float *PtrSigma = l1_Sigma + itr * N_TX; - float *Ptry = l1_y + itr * (2 * N_RX); - // Intermediate results and outputs - float *PtrG = l1_G + (itr % num_cores) * N_TX + - (itr / num_cores) * (2 * N_TX * N_BANKS); - float *PtrL = l1_L + (itr % num_cores) * N_TX + - (itr / num_cores) * (2 * N_TX * N_BANKS); - float *Ptry2 = - y2 + (itr % num_cores) * N_TX + (itr / num_cores) * (2 * N_BANKS); - float *Ptry3 = - y3 + (itr % num_cores) * N_TX + (itr / num_cores) * (2 * N_BANKS); - float *Ptrx = - l1_x + (itr % num_cores) * N_TX + (itr / num_cores) * (2 * N_BANKS); - mempool_hermitian_f32s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 1, 0); - mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX, 1); - mempool_cholesky_folded_f32s(PtrG, PtrL, N_TX); - mempool_Ltrisol_folded_f32s(PtrL, Ptry2, Ptry3, N_TX); - mempool_Lttrisol_folded_f32s(PtrL, Ptry3, Ptrx, N_TX); + mempool_cholesky_f32s(PtrG, PtrL, N_TX, 0); + mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX, 0, 0); + mempool_Ltrisol_f32s(PtrL, Ptry3, Ptrx, N_TX, 1, 0); } mempool_log_barrier(2, core_id); mempool_stop_benchmark(); diff --git a/software/apps/baremetal/mimo_mmse_f8/main.c b/software/apps/baremetal/mimo_mmse_f8/main.c index cd6258ad6..294e917bc 100644 --- a/software/apps/baremetal/mimo_mmse_f8/main.c +++ b/software/apps/baremetal/mimo_mmse_f8/main.c @@ -14,26 +14,34 @@ #include "baremetal/mempool_checks.h" #include "baremetal/mempool_cholesky_f16s.h" -#include "baremetal/mempool_cholesky_f8s.h" #include "baremetal/mempool_linearsolver_f16s.h" -#include "baremetal/mempool_linearsolver_f8s.h" #include "baremetal/mempool_mimo_mmse_f8s.h" #include "data_mimo_mmse_f8.h" -#define ZF (0) // When asserted use zero-forcing +#define ZF (0) // When asserted use zero-forcing +#define FOLD (0) // When asserted fold matrixes in memory #define NUM_BANKS (BANKING_FACTOR * NUM_CORES) +#if FOLD + +#define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS)) +__fp16 l1_G[2 * N_TX * NUM_BANKS * NUM_ROW] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +__fp16 l1_L[2 * N_TX * NUM_BANKS * NUM_ROW] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +#else +__fp16 l1_G[2 * N_TX * N_TX * N_ITR] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +__fp16 l1_L[2 * N_TX * N_TX * N_ITR] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +#endif + __fp8 l1_H[2 * N_TX * N_RX * N_ITR] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); __fp8 l1_S[2 * N_TX * N_ITR] __attribute__((aligned(sizeof(int32_t)), section(".l1"))); __fp8 l1_y[2 * N_RX * N_ITR] __attribute__((aligned(sizeof(int32_t)), section(".l1"))); - -__fp16 l1_G[2 * N_TX * N_TX * N_ITR] - __attribute__((aligned(NUM_BANKS * sizeof(int32_t)), section(".l1_prio"))); -__fp16 l1_L[2 * N_TX * N_TX * N_ITR] - __attribute__((aligned(NUM_BANKS * sizeof(int32_t)), section(".l1_prio"))); __fp16 y2[2 * N_TX * N_ITR] __attribute__((aligned(sizeof(int32_t)), section(".l1"))); __fp16 y3[2 * N_TX * N_ITR] @@ -90,17 +98,17 @@ int main() { __fp16 *Ptrx = l1_x + itr * (2 * N_TX); // 8b #ifdef VEC - mempool_hermitian_f8vecs(PtrH, PtrS, PtrG, N_RX, N_TX, ZF); + mempool_hermitian_f8vecs(PtrH, PtrS, PtrG, N_RX, N_TX, ZF, 0); mempool_MVP_conjtransp_f8vecs(PtrH, Ptry, Ptry2, N_RX, N_TX); - mempool_cholesky_f16vecs(PtrG, PtrL, N_TX); + mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, 0); #else - mempool_hermitian_f8s(PtrH, PtrS, PtrG, N_RX, N_TX, ZF); + mempool_hermitian_f8s(PtrH, PtrS, PtrG, N_RX, N_TX, ZF, 0); mempool_MVP_conjtransp_f8s(PtrH, Ptry, Ptry2, N_RX, N_TX); - mempool_cholesky_f16s(PtrG, PtrL, N_TX); + mempool_cholesky_f16s(PtrG, PtrL, N_TX, 0); #endif // 16b - mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX); - mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX); + mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, 0); + mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, 0); } mempool_stop_benchmark(); } @@ -108,31 +116,47 @@ int main() { #ifdef PARALLEL mempool_start_benchmark(); + uint32_t time_init = mempool_get_timer(); // Parallel subcarrier loop for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) { + __fp8 *PtrH = l1_H + itr * (2 * N_TX * N_RX); __fp8 *Ptry = l1_y + itr * (2 * N_RX); __fp8 *PtrS = l1_S + itr * (2 * N_TX); // Auxiliary vectors +#if FOLD + __fp16 *PtrG = l1_G + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) + + (itr / NUM_ROW) * (2 * N_TX); + __fp16 *PtrL = l1_L + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) + + (itr / NUM_ROW) * (2 * N_TX); + __fp16 *Ptry2 = + y2 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX); + __fp16 *Ptry3 = + y3 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX); + __fp16 *Ptrx = l1_x + itr * (2 * N_TX); +#else __fp16 *PtrG = l1_G + itr * (2 * N_TX * N_TX); __fp16 *PtrL = l1_L + itr * (2 * N_TX * N_TX); __fp16 *Ptry2 = y2 + itr * (2 * N_TX); __fp16 *Ptry3 = y3 + itr * (2 * N_TX); __fp16 *Ptrx = l1_x + itr * (2 * N_TX); +#endif + #ifdef VEC - mempool_hermitian_f8vecs(PtrH, PtrS, PtrG, N_RX, N_TX, ZF); + mempool_hermitian_f8vecs(PtrH, PtrS, PtrG, N_RX, N_TX, ZF, FOLD); mempool_MVP_conjtransp_f8vecs(PtrH, Ptry, Ptry2, N_RX, N_TX); - mempool_cholesky_f16vecs(PtrG, PtrL, N_TX); + mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, FOLD); #else - mempool_hermitian_f8s(PtrH, PtrS, PtrG, N_RX, N_TX, ZF); + mempool_hermitian_f8s(PtrH, PtrS, PtrG, N_RX, N_TX, ZF, FOLD); mempool_MVP_conjtransp_f8s(PtrH, Ptry, Ptry2, N_RX, N_TX); - mempool_cholesky_f16s(PtrG, PtrL, N_TX); + mempool_cholesky_f16s(PtrG, PtrL, N_TX, FOLD); #endif // 16b - mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX); - mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX); + mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD); + mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD); } mempool_barrier(num_cores); + uint32_t time_end = mempool_get_timer(); mempool_stop_benchmark(); #endif @@ -145,6 +169,9 @@ int main() { } } #else + if (core_id == 0) { + printf("Runtime: %d\n", time_end - time_init); + } mempool_barrier(num_cores); #endif diff --git a/software/apps/baremetal/mimo_mmse_q16/main.c b/software/apps/baremetal/mimo_mmse_q16/main.c index c7dcda78d..24fd9e44d 100644 --- a/software/apps/baremetal/mimo_mmse_q16/main.c +++ b/software/apps/baremetal/mimo_mmse_q16/main.c @@ -64,8 +64,8 @@ int main() { mempool_MVP_conjtransp_q16vecs((v2s *)l1_H, (v2s *)l1_y, (v2s *)y2, N_RX, N_TX, 0); mempool_cholesky_q16vecs(l1_G, l1_L, N_TX); - mempool_Ltrisol_q16vecs(l1_L, y2, y3, N_TX); - mempool_Lttrisol_q16vecs(l1_L, y3, l1_x, N_TX); + mempool_Ltrisol_q16vecs(l1_L, y2, y3, N_TX, 0); + mempool_Ltrisol_q16vecs(l1_L, y3, l1_x, N_TX, 1); mempool_stop_benchmark(); } mempool_barrier(num_cores); @@ -92,8 +92,8 @@ int main() { mempool_MVP_conjtransp_q16vecs((v2s *)PtrH, (v2s *)Ptry, (v2s *)Ptry2, N_RX, N_TX, 0); mempool_cholesky_q16vecs(PtrG, PtrL, N_TX); - mempool_Ltrisol_q16vecs(PtrL, Ptry2, Ptry3, N_TX); - mempool_Lttrisol_q16vecs(PtrL, Ptry3, Ptrx, N_TX); + mempool_Ltrisol_q16vecs(PtrL, Ptry2, Ptry3, N_TX, 0); + mempool_Ltrisol_q16vecs(PtrL, Ptry3, Ptrx, N_TX, 1); } mempool_log_barrier(2, core_id); mempool_stop_benchmark(); diff --git a/software/data/data_mimo_mmse_f32.h.tpl b/software/data/data_mimo_mmse_f32.h.tpl index 858deb254..c7bed1889 100644 --- a/software/data/data_mimo_mmse_f32.h.tpl +++ b/software/data/data_mimo_mmse_f32.h.tpl @@ -27,7 +27,7 @@ float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_G[${2 * N_tx float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_y[${2 * N_rx * N_itr}] = ${array_to_cstr(y)}; -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Sigma[${N_tx * N_itr}] = ${array_to_cstr(N)}; +float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_S[${2 * N_tx * N_itr}] = ${array_to_cstr(N)}; // Outputs diff --git a/software/data/generate_mimo_mmse.py b/software/data/generate_mimo_mmse.py index e4dd80932..f8918f561 100644 --- a/software/data/generate_mimo_mmse.py +++ b/software/data/generate_mimo_mmse.py @@ -52,13 +52,15 @@ def generate_fmmse(N_tx, N_rx, N_itr, my_type): G = np.matmul(H_h, H) + N * np.eye(H.shape[1]) N = N * np.ones(N_tx) - # Cholesky decomposition + # Cholesky decomposition L = np.linalg.cholesky(G) # Linear system solution y1 = np.transpose(np.dot(H_h, y)) y2 = solve_triangular(L, y1, lower=True) x = solve_triangular(np.asmatrix(L).H, y2) + H = np.reshape(np.asarray(H), (N_tx * N_rx), order='C') + G = np.reshape(np.asarray(G), (N_tx * N_tx), order='C') N = np.column_stack((N.real, N.imag)).astype(my_type).flatten() H = np.column_stack((H.real, H.imag)).astype(my_type).flatten() G = np.column_stack((G.real, G.imag)).astype(my_type).flatten() diff --git a/software/kernels/baremetal/mempool_checks.h b/software/kernels/baremetal/mempool_checks.h index d680764c1..700354436 100644 --- a/software/kernels/baremetal/mempool_checks.h +++ b/software/kernels/baremetal/mempool_checks.h @@ -15,17 +15,19 @@ void mempool_check_q32(int32_t *__restrict__ pRes, int32_t *__restrict__ pExp, uint32_t NEL, int32_t TOL, bool verbose) { uint32_t core_id = mempool_get_core_id(); - int32_t error; + if (core_id == 0) { + uint32_t ERRORS = 0; for (uint32_t i = 0; i < NEL; i++) { int32_t exp = pExp[i]; int32_t res = pRes[i]; - error = exp - res; - bool print = ((error > TOL) || (error < (-TOL))) || verbose; + int32_t diff = exp - res; + uint32_t error = ((diff > TOL) || (diff < (-TOL))) ? 1 : 0; + uint32_t print = error || verbose; + ERRORS += error; if (print) { printf("CHECK(%d): EXP = %08X - RESP = %08X\n", i, exp, res); - ERRORS++; } } printf("%d ERRORS out of %d CHECKS\n", ERRORS, NEL); @@ -44,17 +46,19 @@ void mempool_check_q32(int32_t *__restrict__ pRes, int32_t *__restrict__ pExp, void mempool_check_q16(int16_t *__restrict__ pRes, int16_t *__restrict__ pExp, uint32_t NEL, int16_t TOL, bool verbose) { uint32_t core_id = mempool_get_core_id(); - int16_t error; + if (core_id == 0) { + uint32_t ERRORS = 0; for (uint32_t i = 0; i < NEL; i++) { int16_t exp = (int16_t)pExp[i]; int16_t res = (int16_t)pRes[i]; - error = (int16_t)(exp - res); - bool print = ((error > TOL) || (error < (-TOL))) | verbose; + int16_t diff = (int16_t)(exp - res); + uint32_t error = ((diff > TOL) || (diff < (-TOL))) ? 1 : 0; + uint32_t print = error || verbose; + ERRORS += error; if (print) { printf("CHECK(%d): EXP = %08X - RESP = %08X\n", i, exp, res); - ERRORS++; } } printf("%d ERRORS out of %d CHECKS\n", ERRORS, NEL); @@ -74,20 +78,24 @@ void mempool_check_q16(int16_t *__restrict__ pRes, int16_t *__restrict__ pExp, void mempool_check_f32(float *__restrict__ pRes, float *__restrict__ pExp, uint32_t NEL, float TOL, bool verbose) { uint32_t core_id = mempool_get_core_id(); - float error = 0.0f; + if (core_id == 0) { + uint32_t ERRORS = 0; for (uint32_t i = 0; i < NEL; i++) { float exp = pExp[i]; float res = pRes[i]; - asm volatile("fsub.s %[error], %[res], %[exp];" - : [error] "+&r"(error) + float diff; + asm volatile("fsub.s %[diff], %[res], %[exp];" + : [diff] "+&r"(diff) : [res] "r"(res), [exp] "r"(exp) :); - bool print = ((error > TOL) || (error < (-TOL))) || verbose; + uint32_t error = ((diff > TOL) || (diff < (-TOL))) ? 1 : 0; + uint32_t print = error || verbose; + ERRORS += error; if (print) { - printf("CHECK(%d): EXP = %08X - RESP = %08X\n", i, exp, res); - ERRORS++; + printf("CHECK(%d): EXP = %08X - RESP = %08X\n", i, *(int32_t *)&exp, + *(int32_t *)&res); } } printf("%d ERRORS out of %d CHECKS\n", ERRORS, NEL); @@ -106,22 +114,25 @@ void mempool_check_f32(float *__restrict__ pRes, float *__restrict__ pExp, void mempool_check_f16(__fp16 *__restrict__ pRes, __fp16 *__restrict__ pExp, uint32_t NEL, float TOL, bool verbose) { uint32_t core_id = mempool_get_core_id(); - float error; + if (core_id == 0) { uint32_t ERRORS = 0; for (uint32_t i = 0; i < NEL; i++) { __fp16 exp = pExp[i]; __fp16 res = pRes[i]; - asm volatile("fsub.h %[error], %[res], %[exp];" - "fcvt.s.h %[error], %[error];" - : [error] "+&r"(error) + float diff; + asm volatile("fsub.h %[diff], %[res], %[exp];" + "fcvt.s.h %[diff], %[diff];" + : [diff] "+&r"(diff) : [res] "r"(res), [exp] "r"(exp) :); - bool print = ((error > TOL) || (error < (-TOL))) || verbose; + + uint32_t error = ((diff > TOL) || (diff < (-TOL))) ? 1 : 0; + uint32_t print = error || verbose; + ERRORS += error; if (print) { printf("CHECK(%d): EXP = %08X - RESP = %08X\n", i, *(int32_t *)&exp, *(int32_t *)&res); - ERRORS++; } } printf("%d ERRORS out of %d CHECKS\n", ERRORS, NEL); diff --git a/software/kernels/baremetal/mempool_cholesky_f16s.h b/software/kernels/baremetal/mempool_cholesky_f16s.h index 56483490e..121267545 100644 --- a/software/kernels/baremetal/mempool_cholesky_f16s.h +++ b/software/kernels/baremetal/mempool_cholesky_f16s.h @@ -16,9 +16,11 @@ @param[in] pSrc points to input matrix @param[in] pL points to output lower triangular matrix @param[in] n dimension of the input data + @param[in] folded matrices are folded row-wise in memory @return none */ -void mempool_cholesky_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n) { +void mempool_cholesky_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n, + const uint32_t folded) { __fp16 sum; __fp16 a, b; @@ -27,91 +29,15 @@ void mempool_cholesky_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n) { __fp16 ap, bp; // Pivot element __fp16 as, bs; // Sum element uint32_t i, j, k; + const uint32_t offset = folded ? N_BANKS : n; for (j = 0; j < n; j++) { // Elements on diagonal (input matrix is positive-definite) - ap = pSrc[2U * (j * n + j)]; - sum = (__fp16)0.0f; - for (k = 0; k < j; k++) { - a = pL[2U * (j * n + k)]; - b = pL[2U * (j * n + k) + 1]; - asm volatile("fmadd.h %[sum], %[a], %[a], %[sum];" - "fmadd.h %[sum], %[b], %[b], %[sum];" - : [sum] "+&r"(sum) - : [a] "r"(a), [b] "r"(b) - :); - } - asm volatile("fsub.h %[ap], %[ap], %[sum];" - "fsqrt.h %[ap], %[ap];" - : [ap] "+&r"(ap) - : [sum] "r"(sum) - :); - pL[2U * (j * n + j)] = ap; - - // Elements on rows - for (i = j + 1; i < n; i++) { - // Pivot - ap = pSrc[2U * (i * n + j)]; - bp = pSrc[2U * (i * n + j) + 1]; - // Diag - diag = pL[2U * (j * n + j)]; - // Sum -> s = s + (ac + bd) + j*(bc - ad) - as = (__fp16)0.0f; - bs = (__fp16)0.0f; - for (k = 0; k < j; k++) { - a = pL[2U * (i * n + k)]; - b = pL[2U * (i * n + k) + 1]; - c = pL[2U * (j * n + k)]; - d = pL[2U * (j * n + k) + 1]; - asm volatile("fmadd.h %[as], %[a], %[c], %[as];" - "fmadd.h %[as], %[b], %[d], %[as];" - "fmadd.h %[bs], %[b], %[c], %[bs];" - "fnmsub.h %[bs], %[a], %[d], %[bs];" - : [as] "+&r"(as), [bs] "+&r"(bs) - : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) - :); - } - asm volatile("fsub.h %[ap], %[ap], %[as];" - "fsub.h %[bp], %[bp], %[bs];" - "fdiv.h %[ap], %[ap], %[diag];" - "fdiv.h %[bp], %[bp], %[diag];" - : [ap] "+&r"(ap), [bp] "+&r"(bp) - : [diag] "r"(diag), [as] "r"(as), [bs] "r"(bs) - :); - pL[2U * (i * n + j)] = ap; - pL[2U * (i * n + j) + 1] = bp; - } - } - return; -} - -/** - @brief Cholesky decomposition with Crout algorithm. - Output data is folded to the core's local memory. - @param[in] pSrc points to input matrix - @param[in] pL points to output lower triangular matrix - @param[in] n dimension of the input data - @return none -*/ -void mempool_cholesky_folded_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n) { - - register __fp16 sum; - __fp16 a, b; - __fp16 c, d; - __fp16 diag; // Diagonal element - __fp16 ap, bp; // Pivot element - __fp16 as, bs; // Sum element - - uint32_t i, j, k; - - for (j = 0; j < n; j++) { - - // Elements on diagonal (input matrix is positive-definite) - ap = pSrc[2U * (j * N_BANKS + j)]; + ap = pSrc[2U * (j * offset + j)]; sum = (__fp16)0.0f; for (k = 0; k < j; k++) { - a = pL[2U * (j * N_BANKS + k)]; - b = pL[2U * (j * N_BANKS + k) + 1]; + a = pL[2U * (j * offset + k)]; + b = pL[2U * (j * offset + k) + 1]; asm volatile("fmadd.h %[sum], %[a], %[a], %[sum];" "fmadd.h %[sum], %[b], %[b], %[sum];" : [sum] "+&r"(sum) @@ -123,23 +49,23 @@ void mempool_cholesky_folded_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n) { : [ap] "+&r"(ap) : [sum] "r"(sum) :); - pL[2U * (j * N_BANKS + j)] = ap; + pL[2U * (j * offset + j)] = ap; // Elements on rows for (i = j + 1; i < n; i++) { // Pivot - ap = pSrc[2U * (i * N_BANKS + j)]; - bp = pSrc[2U * (i * N_BANKS + j) + 1]; + ap = pSrc[2U * (i * offset + j)]; + bp = pSrc[2U * (i * offset + j) + 1]; // Diag - diag = pL[2U * (j * N_BANKS + j)]; + diag = pL[2U * (j * offset + j)]; // Sum -> s = s + (ac + bd) + j*(bc - ad) as = (__fp16)0.0f; bs = (__fp16)0.0f; for (k = 0; k < j; k++) { - a = pL[2U * (i * N_BANKS + k)]; - b = pL[2U * (i * N_BANKS + k) + 1]; - c = pL[2U * (j * N_BANKS + k)]; - d = pL[2U * (j * N_BANKS + k) + 1]; + a = pL[2U * (i * offset + k)]; + b = pL[2U * (i * offset + k) + 1]; + c = pL[2U * (j * offset + k)]; + d = pL[2U * (j * offset + k) + 1]; asm volatile("fmadd.h %[as], %[a], %[c], %[as];" "fmadd.h %[as], %[b], %[d], %[as];" "fmadd.h %[bs], %[b], %[c], %[bs];" @@ -155,8 +81,8 @@ void mempool_cholesky_folded_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n) { : [ap] "+&r"(ap), [bp] "+&r"(bp) : [diag] "r"(diag), [as] "r"(as), [bs] "r"(bs) :); - pL[2U * (i * N_BANKS + j)] = ap; - pL[2U * (i * N_BANKS + j) + 1] = bp; + pL[2U * (i * offset + j)] = ap; + pL[2U * (i * offset + j) + 1] = bp; } } return; @@ -170,55 +96,64 @@ void mempool_cholesky_folded_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n) { @param[in] n dimension of the input data @return none */ -void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) { +void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n, + const uint32_t folded) { __fp16 diag; v2h apbp, dgdg; v2h ab, cd; - uint32_t i, j, k; + const uint32_t offset = folded ? N_BANKS : n; + for (j = 0; j < n; j++) { // Elements on diagonal (input matrix is positive-definite) - __fp16 ap = pSrc[2U * (j * n + j)]; + __fp16 ap = pSrc[2U * (j * offset + j)]; float sum = 0.0f; for (k = 0; k < j; k++) { - ab = (*(v2h *)&pL[2U * (j * n + k)]); + ab = (*(v2h *)&pL[2U * (j * offset + k)]); asm volatile("vfdotpex.s.h %0, %1, %1;" : "+&r"(sum) : "r"(ab) :); } asm volatile("fcvt.h.s %0, %0;" : "+&r"(sum) :); asm volatile("fsub.h %0, %0, %1;" : "+&r"(ap) : "r"(sum)); asm volatile("fsqrt.h %0, %0;" : "+&r"(ap) :); - pL[2U * (j * n + j)] = ap; + pL[2U * (j * offset + j)] = ap; // Elements on rows - uint32_t volatile __srt_iloop__ = (j + 1); - uint32_t volatile __end_kloop__ = j; + uint32_t __srt_iloop__ = (j + 1); + uint32_t __end_kloop__ = j; #ifdef __CDOTP for (i = __srt_iloop__; i < n; i++) { - apbp = (*(v2h *)&pSrc[2U * (i * n + j)]); // Pivot - diag = pL[2U * (j * n + j)]; // Diag + apbp = (*(v2h *)&pSrc[2U * (i * offset + j)]); // Pivot + diag = pL[2U * (j * offset + j)]; // Diag + v2h asbs = (v2h)0.0f; + float as = 0.0f; + float bs = 0.0f; for (k = 0; k < __end_kloop__; k++) { - ab = (*(v2h *)&pL[2U * (i * n + k)]); - cd = (*(v2h *)&pL[2U * (j * n + k)]); - asm volatile("fcndotpex.s.h %0, %1, %2;" - : "+&r"(apbp) - : "r"(ab), "r"(cd)); + ab = (*(v2h *)&pL[2U * (i * offset + k)]); + cd = (*(v2h *)&pL[2U * (j * offset + k)]); + asm volatile("fccdotpex.s.h %0, %1, %2;" + : "+&r"(asbs) + : "r"(cd), "r"(ab)); } + asm volatile("pv.shuffle2.h %0, %0, %[mask];" + : "+&r"(asbs) + : [mask] "r"(0x00020003)); asm volatile("pv.pack %0, %1, %1;" : "+&r"(dgdg) : "r"(diag)); + asm volatile("vfsub.h %0, %0, %1;" : "+&r"(apbp) : "r"(asbs)); asm volatile("vfdiv.h %0, %0, %1;" : "+&r"(apbp) : "r"(dgdg)); - (*(v2h *)&pL[2U * (i * n + j)]) = apbp; + (*(v2h *)&pL[2U * (i * offset + j)]) = apbp; } #else for (i = __srt_iloop__; i < n; i++) { - apbp = (*(v2h *)&pSrc[2U * (i * n + j)]); // Pivot - diag = pL[2U * (j * n + j)]; // Diag + apbp = (*(v2h *)&pSrc[2U * (i * offset + j)]); // Pivot + diag = pL[2U * (j * offset + j)]; // Diag float as = 0.0f; float bs = 0.0f; v2h asbs; for (k = 0; k < __end_kloop__; k++) { - ab = (*(v2h *)&pL[2U * (i * n + k)]); - cd = (*(v2h *)&pL[2U * (j * n + k)]); + ab = (*(v2h *)&pL[2U * (i * offset + k)]); + cd = (*(v2h *)&pL[2U * (j * offset + k)]); v2h ndc; const uint32_t neg_mask = 0x00008000; const uint32_t shuffle_mask = 0x00020003; @@ -237,7 +172,7 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) { asm volatile("pv.pack %0, %1, %1;" : "+&r"(dgdg) : "r"(diag)); asm volatile("vfsub.h %0, %0, %1;" : "+&r"(apbp) : "r"(asbs)); asm volatile("vfdiv.h %0, %0, %1;" : "+&r"(apbp) : "r"(dgdg)); - (*(v2h *)&pL[2U * (i * n + j)]) = apbp; + (*(v2h *)&pL[2U * (i * offset + j)]) = apbp; } #endif } @@ -430,7 +365,8 @@ void mempool_cholesky_f16vecs_unroll4(__fp16 *pSrc, __fp16 *pL, @param[in] n dimension of the input data @return none */ -void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) { +void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n, + const uint32_t folded) { float sum; // register float sum float as, bs; // real and imaginary sums @@ -444,14 +380,15 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) { v2h ab, cd, ndc; uint32_t i, j, k; + const uint32_t offset = folded ? N_BANKS : n; for (j = 0; j < n; j++) { // Elements on diagonal (input matrix is positive-definite) - ap = pSrc[2U * (j * n + j)]; + ap = pSrc[2U * (j * offset + j)]; asm volatile("fcvt.s.h %0, %1;" : "=r"(sum) : "r"(ap) :); for (k = 0; k < j; k++) { - ab = (*(v2h *)&pL[2U * (j * n + k)]); + ab = (*(v2h *)&pL[2U * (j * offset + k)]); asm volatile("vfndotpex.s.h %[sum], %[ab], %[ab];" : [sum] "+&r"(sum) : [ab] "r"(ab) @@ -459,15 +396,15 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) { } sum = (float)sqrt(sum); asm volatile("fcvt.h.s %0, %1;" : "=r"(ap) : "r"(sum) :); - pL[2U * (j * n + j)] = ap; + pL[2U * (j * offset + j)] = ap; // Elements on rows for (i = j + 1; i < n; i++) { // Pivot - ap = pSrc[2U * (i * n + j)]; - bp = pSrc[2U * (i * n + j + 1)]; + ap = pSrc[2U * (i * offset + j)]; + bp = pSrc[2U * (i * offset + j + 1)]; // Diag - diag = pL[2U * (j * n + j)]; + diag = pL[2U * (j * offset + j)]; // Sum -> s = s + (ac + bd) + j*(bc - ad) as = (float)0.0f; @@ -476,8 +413,8 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) { asm volatile("fcvt.s.h %0, %1;" : "=r"(bp_f32) : "r"(bp) :); asm volatile("fcvt.s.h %0, %1;" : "=r"(diag_f32) : "r"(diag) :); for (k = 0; k < j; k++) { - ab = (*(v2h *)&pL[2U * (i * n + k)]); - cd = (*(v2h *)&pL[2U * (j * n + k)]); + ab = (*(v2h *)&pL[2U * (i * offset + k)]); + cd = (*(v2h *)&pL[2U * (j * offset + k)]); const uint32_t neg_mask = 0x00008000; const uint32_t shuffle_mask = 0x00020003; asm volatile( @@ -494,7 +431,7 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) { as = (ap_f32 - as) / diag_f32; bs = (bp_f32 - bs) / diag_f32; asm volatile("vfcpka.h.s %0, %1, %2;" : "=r"(asbs) : "r"(as), "r"(bs) :); - (*(v2h *)&pL[2U * (i * n + j)]) = asbs; + (*(v2h *)&pL[2U * (i * offset + j)]) = asbs; } } return; diff --git a/software/kernels/baremetal/mempool_cholesky_f32s.h b/software/kernels/baremetal/mempool_cholesky_f32s.h index 2bad891f5..63fd878dc 100644 --- a/software/kernels/baremetal/mempool_cholesky_f32s.h +++ b/software/kernels/baremetal/mempool_cholesky_f32s.h @@ -16,7 +16,8 @@ @param[in] n dimension of the input data @return none */ -void mempool_cholesky_f32s(float *pSrc, float *pL, const uint32_t n) { +void mempool_cholesky_f32s(float *pSrc, float *pL, const uint32_t n, + const uint32_t folded) { register float sum; float a, b; @@ -24,17 +25,17 @@ void mempool_cholesky_f32s(float *pSrc, float *pL, const uint32_t n) { float diag; // Diagonal element float ap, bp; // Pivot element float as, bs; // Sum element - uint32_t i, j, k; + const uint32_t offset = folded ? N_BANKS : n; for (j = 0; j < n; j++) { // Elements on diagonal (input matrix is positive-definite) - ap = pSrc[2U * (j * n + j)]; + ap = pSrc[2U * (j * offset + j)]; sum = 0.0f; for (k = 0; k < j; k++) { - a = pL[2U * (j * n + k)]; - b = pL[2U * (j * n + k) + 1]; + a = pL[2U * (j * offset + k)]; + b = pL[2U * (j * offset + k) + 1]; asm volatile("fmadd.s %[sum], %[a], %[a], %[sum];" "fmadd.s %[sum], %[b], %[b], %[sum];" : [sum] "+&r"(sum) @@ -51,98 +52,18 @@ void mempool_cholesky_f32s(float *pSrc, float *pL, const uint32_t n) { // Elements on rows for (i = j + 1; i < n; i++) { // Pivot - ap = pSrc[2U * (i * n + j)]; - bp = pSrc[2U * (i * n + j) + 1]; - // Diag - diag = pL[2U * (j * n + j)]; - // Sum -> s = s + (ac + bd) + j*(bc - ad) - as = 0.0f; - bs = 0.0f; - for (k = 0; k < j; k++) { - a = pL[2U * (i * n + k)]; - b = pL[2U * (i * n + k) + 1]; - c = pL[2U * (j * n + k)]; - d = pL[2U * (j * n + k) + 1]; - asm volatile("fmadd.s %[as], %[a], %[c], %[as];" - "fmadd.s %[as], %[b], %[d], %[as];" - "fmadd.s %[bs], %[b], %[c], %[bs];" - "fnmsub.s %[bs], %[a], %[d], %[bs];" - : [as] "+&r"(as), [bs] "+&r"(bs) - : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) - :); - } - asm volatile("fsub.s %[ap], %[ap], %[as];" - "fsub.s %[bp], %[bp], %[bs];" - "fdiv.s %[ap], %[ap], %[diag];" - "fdiv.s %[bp], %[bp], %[diag];" - : [ap] "+&r"(ap), [bp] "+&r"(bp) - : [diag] "r"(diag), [as] "r"(as), [bs] "r"(bs) - :); - pL[2U * (i * n + j)] = ap; - pL[2U * (i * n + j) + 1] = bp; - } - } - return; -} - -void mempool_cholesky_folded_f32s(float *pSrc, float *pL, const uint32_t n) { - - register float sum; - - // first matrix row: - // item[0-2] item[1-3] - // memrow[0] x x x x - // memrow[1] x x x x - // second matrix row: - // memrow[2] x x x x - // memrow[3] x x x x - // third matrix row: - // memrow[4] x x x x - // memrow[5] x x x x - // i * memrow_xrow * N_BANKS + (j / local_items) * N_BANKS + j % local_items - - float a, b; - float c, d; - float diag; // Diagonal element - float ap, bp; // Pivot element - float as, bs; // Sum element - uint32_t banks_per_row = (n / 2) * N_BANKS; - - uint32_t i, j, k; - for (j = 0; j < n; j++) { - // Elements on diagonal (input matrix is positive-definite) - ap = pSrc[j * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2)]; - sum = 0.0f; - for (k = 0; k < j; k++) { - a = pL[j * banks_per_row + (k / 2) * N_BANKS + 2 * (k % 2)]; - b = pL[j * banks_per_row + (k / 2) * N_BANKS + 2 * (k % 2) + 1]; - asm volatile("fmadd.s %[sum], %[a], %[a], %[sum];" - "fmadd.s %[sum], %[b], %[b], %[sum];" - : [sum] "+&r"(sum) - : [a] "r"(a), [b] "r"(b) - :); - } - asm volatile("fsub.s %[ap], %[ap], %[sum];" - "fsqrt.s %[ap], %[ap];" - : [ap] "+&r"(ap) - : [sum] "r"(sum) - :); - pL[j * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2)] = ap; - // Elements on rows - for (i = j + 1; i < n; i++) { - // Pivot - ap = pSrc[i * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2)]; - bp = pSrc[i * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2) + 1]; + ap = pSrc[2U * (i * offset + j)]; + bp = pSrc[2U * (i * offset + j) + 1]; // Diag - diag = pL[j * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2)]; + diag = pL[2U * (j * offset + j)]; // Sum -> s = s + (ac + bd) + j*(bc - ad) as = 0.0f; bs = 0.0f; for (k = 0; k < j; k++) { - a = pL[i * banks_per_row + (k / 2) * N_BANKS + 2 * (k % 2)]; - b = pL[i * banks_per_row + (k / 2) * N_BANKS + 2 * (k % 2) + 1]; - c = pL[j * banks_per_row + (k / 2) * N_BANKS + 2 * (k % 2)]; - d = pL[j * banks_per_row + (k / 2) * N_BANKS + 2 * (k % 2) + 1]; + a = pL[2U * (i * offset + k)]; + b = pL[2U * (i * offset + k) + 1]; + c = pL[2U * (j * offset + k)]; + d = pL[2U * (j * offset + k) + 1]; asm volatile("fmadd.s %[as], %[a], %[c], %[as];" "fmadd.s %[as], %[b], %[d], %[as];" "fmadd.s %[bs], %[b], %[c], %[bs];" @@ -158,8 +79,8 @@ void mempool_cholesky_folded_f32s(float *pSrc, float *pL, const uint32_t n) { : [ap] "+&r"(ap), [bp] "+&r"(bp) : [diag] "r"(diag), [as] "r"(as), [bs] "r"(bs) :); - pL[i * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2)] = ap; - pL[i * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2) + 1] = bp; + pL[2U * (i * offset + j)] = ap; + pL[2U * (i * offset + j) + 1] = bp; } } return; diff --git a/software/kernels/baremetal/mempool_linearsolver_f16s.h b/software/kernels/baremetal/mempool_linearsolver_f16s.h index 32ab942ca..c4e134527 100644 --- a/software/kernels/baremetal/mempool_linearsolver_f16s.h +++ b/software/kernels/baremetal/mempool_linearsolver_f16s.h @@ -15,10 +15,13 @@ @param[in] in known variables vector @param[in] x unknown solutions vector @param[in] n dimension of the system + @param[in] transposed solve transposed system + @param[in] folded matrix L is folded row-wise in memory @return none */ -void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n) { +void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n, + const uint32_t transposed, const uint32_t folded) { uint32_t i, j; __fp16 a, b; @@ -27,77 +30,41 @@ void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n) { __fp16 as, bs; __fp16 ax, bx; __fp16 diag; + const uint32_t offset = folded ? N_BANKS : n; // Solve for each variable x_i in turn for (i = 0; i < n; i++) { - diag = pL[2 * (i * n + i)]; - as = (__fp16)in[2U * i]; - bs = (__fp16)in[2U * i + 1]; + uint32_t ridx = transposed ? (n - i - 1) : i; + diag = pL[2U * (ridx * offset + ridx)]; + as = (__fp16)in[2U * ridx]; + bs = (__fp16)in[2U * ridx + 1]; // Use the previously solved variables to calculate the sum for (j = 0; j < i; j++) { - a = pL[2U * (i * n + j)]; - b = pL[2U * (i * n + j) + 1]; - c = x[2U * j]; - d = x[2U * j + 1]; - asm volatile("fnmsub.h %[as], %[a], %[c], %[as];" - "fnmsub.h %[bs], %[a], %[d], %[bs];" - "fmadd.h %[as], %[b], %[d], %[as];" - "fnmsub.h %[bs], %[b], %[c], %[bs];" - : [as] "+&r"(as), [bs] "+&r"(bs) - : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) - :); - } - // Subtract the sum from b_i and divide by the diagonal element L[i][i] - asm volatile("fdiv.h %[ax], %[as], %[diag];" - "fdiv.h %[bx], %[bs], %[diag];" - : [ax] "+&r"(ax), [bx] "+&r"(bx) - : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag) - :); - x[2U * i] = ax; - x[2U * i + 1] = bx; - } - return; -} - -/** - @brief Single-core solution of upper triangular system - (from transposed lower triangular system) - @param[in] pL input triangular matrix to be transposed - @param[in] in known variables vector - @param[in] x unknown solutions vector - @param[in] n dimension of the system - @return none -*/ - -void mempool_Lttrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, - const uint32_t n) { - - uint32_t i, j; - __fp16 a, b; - __fp16 c, d; - __fp16 as, bs; - __fp16 ax, bx; - __fp16 diag; - - // Solve for each variable x_i in turn - for (i = 0; i < n; i++) { - diag = pL[2 * ((n - 1 - i) * n + (n - 1 - i))]; - as = (__fp16)in[2 * (n - i - 1)]; - bs = (__fp16)in[2 * (n - i - 1) + 1]; - // Use the previously solved variables to calculate the sum - for (j = 0; j < i; j++) { - a = pL[2U * ((n - 1 - j) * n + (n - 1 - i))]; - b = pL[2U * ((n - 1 - j) * n + (n - 1 - i)) + 1]; - c = x[2U * (n - 1 - j)]; - d = x[2U * (n - 1 - j) + 1]; - asm volatile("fnmsub.h %[as], %[a], %[c], %[as];" - "fnmsub.h %[as], %[b], %[d], %[as];" - "fnmsub.h %[bs], %[a], %[d], %[bs];" - "fmadd.h %[bs], %[b], %[c], %[bs];" - : [as] "+&r"(as), [bs] "+&r"(bs) - : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) - :); + uint32_t cidx = transposed ? (n - j - 1) : j; + c = x[2U * cidx]; + d = x[2U * cidx + 1]; + if (transposed) { + a = pL[2U * (cidx * offset + ridx)]; + b = pL[2U * (cidx * offset + ridx) + 1]; + asm volatile("fnmsub.h %[as], %[a], %[c], %[as];" + "fnmsub.h %[as], %[b], %[d], %[as];" + "fnmsub.h %[bs], %[a], %[d], %[bs];" + "fmadd.h %[bs], %[b], %[c], %[bs];" + : [as] "+&r"(as), [bs] "+&r"(bs) + : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) + :); + } else { + a = pL[2U * (ridx * offset + cidx)]; + b = pL[2U * (ridx * offset + cidx) + 1]; + asm volatile("fnmsub.h %[as], %[a], %[c], %[as];" + "fnmsub.h %[bs], %[a], %[d], %[bs];" + "fmadd.h %[as], %[b], %[d], %[as];" + "fnmsub.h %[bs], %[b], %[c], %[bs];" + : [as] "+&r"(as), [bs] "+&r"(bs) + : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) + :); + } } // Subtract the sum from b_i and divide by the diagonal element L[i][i] asm volatile("fdiv.h %[ax], %[as], %[diag];" @@ -105,115 +72,8 @@ void mempool_Lttrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, : [ax] "+&r"(ax), [bx] "+&r"(bx) : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag) :); - x[2U * (n - i - 1)] = ax; - x[2U * (n - i - 1) + 1] = bx; - } - return; -} - -/** - @brief Single-core solution of lower triangular system - (from transposed lower triangular system) - @param[in] pL input triangular matrix - @param[in] in known variables vector - @param[in] x unknown solutions vector - @param[in] n dimension of the system - @return none -*/ - -void mempool_Ltrisol_folded_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, - const uint32_t n) { - - uint32_t i, j; - __fp16 a, b; - __fp16 c, d; - - __fp16 as, bs; - __fp16 ax, bx; - __fp16 diag; - - // Solve for each variable x_i in turn - for (i = 0; i < n; i++) { - diag = pL[i * N_BANKS + 2 * i]; - as = (__fp16)in[2U * i]; - bs = (__fp16)in[2U * i + 1]; - // Use the previously solved variables to calculate the sum - for (j = 0; j < i; j++) { - a = pL[i * N_BANKS + 2 * j]; - b = pL[i * N_BANKS + 2 * j + 1]; - c = x[2U * j]; - d = x[2U * j + 1]; - asm volatile("fnmsub.h %[as], %[a], %[c], %[as];" - "fnmsub.h %[bs], %[a], %[d], %[bs];" - "fmadd.h %[as], %[b], %[d], %[as];" - "fnmsub.h %[bs], %[b], %[c], %[bs];" - : [as] "+&r"(as), [bs] "+&r"(bs) - : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) - :); - } - // Subtract the sum from b_i and divide by the diagonal element L[i][i] - asm volatile("fdiv.h %[ax], %[as], %[diag];" - "fdiv.h %[bx], %[bs], %[diag];" - : [ax] "+&r"(ax), [bx] "+&r"(bx) - : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag) - :); - x[2U * i] = ax; - x[2U * i + 1] = bx; - } - return; -} - -/** - @brief Single-core solution of upper triangular system - Output data is folded to the core's local memory. - (from transposed lower triangular system) - @param[in] pL input triangular matrix to be transposed - @param[in] in known variables vector - @param[in] x unknown solutions vector - @param[in] n dimension of the system - @return none -*/ - -void mempool_Lttrisol_folded_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, - const uint32_t n) { - - uint32_t i, j; - __fp16 a, b; - __fp16 c, d; - - __fp16 as, bs; - __fp16 ax, bx; - __fp16 diag; - - // Solve for each variable x_i in turn - for (i = 0; i < n; i++) { - uint32_t rev_i = (n - 1 - i); - diag = pL[rev_i * N_BANKS + 2 * rev_i]; - as = (__fp16)in[2 * rev_i]; - bs = (__fp16)in[2 * rev_i + 1]; - // Use the previously solved variables to calculate the sum - for (j = 0; j < i; j++) { - uint32_t rev_j = (n - 1 - j); - a = pL[rev_j * N_BANKS + 2 * rev_i]; - b = pL[rev_j * N_BANKS + 2 * rev_i + 1]; - c = x[2U * rev_j]; - d = x[2U * rev_j + 1]; - asm volatile("fnmsub.h %[as], %[a], %[c], %[as];" - "fnmsub.h %[as], %[b], %[d], %[as];" - "fnmsub.h %[bs], %[a], %[d], %[bs];" - "fmadd.h %[bs], %[b], %[c], %[bs];" - : [as] "+&r"(as), [bs] "+&r"(bs) - : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) - :); - } - // Subtract the sum from b_i and divide by the diagonal element L[i][i] - asm volatile("fdiv.h %[ax], %[as], %[diag];" - "fdiv.h %[bx], %[bs], %[diag];" - : [ax] "+&r"(ax), [bx] "+&r"(bx) - : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag) - :); - x[2U * rev_i] = ax; - x[2U * rev_i + 1] = bx; + x[2U * ridx] = ax; + x[2U * ridx + 1] = bx; } return; } @@ -229,7 +89,8 @@ void mempool_Lttrisol_folded_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, @return none */ -void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n) { +void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n, + const uint32_t transposed, const uint32_t folded) { uint32_t i, j; __fp16 a, b; @@ -237,82 +98,44 @@ void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n) { __fp16 as, bs; __fp16 diag; + const uint32_t offset = folded ? N_BANKS : n; float ax, bx, diag_f32; v2h res; // Solve for each variable x_i in turn for (i = 0; i < n; i++) { - diag = pL[2 * (i * n + i)]; - as = (__fp16)in[2U * i]; - bs = (__fp16)in[2U * i + 1]; + uint32_t ridx = transposed ? (n - i - 1) : i; + diag = pL[2U * (ridx * offset + ridx)]; + as = (__fp16)in[2U * ridx]; + bs = (__fp16)in[2U * ridx + 1]; // Use the previously solved variables to calculate the sum for (j = 0; j < i; j++) { - a = pL[2U * (i * n + j)]; - b = pL[2U * (i * n + j) + 1]; - c = x[2U * j]; - d = x[2U * j + 1]; - asm volatile("fnmsub.h %[as], %[a], %[c], %[as];" - "fnmsub.h %[bs], %[a], %[d], %[bs];" - "fmadd.h %[as], %[b], %[d], %[as];" - "fnmsub.h %[bs], %[b], %[c], %[bs];" - : [as] "+&r"(as), [bs] "+&r"(bs) - : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) - :); - } - // Subtract the sum from b_i and divide by the diagonal element L[i][i] - asm volatile("fcvt.s.h %0, %1;" : "=r"(diag_f32) : "r"(diag) :); - asm volatile("fcvt.s.h %0, %1;" : "=r"(ax) : "r"(as) :); - asm volatile("fcvt.s.h %0, %1;" : "=r"(bx) : "r"(bs) :); - ax = ax / diag_f32; - bx = bx / diag_f32; - asm volatile("vfcpka.h.s %0, %1, %2;" : "=r"(res) : "r"(ax), "r"(bx) :); - (*(v2h *)&x[2U * i]) = res; - } - return; -} - -/** - @brief Single-core solution of upper triangular system - (from transposed lower triangular system) - @param[in] pL input triangular matrix to be transposed - @param[in] in known variables vector - @param[in] x unknown solutions vector - @param[in] n dimension of the system - @return none -*/ - -void mempool_Lttrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, - const uint32_t n) { - uint32_t i, j; - __fp16 a, b; - __fp16 c, d; - - __fp16 as, bs; - __fp16 diag; - - float ax, bx, diag_f32; - v2h res; - - // Solve for each variable x_i in turn - for (i = 0; i < n; i++) { - diag = pL[2 * ((n - 1 - i) * n + (n - 1 - i))]; - as = (__fp16)in[2 * (n - i - 1)]; - bs = (__fp16)in[2 * (n - i - 1) + 1]; - // Use the previously solved variables to calculate the sum - for (j = 0; j < i; j++) { - a = pL[2U * ((n - 1 - j) * n + (n - 1 - i))]; - b = pL[2U * ((n - 1 - j) * n + (n - 1 - i)) + 1]; - c = x[2U * (n - 1 - j)]; - d = x[2U * (n - 1 - j) + 1]; - asm volatile("fnmsub.h %[as], %[a], %[c], %[as];" - "fnmsub.h %[as], %[b], %[d], %[as];" - "fnmsub.h %[bs], %[a], %[d], %[bs];" - "fmadd.h %[bs], %[b], %[c], %[bs];" - : [as] "+&r"(as), [bs] "+&r"(bs) - : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) - :); + uint32_t volatile cidx = transposed ? (n - j - 1) : j; + c = x[2U * cidx]; + d = x[2U * cidx + 1]; + if (transposed) { + a = pL[2U * (cidx * offset + ridx)]; + b = pL[2U * (cidx * offset + ridx) + 1]; + asm volatile("fnmsub.h %[as], %[a], %[c], %[as];" + "fnmsub.h %[as], %[b], %[d], %[as];" + "fnmsub.h %[bs], %[a], %[d], %[bs];" + "fmadd.h %[bs], %[b], %[c], %[bs];" + : [as] "+&r"(as), [bs] "+&r"(bs) + : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) + :); + } else { + a = pL[2U * (ridx * offset + cidx)]; + b = pL[2U * (ridx * offset + cidx) + 1]; + asm volatile("fnmsub.h %[as], %[a], %[c], %[as];" + "fnmsub.h %[bs], %[a], %[d], %[bs];" + "fmadd.h %[as], %[b], %[d], %[as];" + "fnmsub.h %[bs], %[b], %[c], %[bs];" + : [as] "+&r"(as), [bs] "+&r"(bs) + : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) + :); + } } // Subtract the sum from b_i and divide by the diagonal element L[i][i] asm volatile("fcvt.s.h %0, %1;" : "=r"(diag_f32) : "r"(diag) :); @@ -321,7 +144,7 @@ void mempool_Lttrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, ax = ax / diag_f32; bx = bx / diag_f32; asm volatile("vfcpka.h.s %0, %1, %2;" : "=r"(res) : "r"(ax), "r"(bx) :); - (*(v2h *)&x[2U * (n - i - 1)]) = res; + (*(v2h *)&x[2U * ridx]) = res; } return; } diff --git a/software/kernels/baremetal/mempool_linearsolver_f32s.h b/software/kernels/baremetal/mempool_linearsolver_f32s.h index c3f3b6ce1..d3297397c 100644 --- a/software/kernels/baremetal/mempool_linearsolver_f32s.h +++ b/software/kernels/baremetal/mempool_linearsolver_f32s.h @@ -15,10 +15,12 @@ @param[in] in known variables vector @param[in] x unknown solutions vector @param[in] n dimension of the system + @param[in] transposed solve transposed system @return none */ -void mempool_Ltrisol_f32s(float *pL, float *in, float *x, const uint32_t n) { +void mempool_Ltrisol_f32s(float *pL, float *in, float *x, const uint32_t n, + const uint32_t transposed, const uint32_t folded) { uint32_t i, j; float a, b; @@ -27,122 +29,28 @@ void mempool_Ltrisol_f32s(float *pL, float *in, float *x, const uint32_t n) { float as, bs; float ax, bx; float diag; + const uint32_t offset = folded ? N_BANKS : n; // Solve for each variable x_i in turn for (i = 0; i < n; i++) { - as = in[2U * i]; - bs = in[2U * i + 1]; - diag = pL[2 * (i * n + i)]; + uint32_t ridx = transposed ? (n - i - 1) : i; + diag = pL[2U * (ridx * offset + ridx)]; + as = in[2U * ridx]; + bs = in[2U * ridx + 1]; // Use the previously solved variables to calculate the sum for (j = 0; j < i; j++) { - a = pL[2U * (i * n + j)]; - b = pL[2U * (i * n + j) + 1]; - c = x[2U * j]; - d = x[2U * j + 1]; - asm volatile("fnmsub.s %[as], %[a], %[c], %[as];" - "fnmsub.s %[bs], %[a], %[d], %[bs];" - "fmadd.s %[as], %[b], %[d], %[as];" - "fnmsub.s %[bs], %[b], %[c], %[bs];" - : [as] "+&r"(as), [bs] "+&r"(bs) - : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) - :); - } - // Subtract the sum from b_i and divide by the diagonal element L[i][i] - asm volatile("fdiv.s %[ax], %[as], %[diag];" - "fdiv.s %[bx], %[bs], %[diag];" - : [ax] "+&r"(ax), [bx] "+&r"(bx) - : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag) - :); - x[2U * i] = ax; - x[2U * i + 1] = bx; - } - return; -} - -/** - @brief Single-core solution of upper triangular system - (from transposed lower triangular system) - @param[in] pL input triangular matrix to be transposed - @param[in] in known variables vector - @param[in] x unknown solutions vector - @param[in] n dimension of the system - @return none -*/ - -void mempool_Lttrisol_f32s(float *pL, float *in, float *x, const uint32_t n) { - uint32_t i, j; - float a, b; - float c, d; - - float as, bs; - float ax, bx; - float diag; - - // Solve for each variable x_i in turn - for (i = 0; i < n; i++) { - diag = pL[2 * ((n - 1 - i) * n + (n - 1 - i))]; - as = in[2 * (n - i - 1)]; - bs = in[2 * (n - i - 1) + 1]; - // Use the previously solved variables to calculate the sum - for (j = 0; j < i; j++) { - a = pL[2U * ((n - 1 - j) * n + (n - 1 - i))]; - b = pL[2U * ((n - 1 - j) * n + (n - 1 - i)) + 1]; - c = x[2U * (n - 1 - j)]; - d = x[2U * (n - 1 - j) + 1]; - asm volatile("fnmsub.s %[as], %[a], %[c], %[as];" - "fnmsub.s %[as], %[b], %[d], %[as];" - "fnmsub.s %[bs], %[a], %[d], %[bs];" - "fmadd.s %[bs], %[b], %[c], %[bs];" - : [as] "+&r"(as), [bs] "+&r"(bs) - : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) - :); - } - // Subtract the sum from b_i and divide by the diagonal element L[i][i] - asm volatile("fdiv.s %[ax], %[as], %[diag];" - "fdiv.s %[bx], %[bs], %[diag];" - : [ax] "+&r"(ax), [bx] "+&r"(bx) - : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag) - :); - x[2U * (n - i - 1)] = ax; - x[2U * (n - i - 1) + 1] = bx; - } - return; -} - -/** - @brief Single-core solution of lower triangular system - (from transposed lower triangular system) - @param[in] pL input triangular matrix - @param[in] in known variables vector - @param[in] x unknown solutions vector - @param[in] n dimension of the system - @return none -*/ - -void mempool_Ltrisol_folded_f32s(float *pL, float *in, float *x, - const uint32_t n) { - - uint32_t i, j; - float a, b; - float c, d; - - float as, bs; - float ax, bx; - float diag; - uint32_t banks_per_row = (n / 2) * N_BANKS; - - // Solve for each variable x_i in turn - for (i = 0; i < n; i++) { - diag = pL[i * banks_per_row + (i / 2) * N_BANKS + 2 * (i % 2)]; - as = in[(i / 2) * N_BANKS + 2 * (i % 2)]; - bs = in[(i / 2) * N_BANKS + 2 * (i % 2) + 1]; - // Use the previously solved variables to calculate the sum - for (j = 0; j < i; j++) { - a = pL[i * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2)]; - b = pL[i * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2) + 1]; - c = x[(j / 2) * N_BANKS + 2 * (j % 2)]; - d = x[(j / 2) * N_BANKS + 2 * (j % 2) + 1]; + uint32_t cidx = transposed ? (n - j - 1) : j; + if (!transposed) { + a = pL[2U * (ridx * offset + cidx)]; + b = pL[2U * (ridx * offset + cidx) + 1]; + } else { + a = pL[2U * (cidx * offset + ridx)]; + b = pL[2U * (cidx * offset + ridx) + 1]; + } + + c = x[2U * cidx]; + d = x[2U * cidx + 1]; asm volatile("fnmsub.s %[as], %[a], %[c], %[as];" "fnmsub.s %[bs], %[a], %[d], %[bs];" "fmadd.s %[as], %[b], %[d], %[as];" @@ -157,67 +65,8 @@ void mempool_Ltrisol_folded_f32s(float *pL, float *in, float *x, : [ax] "+&r"(ax), [bx] "+&r"(bx) : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag) :); - x[(i / 2) * N_BANKS + 2 * (i % 2)] = ax; - x[(i / 2) * N_BANKS + 2 * (i % 2) + 1] = bx; - } - return; -} - -/** - @brief Single-core solution of upper triangular system - Output data is folded to the core's local memory. - (from transposed lower triangular system) - @param[in] pL input triangular matrix to be transposed - @param[in] in known variables vector - @param[in] x unknown solutions vector - @param[in] n dimension of the system - @return none -*/ - -void mempool_Lttrisol_folded_f32s(float *pL, float *in, float *x, - const uint32_t n) { - - uint32_t i, j; - float a, b; - float c, d; - - float as, bs; - float ax, bx; - float diag; - uint32_t banks_per_row = (n / 2) * N_BANKS; - - // Solve for each variable x_i in turn - for (i = 0; i < n; i++) { - // reversed i index - uint32_t rev_i = (n - 1 - i); - diag = pL[rev_i * banks_per_row + (rev_i / 2) * N_BANKS + 2 * (rev_i % 2)]; - as = in[(rev_i / 2) * N_BANKS + 2 * (rev_i % 2)]; - bs = in[(rev_i / 2) * N_BANKS + 2 * (rev_i % 2) + 1]; - // Use the previously solved variables to calculate the sum - for (j = 0; j < i; j++) { - // reversed j index - uint32_t rev_j = (n - 1 - j); - a = pL[rev_j * banks_per_row + (rev_i / 2) * N_BANKS + 2 * (rev_i % 2)]; - b = pL[rev_j * banks_per_row + (rev_i / 2) * N_BANKS + 2 * (rev_i % 2) + - 1]; - c = x[2U * rev_j]; - d = x[2U * rev_j + 1]; - asm volatile("fnmsub.s %[as], %[a], %[c], %[as];" - "fnmsub.s %[as], %[b], %[d], %[as];" - "fnmsub.s %[bs], %[a], %[d], %[bs];" - "fmadd.s %[bs], %[b], %[c], %[bs];" - : [as] "+&r"(as), [bs] "+&r"(bs) - : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d) - :); - } - // Subtract the sum from b_i and divide by the diagonal element L[i][i] - asm volatile("fdiv.s %[ax], %[as], %[diag];" - "fdiv.s %[bx], %[bs], %[diag];" - : [ax] "+&r"(ax), [bx] "+&r"(bx) - : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag) - :); - x[2U * rev_i] = ax; - x[2U * rev_i + 1] = bx; + x[2U * ridx] = ax; + x[2U * ridx + 1] = bx; } return; } diff --git a/software/kernels/baremetal/mempool_linearsolver_q16s.h b/software/kernels/baremetal/mempool_linearsolver_q16s.h index 4c3c2ad4a..cd9134968 100644 --- a/software/kernels/baremetal/mempool_linearsolver_q16s.h +++ b/software/kernels/baremetal/mempool_linearsolver_q16s.h @@ -5,7 +5,6 @@ // Author: Marco Bertuletti, ETH Zurich #pragma once -#define N_BANKS (NUM_CORES * BANKING_FACTOR) /** @brief Single-core solution of lower triangular system @@ -13,11 +12,12 @@ @param[in] y known variables vector @param[in] x unknown solutions vector @param[in] n dimension of the system + @param[in] transposed solve transposed system @return none */ void mempool_Ltrisol_q16vecs(int16_t *pL, int16_t *y, int16_t *x, - const uint32_t n) { + const uint32_t n, const uint32_t transposed) { uint32_t i, j; int32_t as, bs, diag; @@ -27,15 +27,20 @@ void mempool_Ltrisol_q16vecs(int16_t *pL, int16_t *y, int16_t *x, // Solve for each variable x[i] in loop for (i = 0; i < n; i++) { - // Pre-load the diagonal element - diag = pL[2U * (i * n + i)]; + uint32_t ridx = transposed ? (n - i - 1) : i; + diag = pL[2U * (ridx * offset + ridx)]; // Initialize the sums as = 0; bs = 0; // Use the previously solved variables to compute the sum for (j = 0; j < i; j++) { - ab = *(v2s *)&pL[2U * (i * n + j)]; - cd = *(v2s *)&x[2U * j]; + uint32_t cidx = transposed ? (n - j - 1) : j; + if (!transposed) { + ab = *(v2s *)&pL[2U * (ridx * n + cidx)]; + } else { + ab = *(v2s *)&pL[2U * (cidx * n + ridx)]; + } + cd = *(v2s *)&x[2U * cidx]; const uint32_t shuffle_mask = 0x00020003; asm volatile( // s = s + (ac + bd) + j(bc - ad) @@ -53,76 +58,15 @@ void mempool_Ltrisol_q16vecs(int16_t *pL, int16_t *y, int16_t *x, :); } // Subtract the sum from y[i] and divide by the diagonal element L[i][i] - as = as - (int32_t)y[2U * i]; - bs = bs - (int32_t)y[2U * i + 1]; + as = as - (int32_t)y[2U * ridx]; + bs = bs - (int32_t)y[2U * ridx + 1]; asm volatile("div %[as], %[as], %[diag];" "div %[bs], %[bs], %[diag];" "pv.pack %[res], %[as], %[bs];" : [as] "+&r"(as), [bs] "+&r"(bs), [res] "+&r"(res) : [diag] "r"(diag) :); - (*(v2s *)&x[2U * i]) = res; - } - - return; -} - -/** - @brief Single-core solution of upper triangular system - (from transposed lower triangular system) - @param[in] pL input triangular matrix to be transposed - @param[in] y known variables vector - @param[in] x unknown solutions vector - @param[in] n dimension of the system - @return none -*/ - -void mempool_Lttrisol_q16vecs(int16_t *pL, int16_t *y, int16_t *x, - const uint32_t n) { - - uint32_t i, j; - int32_t as, bs, diag; - v2s ab, cd; - v2s res = (v2s){0, 0}; - v2s ndc = (v2s){0, 0}; - - // Solve for each variable x[i] in loop - for (i = 0; i < n; i++) { - // Pre-load the diagonal element - diag = pL[2 * ((n - 1 - i) * n + (n - 1 - i))]; - // Initialize the sums - as = 0; - bs = 0; - // Use the previously solved variables to compute the sum - for (j = 0; j < i; j++) { - ab = *(v2s *)&pL[2U * ((n - 1 - j) * n + (n - 1 - i))]; - cd = *(v2s *)&x[2U * (n - 1 - j)]; - const uint32_t shuffle_mask = 0x00020003; - asm volatile( - // s = s + (ac + bd) + j(bc - ad) - "pv.dotsp.h %[as], %[ab], %[cd];" - "pv.shuffle2.h %[ndc], %[cd], %[shuffle_mask];" - "pv.sub.h %[ndc], %[zero], %[ndc];" - "pv.dotsp.h %[bs], %[ab], %[ndc];" - "srai %[as], %[as], 0x8;" - "srai %[bs], %[bs], 0x8;" - "p.clip %[bs], %[bs], 0x16;" - "p.clip %[as], %[as], 0x16;" - : [as] "+&r"(as), [bs] "+&r"(bs), [ndc] "+r"(ndc) - : [ab] "r"(ab), [cd] "r"(cd), [shuffle_mask] "r"(shuffle_mask), - [zero] "r"((v2s){0, 0}) - :); - } - // Subtract the sum from y[i] and divide by the diagonal element L[i][i] - as = as - (int32_t)y[2 * (n - i - 1)]; - bs = bs - (int32_t)y[2 * (n - i - 1) + 1]; - asm volatile("div %[as], %[as], %[diag];" - "div %[bs], %[bs], %[diag];" - "pv.pack %[res], %[as], %[bs];" - : [as] "+&r"(as), [bs] "+&r"(bs), [res] "+&r"(res) - : [diag] "r"(diag) - :); - (*(v2s *)&x[2U * (n - i - 1)]) = res; + (*(v2s *)&x[2U * ridx]) = res; } return; diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h index 37834107d..45076f9fe 100644 --- a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h +++ b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h @@ -31,7 +31,7 @@ */ void mempool_hermitian_f16s(__fp16 *pH, __fp16 *pG, __fp16 *pS, const uint32_t n_rx, const uint32_t n_tx, - const uint32_t folded, const uint32_t zf) { + const uint32_t zf, const uint32_t folded) { uint32_t i, j, k; __fp16 a; @@ -112,29 +112,16 @@ void mempool_hermitian_f16s(__fp16 *pH, __fp16 *pG, __fp16 *pS, bs3 = (__fp16)0.0f; } } - if (!folded) { - // Store - pG[2 * (i * n_tx + j)] = as0; - pG[2 * (i * n_tx + j + 1U)] = as1; - pG[2 * (i * n_tx + j + 2U)] = as2; - pG[2 * (i * n_tx + j + 3U)] = as3; - pG[2 * (i * n_tx + j) + 1U] = bs0; - pG[2 * (i * n_tx + j + 1U) + 1U] = bs1; - pG[2 * (i * n_tx + j + 2U) + 1U] = bs2; - pG[2 * (i * n_tx + j + 3U) + 1U] = bs3; - } else { - // uint32_t addr = i * (n_tx / 4) * N_BANKS + (j / 4) * N_BANKS; - uint32_t addr = i * N_BANKS; - // Store - pG[addr] = as0; - pG[addr + 1U] = bs0; - pG[addr + 2U] = as1; - pG[addr + 3U] = bs1; - pG[addr + 4U] = as2; - pG[addr + 5U] = bs2; - pG[addr + 6U] = as3; - pG[addr + 7U] = bs3; - } + uint32_t const offset = folded ? N_BANKS : n_tx; + // Store + pG[2 * (i * offset + j)] = as0; + pG[2 * (i * offset + j + 1U)] = as1; + pG[2 * (i * offset + j + 2U)] = as2; + pG[2 * (i * offset + j + 3U)] = as3; + pG[2 * (i * offset + j) + 1U] = bs0; + pG[2 * (i * offset + j + 1U) + 1U] = bs1; + pG[2 * (i * offset + j + 2U) + 1U] = bs2; + pG[2 * (i * offset + j + 3U) + 1U] = bs3; } } return; @@ -151,8 +138,7 @@ void mempool_hermitian_f16s(__fp16 *pH, __fp16 *pG, __fp16 *pS, @return none */ void mempool_MVP_conjtransp_f16s(__fp16 *pH, __fp16 *px, __fp16 *py, - const uint32_t n_rx, const uint32_t n_tx, - const uint32_t folded) { + const uint32_t n_rx, const uint32_t n_tx) { uint32_t i, j; __fp16 a0, a1, a2, a3; @@ -215,31 +201,16 @@ void mempool_MVP_conjtransp_f16s(__fp16 *pH, __fp16 *px, __fp16 *py, [a3] "r"(a3), [b0] "r"(b0), [b1] "r"(b1), [b2] "r"(b2), [b3] "r"(b3) :); } - if (!folded) { - // Store - py[2U * i] = as0; - py[2U * (i + 1U)] = as1; - py[2U * (i + 2U)] = as2; - py[2U * (i + 3U)] = as3; - py[2U * i + 1U] = bs0; - py[2U * (i + 1U) + 1U] = bs1; - py[2U * (i + 2U) + 1U] = bs2; - py[2U * (i + 3U) + 1U] = bs3; - i += 4; - } else { - // Store - uint32_t addr = (i / 4) * N_BANKS; - py[addr] = as0; - py[addr + 1U] = bs0; - py[addr + 2U] = as1; - py[addr + 3U] = bs1; - py[addr + 4U] = as2; - py[addr + 5U] = bs2; - py[addr + 6U] = as3; - py[addr + 7U] = bs3; - i += 4; - } - + // Store + py[2U * i] = as0; + py[2U * (i + 1U)] = as1; + py[2U * (i + 2U)] = as2; + py[2U * (i + 3U)] = as3; + py[2U * i + 1U] = bs0; + py[2U * (i + 1U) + 1U] = bs1; + py[2U * (i + 2U) + 1U] = bs2; + py[2U * (i + 3U) + 1U] = bs3; + i += 4; } while (i < n_tx); return; } @@ -267,7 +238,7 @@ void mempool_MVP_conjtransp_f16s(__fp16 *pH, __fp16 *px, __fp16 *py, */ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS, const uint32_t n_rx, const uint32_t n_tx, - const uint32_t zf) { + const uint32_t zf, const uint32_t folded) { uint32_t i, j, k; v2h ab; v2h cd0, cd1, cd2, cd3; @@ -282,7 +253,7 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS, for (i = 0; i < n_tx; i++) { - if (n_tx == 1) { + if (n_tx % 4 != 0) { as0 = 0.0f; // Initialize the real part of sums bs0 = 0.0f; // Initialize the imag part of sums // Inner Loop @@ -312,9 +283,11 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS, asm volatile("and %0, %0, %1;" : "+&r"(res0) : "r"(0x0000FFFF)); asm volatile("fadd.h %0, %0, %1;" : "+&r"(res0) : "r"(pS[2 * i])); } - (*(v2h *)&pG[2 * (i * n_tx + j)]) = res0; + // Store + uint32_t addr = folded ? 2 * (i * N_BANKS + j) : 2 * (i * n_tx + j); + (*(v2h *)&pG[addr]) = res0; - } else if (n_tx >= 4) { + } else { // UNROLL_4 for (j = 0; j < n_tx; j += 4) { as0 = 0.0f; @@ -381,11 +354,12 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS, asm volatile("fadd.h %0, %0, %1;" : "+&r"(res3) : "r"(pS[2 * i])); } } + uint32_t const offset = folded ? N_BANKS : n_tx; // Store - (*(v2h *)&pG[2 * (i * n_tx + j)]) = res0; - (*(v2h *)&pG[2 * (i * n_tx + j + 1U)]) = res1; - (*(v2h *)&pG[2 * (i * n_tx + j + 2U)]) = res2; - (*(v2h *)&pG[2 * (i * n_tx + j + 3U)]) = res3; + (*(v2h *)&pG[2 * (i * offset + j)]) = res0; + (*(v2h *)&pG[2 * (i * offset + j + 1U)]) = res1; + (*(v2h *)&pG[2 * (i * offset + j + 2U)]) = res2; + (*(v2h *)&pG[2 * (i * offset + j + 3U)]) = res3; } } } @@ -393,7 +367,7 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS, #else for (i = 0; i < n_tx; i++) { - if (n_tx >= 4) { + if (n_tx % 4 == 0) { // UNROLL_4 for (j = 0; j < n_tx; j += 4) { v2h res0 = (v2h)0.0f; @@ -439,10 +413,12 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS, asm volatile("fadd.h %0, %0, %1;" : "+&r"(res3) : "r"(pS[2 * i])); } } - (*(v2h *)&pG[2 * (i * n_tx + j)]) = res0; - (*(v2h *)&pG[2 * (i * n_tx + j + 1U)]) = res1; - (*(v2h *)&pG[2 * (i * n_tx + j + 2U)]) = res2; - (*(v2h *)&pG[2 * (i * n_tx + j + 3U)]) = res3; + uint32_t const offset = folded ? N_BANKS : n_tx; + // Store + (*(v2h *)&pG[2 * (i * offset + j)]) = res0; + (*(v2h *)&pG[2 * (i * offset + j + 1U)]) = res1; + (*(v2h *)&pG[2 * (i * offset + j + 2U)]) = res2; + (*(v2h *)&pG[2 * (i * offset + j + 3U)]) = res3; } } } @@ -478,7 +454,7 @@ void mempool_MVP_conjtransp_f16vecs(__fp16 *pH, __fp16 *px, __fp16 *py, #endif #ifndef __CDOTP - if (n_tx < 4) { + if (n_tx % 4 != 0) { for (i = 0; i < n_tx; i++) { as0 = 0.0f; // Initialize the real part of sums bs0 = 0.0f; // Initialize the imag part of sums @@ -505,7 +481,7 @@ void mempool_MVP_conjtransp_f16vecs(__fp16 *pH, __fp16 *px, __fp16 *py, *(v2h *)&py[2U * i] = res0; } - } else if (n_tx >= 4) { + } else { // UNROLL_4 for (i = 0; i < n_tx; i += 4) { as0 = 0.0f; @@ -557,7 +533,7 @@ void mempool_MVP_conjtransp_f16vecs(__fp16 *pH, __fp16 *px, __fp16 *py, } } #else - if (n_tx >= 4) { + if (n_tx % 4 == 0) { // UNROLL_4 for (i = 0; i < n_tx; i += 4) { res0 = (v2h)0.0f; diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f32p.h b/software/kernels/baremetal/mempool_mimo_mmse_f32p.h index 0dd492d33..7e3e6fe1a 100644 --- a/software/kernels/baremetal/mempool_mimo_mmse_f32p.h +++ b/software/kernels/baremetal/mempool_mimo_mmse_f32p.h @@ -18,7 +18,7 @@ @param[in] zf controls if the zero forcing is used @return none */ -void mempool_hermitian_f32p(float *pH, float *pG, float *sigma, +void mempool_hermitian_f32p(float *pH, float *pG, float *pS, const uint32_t n_rx, const uint32_t n_tx, const uint32_t folded, const uint32_t zf, const uint32_t core_id, const uint32_t nPE) { @@ -88,55 +88,30 @@ void mempool_hermitian_f32p(float *pH, float *pG, float *sigma, } if (zf == 0) { // Compute diagonal element - float s = sigma[i]; if (i == j) { - asm volatile("fadd.s %[as0], %[as0], %[sigma];" - : [as0] "+&r"(as0) - : [sigma] "r"(s) - :); + asm volatile("fadd.s %0, %0, %1;" : "+&r"(as0) : "r"(pS[2U * i])); bs0 = 0.0f; } else if (i == (j + 1U)) { - asm volatile("fadd.s %[as1], %[as1], %[sigma];" - : [as1] "+&r"(as1) - : [sigma] "r"(s) - :); + asm volatile("fadd.s %0, %0, %1;" : "+&r"(as1) : "r"(pS[2U * i])); bs1 = 0.0f; } else if (i == (j + 2U)) { - asm volatile("fadd.s %[as2], %[as2], %[sigma];" - : [as2] "+&r"(as2) - : [sigma] "r"(s) - :); + asm volatile("fadd.s %0, %0, %1;" : "+&r"(as2) : "r"(pS[2U * i])); bs2 = 0.0f; } else if (i == (j + 3U)) { - asm volatile("fadd.s %[as3], %[as3], %[sigma];" - : [as3] "+&r"(as3) - : [sigma] "r"(s) - :); + asm volatile("fadd.s %0, %0, %1;" : "+&r"(as3) : "r"(pS[2U * i])); bs3 = 0.0f; } } - if (!folded) { - // Store - pG[2 * (i * n_tx + j)] = as0; - pG[2 * (i * n_tx + j + 1U)] = as1; - pG[2 * (i * n_tx + j + 2U)] = as2; - pG[2 * (i * n_tx + j + 3U)] = as3; - pG[2 * (i * n_tx + j) + 1U] = bs0; - pG[2 * (i * n_tx + j + 1U) + 1U] = bs1; - pG[2 * (i * n_tx + j + 2U) + 1U] = bs2; - pG[2 * (i * n_tx + j + 3U) + 1U] = bs3; - } else { - // Store - uint32_t addr = i * ((n_tx / 2) * N_BANKS) + (j / 4) * (2 * N_BANKS); - pG[addr] = as0; - pG[addr + 1U] = bs0; - pG[addr + 2U] = as1; - pG[addr + 3U] = bs1; - pG[addr + N_BANKS] = as2; - pG[addr + N_BANKS + 1U] = bs2; - pG[addr + N_BANKS + 2U] = as3; - pG[addr + N_BANKS + 3U] = bs3; - } + uint32_t const offset = folded ? N_BANKS : n_tx; + // Store + pG[2 * (i * offset + j)] = as0; + pG[2 * (i * offset + j + 1U)] = as1; + pG[2 * (i * offset + j + 2U)] = as2; + pG[2 * (i * offset + j + 3U)] = as3; + pG[2 * (i * offset + j) + 1U] = bs0; + pG[2 * (i * offset + j + 1U) + 1U] = bs1; + pG[2 * (i * offset + j + 2U) + 1U] = bs2; + pG[2 * (i * offset + j + 3U) + 1U] = bs3; } } mempool_log_partial_barrier(2, mempool_get_core_id(), nPE); diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f32s.h b/software/kernels/baremetal/mempool_mimo_mmse_f32s.h index 311052b10..baad28e0d 100644 --- a/software/kernels/baremetal/mempool_mimo_mmse_f32s.h +++ b/software/kernels/baremetal/mempool_mimo_mmse_f32s.h @@ -14,13 +14,13 @@ @param[in] pS points to the noise vector @param[in] nrx number of received samples @param[in] ntx number of transmitted samples - @param[in] folded controls if output is folded @param[in] zf controls if the zero forcing is used + @param[in] folded controls if output is folded @return none */ void mempool_hermitian_f32s(float *pH, float *pG, float *pS, const uint32_t n_rx, const uint32_t n_tx, - const uint32_t folded, const uint32_t zf) { + const uint32_t zf, const uint32_t folded) { uint32_t i, j, k; float a; @@ -86,55 +86,31 @@ void mempool_hermitian_f32s(float *pH, float *pG, float *pS, :); } if (zf == 0) { - // Compute diagonal elements + // Compute diagonal element if (i == j) { - asm volatile("fadd.s %[as0], %[as0], %[pS];" - : [as0] "+&r"(as0) - : [pS] "r"(pS[i]) - :); + asm volatile("fadd.s %0, %0, %1;" : "+&r"(as0) : "r"(pS[2U * i])); bs0 = 0.0f; } else if (i == (j + 1U)) { - asm volatile("fadd.s %[as1], %[as1], %[pS];" - : [as1] "+&r"(as1) - : [pS] "r"(pS[i]) - :); + asm volatile("fadd.s %0, %0, %1;" : "+&r"(as1) : "r"(pS[2U * i])); bs1 = 0.0f; } else if (i == (j + 2U)) { - asm volatile("fadd.s %[as2], %[as2], %[pS];" - : [as2] "+&r"(as2) - : [pS] "r"(pS[i]) - :); + asm volatile("fadd.s %0, %0, %1;" : "+&r"(as2) : "r"(pS[2U * i])); bs2 = 0.0f; } else if (i == (j + 3U)) { - asm volatile("fadd.s %[as3], %[as3], %[pS];" - : [as3] "+&r"(as3) - : [pS] "r"(pS[i]) - :); + asm volatile("fadd.s %0, %0, %1;" : "+&r"(as3) : "r"(pS[2U * i])); bs3 = 0.0f; } } - if (!folded) { - // Store - pG[2 * (i * n_tx + j)] = as0; - pG[2 * (i * n_tx + j + 1U)] = as1; - pG[2 * (i * n_tx + j + 2U)] = as2; - pG[2 * (i * n_tx + j + 3U)] = as3; - pG[2 * (i * n_tx + j) + 1U] = bs0; - pG[2 * (i * n_tx + j + 1U) + 1U] = bs1; - pG[2 * (i * n_tx + j + 2U) + 1U] = bs2; - pG[2 * (i * n_tx + j + 3U) + 1U] = bs3; - } else { - // Store - uint32_t addr = i * ((n_tx / 2) * N_BANKS) + (j / 4) * (2 * N_BANKS); - pG[addr] = as0; - pG[addr + 1U] = bs0; - pG[addr + 2U] = as1; - pG[addr + 3U] = bs1; - pG[addr + N_BANKS] = as2; - pG[addr + N_BANKS + 1U] = bs2; - pG[addr + N_BANKS + 2U] = as3; - pG[addr + N_BANKS + 3U] = bs3; - } + uint32_t const offset = folded ? N_BANKS : n_tx; + // Store + pG[2U * (i * offset + j)] = as0; + pG[2U * (i * offset + j + 1U)] = as1; + pG[2U * (i * offset + j + 2U)] = as2; + pG[2U * (i * offset + j + 3U)] = as3; + pG[2U * (i * offset + j) + 1U] = bs0; + pG[2U * (i * offset + j + 1U) + 1U] = bs1; + pG[2U * (i * offset + j + 2U) + 1U] = bs2; + pG[2U * (i * offset + j + 3U) + 1U] = bs3; } } return; @@ -215,30 +191,16 @@ void mempool_MVP_conjtransp_f32s(float *pH, float *px, float *py, [a3] "r"(a3), [b0] "r"(b0), [b1] "r"(b1), [b2] "r"(b2), [b3] "r"(b3) :); } - if (!folded) { - // Store - py[2U * i] = as0; - py[2U * (i + 1U)] = as1; - py[2U * (i + 2U)] = as2; - py[2U * (i + 3U)] = as3; - py[2U * i + 1U] = bs0; - py[2U * (i + 1U) + 1U] = bs1; - py[2U * (i + 2U) + 1U] = bs2; - py[2U * (i + 3U) + 1U] = bs3; - i += 4; - } else { - // Store - uint32_t addr = (i / 4) * 2 * N_BANKS; - py[addr] = as0; - py[addr + 1U] = bs0; - py[addr + 2U] = as1; - py[addr + 3U] = bs1; - py[addr + N_BANKS] = as2; - py[addr + N_BANKS + 1U] = bs2; - py[addr + N_BANKS + 2U] = as3; - py[addr + N_BANKS + 3U] = bs3; - i += 4; - } + // Store + py[2U * i] = as0; + py[2U * (i + 1U)] = as1; + py[2U * (i + 2U)] = as2; + py[2U * (i + 3U)] = as3; + py[2U * i + 1U] = bs0; + py[2U * (i + 1U) + 1U] = bs1; + py[2U * (i + 2U) + 1U] = bs2; + py[2U * (i + 3U) + 1U] = bs3; + i += 4; } while (i < n_tx); return; } diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f8s.h b/software/kernels/baremetal/mempool_mimo_mmse_f8s.h index 74d772c64..476d5f3bb 100644 --- a/software/kernels/baremetal/mempool_mimo_mmse_f8s.h +++ b/software/kernels/baremetal/mempool_mimo_mmse_f8s.h @@ -20,7 +20,7 @@ */ void mempool_hermitian_f8s(__fp8 *pH, __fp8 *pS, __fp16 *pG, const uint32_t n_rx, const uint32_t n_tx, - const uint32_t zf) { + const uint32_t zf, const uint32_t folded) { uint32_t i, j, k; __fp8 a; @@ -116,16 +116,16 @@ void mempool_hermitian_f8s(__fp8 *pH, __fp8 *pS, __fp16 *pG, FP16_bs3 = (__fp16)0U; } } - + uint32_t const offset = folded ? N_BANKS : n_tx; // Store - pG[2 * (i * n_tx + j)] = FP16_as0; - pG[2 * (i * n_tx + j + 1U)] = FP16_as1; - pG[2 * (i * n_tx + j + 2U)] = FP16_as2; - pG[2 * (i * n_tx + j + 3U)] = FP16_as3; - pG[2 * (i * n_tx + j) + 1U] = FP16_bs0; - pG[2 * (i * n_tx + j + 1U) + 1U] = FP16_bs1; - pG[2 * (i * n_tx + j + 2U) + 1U] = FP16_bs2; - pG[2 * (i * n_tx + j + 3U) + 1U] = FP16_bs3; + pG[2 * (i * offset + j)] = FP16_as0; + pG[2 * (i * offset + j + 1U)] = FP16_as1; + pG[2 * (i * offset + j + 2U)] = FP16_as2; + pG[2 * (i * offset + j + 3U)] = FP16_as3; + pG[2 * (i * offset + j) + 1U] = FP16_bs0; + pG[2 * (i * offset + j + 1U) + 1U] = FP16_bs1; + pG[2 * (i * offset + j + 2U) + 1U] = FP16_bs2; + pG[2 * (i * offset + j + 3U) + 1U] = FP16_bs3; } } return; @@ -229,7 +229,7 @@ void mempool_MVP_conjtransp_f8s(__fp8 *pH, __fp8 *px, __fp16 *py, void mempool_hermitian_f8vecs(__fp8 *pH, __fp8 *pS, __fp16 *pG, const uint32_t n_rx, const uint32_t n_tx, - const uint32_t zf) { + const uint32_t zf, const uint32_t folded) { uint32_t i, j, k; uint32_t fe0, fe1, fe2, fe3; @@ -303,11 +303,12 @@ void mempool_hermitian_f8vecs(__fp8 *pH, __fp8 *pS, __fp16 *pG, asm volatile("and %0, %0, %1;" : "+&r"(fe3) : "r"(0x0000FFFF)); } } + uint32_t const offset = folded ? N_BANKS : n_tx; // Store - (*(uint32_t *)&pG[2 * (i * n_tx + j)]) = fe0; - (*(uint32_t *)&pG[2 * (i * n_tx + j + 1U)]) = fe1; - (*(uint32_t *)&pG[2 * (i * n_tx + j + 2U)]) = fe2; - (*(uint32_t *)&pG[2 * (i * n_tx + j + 3U)]) = fe3; + (*(uint32_t *)&pG[2 * (i * offset + j)]) = fe0; + (*(uint32_t *)&pG[2 * (i * offset + j + 1U)]) = fe1; + (*(uint32_t *)&pG[2 * (i * offset + j + 2U)]) = fe2; + (*(uint32_t *)&pG[2 * (i * offset + j + 3U)]) = fe3; } } return;