Skip to content

Commit

Permalink
[software] Clean up folded MIMO-MMSE and solution of triangular system
Browse files: browse the repository at this point in the history
  • Loading branch information
mbertuletti committed Oct 16, 2024
1 parent 85683ed commit c7c4ea6
Show file tree
Hide file tree
Showing 16 changed files with 409 additions and 1,006 deletions.
71 changes: 47 additions & 24 deletions software/apps/baremetal/mimo_mmse_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
#include "baremetal/mempool_mimo_mmse_f16s.h"

#include "data_mimo_mmse_f16.h"
#define ZF (0) // When asserted use zero-forcing
#define ZF (0) // When asserted use zero-forcing
#define FOLD (1) // When asserted fold matrices in memory
#define NUM_BANKS (BANKING_FACTOR * NUM_CORES)
//#define DOUBLE_BUFFERING

/**********************************************************
**********************************************************
Expand All @@ -35,13 +35,21 @@

#ifndef DOUBLE_BUFFERING

__fp16 l1_H[2 * N_TX * N_RX * N_ITR]
#if FOLD
#define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS))
__fp16 l1_G[2 * N_TX * NUM_BANKS * NUM_ROW]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_L[2 * N_TX * NUM_BANKS * NUM_ROW]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
#else
__fp16 l1_G[2 * N_TX * N_TX * N_ITR]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_L[2 * N_TX * N_TX * N_ITR]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
#endif

__fp16 l1_H[2 * N_TX * N_RX * N_ITR]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_S[2 * N_TX * N_ITR]
__attribute__((aligned(sizeof(int32_t)), section(".l1")));
__fp16 l1_y[2 * N_RX * N_ITR]
Expand Down Expand Up @@ -99,16 +107,16 @@ int main() {
__fp16 *Ptry3 = y3 + itr * (2 * N_TX);
__fp16 *Ptrx = l1_x + itr * (2 * N_TX);
#ifdef VEC
mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF);
mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, 0, ZF);
mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, 0);
#else
mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, 0, ZF);
mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
mempool_cholesky_f16s(PtrG, PtrL, N_TX);
mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX);
mempool_cholesky_f16s(PtrG, PtrL, N_TX, 0);
#endif
mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, 0);
mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, 0);
}
mempool_stop_benchmark();
}
Expand All @@ -118,26 +126,40 @@ int main() {
mempool_start_benchmark();
// Parallel subcarrier loop
for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {

__fp16 *PtrH = l1_H + itr * (2 * N_TX * N_RX);
__fp16 *Ptry = l1_y + itr * (2 * N_RX);
__fp16 *PtrS = l1_S + itr * (2 * N_TX);
// Auxiliary vectors
#if FOLD
__fp16 *PtrG = l1_G + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) +
(itr / NUM_ROW) * (2 * N_TX);
__fp16 *PtrL = l1_L + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) +
(itr / NUM_ROW) * (2 * N_TX);
__fp16 *Ptry2 =
y2 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX);
__fp16 *Ptry3 =
y3 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX);
__fp16 *Ptrx = l1_x + itr * (2 * N_TX);
#else
__fp16 *PtrG = l1_G + itr * (2 * N_TX * N_TX);
__fp16 *PtrL = l1_L + itr * (2 * N_TX * N_TX);
__fp16 *Ptry2 = y2 + itr * (2 * N_TX);
__fp16 *Ptry3 = y3 + itr * (2 * N_TX);
__fp16 *Ptrx = l1_x + itr * (2 * N_TX);
#endif

#ifdef VEC
mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF);
mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD);
mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, FOLD);
#else
mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, 0, ZF);
mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
mempool_cholesky_f16s(PtrG, PtrL, N_TX);
mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD);
mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX);
mempool_cholesky_f16s(PtrG, PtrL, N_TX, FOLD);
#endif
mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD);
mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD);
}
mempool_barrier(num_cores);
mempool_stop_benchmark();
Expand All @@ -152,6 +174,7 @@ int main() {
}
}
#else
// mempool_check_f16(l1_x, l2_x, 4 * N_TX, 0.1f, 0);
mempool_barrier(num_cores);
#endif

Expand Down Expand Up @@ -264,11 +287,11 @@ int main() {
__fp16 *PtrL = L + itr * (2 * N_TX * N_TX);
__fp16 *Ptry2 = y2 + itr * (2 * N_TX);
__fp16 *Ptry3 = y3 + itr * (2 * N_TX);
mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF);
mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, 0, ZF);
mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, 0);
mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, 0);
mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, 0);
}
#endif

Expand All @@ -291,7 +314,7 @@ int main() {
__fp16 *PtrS = cmpt_S + itr * (2 * N_TX);
__fp16 *PtrG = G + itr * (2 * N_TX * N_TX);
__fp16 *Ptry2 = y2 + itr * (2 * N_TX);
mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF);
mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, 0, ZF);
mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
}
mempool_log_barrier(2, core_id);
Expand All @@ -313,9 +336,9 @@ int main() {
__fp16 *PtrL = L + itr * (2 * N_TX * N_TX);
__fp16 *Ptry2 = y2 + itr * (2 * N_TX);
__fp16 *Ptry3 = y3 + itr * (2 * N_TX);
mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, 0);
mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, 0);
mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, 0);
}
#endif

Expand Down
120 changes: 48 additions & 72 deletions software/apps/baremetal/mimo_mmse_f32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,25 @@

#include "data_mimo_mmse_f32.h"

//#define SINGLE
//#define JACOBI
#define PARALLEL
#define SINGLE

float l1_H[2 * N_TX * N_RX * N_ITR]
__attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
section(".l1_prio")));
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
float l1_G[2 * N_TX * N_TX * N_ITR]
__attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
section(".l1_prio")));
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
float l1_L[2 * N_TX * N_TX * N_ITR]
__attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
section(".l1_prio")));
float l1_Sigma[N_TX * N_ITR] __attribute__((section(".l1_prio")));

float l1_y[2 * N_RX * N_ITR] __attribute__((section(".l1_prio")));
float y2[2 * N_TX * N_ITR] __attribute__((section(".l1_prio")));
float y3[2 * N_TX * N_ITR] __attribute__((section(".l1_prio")));
float l1_x[2 * N_TX * N_ITR] __attribute__((section(".l1_prio")));
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
float l1_S[2 * N_TX * N_ITR]
__attribute__((aligned(sizeof(int32_t)), section(".l1")));
float l1_y[2 * N_RX * N_ITR]
__attribute__((aligned(sizeof(int32_t)), section(".l1")));

float y2[2 * N_TX * N_ITR]
__attribute__((aligned(sizeof(int32_t)), section(".l1")));
float y3[2 * N_TX * N_ITR]
__attribute__((aligned(sizeof(int32_t)), section(".l1")));
float l1_x[2 * N_TX * N_ITR]
__attribute__((aligned(sizeof(int32_t)), section(".l1")));

// Driver program
int main() {
Expand All @@ -52,33 +52,23 @@ int main() {
if (core_id == 0) {
dma_memcpy_blocking(l1_H, l2_H, 2 * N_ITR * N_RX * N_TX * sizeof(int32_t));
dma_memcpy_blocking(l1_y, l2_y, 2 * N_ITR * N_RX * sizeof(int32_t));
dma_memcpy_blocking(l1_Sigma, l2_Sigma, N_ITR * N_TX * sizeof(int32_t));
dma_memcpy_blocking(l1_S, l2_S, 2 * N_ITR * N_TX * sizeof(int32_t));
}
mempool_barrier(num_cores);

#if defined(SINGLE) && defined(__XDIVSQRT)
/* Benchmark */
if (core_id == 0) {
mempool_start_benchmark();
mempool_hermitian_f32s(l1_H, l1_G, l1_Sigma, N_RX, N_TX, 0, 0);
mempool_hermitian_f32s(l1_H, l1_G, l1_S, N_RX, N_TX, 0, 0);
mempool_MVP_conjtransp_f32s(l1_H, l1_y, y2, N_RX, N_TX, 0);
mempool_cholesky_f32s(l1_G, l1_L, N_TX);
mempool_Ltrisol_f32s(l1_L, y2, y3, N_TX);
mempool_Lttrisol_f32s(l1_L, y3, l1_x, N_TX);
mempool_stop_benchmark();
}
mempool_barrier(num_cores);
#endif

#ifdef JACOBI
/* Benchmark */
if (core_id == 0) {
mempool_start_benchmark();
mempool_hermitian_f32s(l1_H, l1_G, l1_Sigma, N_RX, N_TX, 0, 0);
mempool_MVP_conjtransp_f32s(l1_H, l1_y, y2, N_RX, N_TX, 0);
mempool_stop_benchmark();
mempool_start_benchmark();
mempool_jacobi_f32s(l1_G, y2, l1_x, N_TX, 0.005f, 20U);
#else
mempool_cholesky_f32s(l1_G, l1_L, N_TX, 0);
mempool_Ltrisol_f32s(l1_L, y2, y3, N_TX, 0, 0);
mempool_Ltrisol_f32s(l1_L, y3, l1_x, N_TX, 1, 0);
#endif
mempool_stop_benchmark();
}
mempool_barrier(num_cores);
Expand All @@ -88,21 +78,35 @@ int main() {
// Each iteration is assigned to a processor
mempool_start_benchmark();
for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {

// Inputs
float *PtrH = l1_H + itr * (2 * N_TX * N_RX);
float *PtrSigma = l1_Sigma + itr * N_TX;
float *PtrS = l1_S + itr * (2 * N_TX);
float *Ptry = l1_y + itr * (2 * N_RX);
// Intermediate results and outputs
#if FOLD
__fp16 *PtrG = l1_G + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) +
(itr / NUM_ROW) * (2 * N_TX);
__fp16 *PtrL = l1_L + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) +
(itr / NUM_ROW) * (2 * N_TX);
__fp16 *Ptry2 =
y2 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX);
__fp16 *Ptry3 =
y3 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX);
__fp16 *Ptrx = l1_x + itr * (2 * N_TX);
#else
float *PtrG = l1_G + itr * (2 * N_TX * N_TX);
float *PtrL = l1_L + itr * (2 * N_TX * N_TX);
float *Ptry2 = y2 + itr * (2 * N_TX);
float *Ptry3 = y3 + itr * (2 * N_TX);
float *Ptrx = l1_x + itr * (2 * N_TX);
mempool_hermitian_f32s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0);
mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
mempool_cholesky_f32s(PtrG, PtrL, N_TX);
mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX);
mempool_Lttrisol_f32s(PtrL, Ptry3, Ptrx, N_TX);
#endif

mempool_hermitian_f32s(PtrH, PtrG, PtrS, N_RX, N_TX, 0, FOLD);
mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX);
mempool_cholesky_f32s(PtrG, PtrL, N_TX, FOLD);
mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD);
mempool_Ltrisol_f32s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD);
}
mempool_log_barrier(2, core_id);
mempool_stop_benchmark();
Expand All @@ -118,9 +122,9 @@ int main() {
for (uint32_t itr = pool_id; itr < N_ITR; itr += num_pools) {
float *PtrH = l1_H + itr * (2 * N_TX * N_RX);
float *PtrG = l1_G + itr * (2 * N_TX * N_TX);
float *PtrSigma = l1_Sigma + itr * N_TX;
mempool_hermitian_f32p(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0,
core_id % N_TX, N_TX);
float *PtrS = l1_S + itr * N_TX;
mempool_hermitian_f32p(PtrH, PtrG, PtrS, N_RX, N_TX, 0, 0, core_id % N_TX,
N_TX);
}
mempool_stop_benchmark();
mempool_start_benchmark();
Expand All @@ -135,37 +139,9 @@ int main() {
float *Ptry3 = y3 + itr * (2 * N_TX);
float *Ptrx = l1_x + itr * (2 * N_TX);
mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
mempool_cholesky_f32s(PtrG, PtrL, N_TX);
mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX);
mempool_Lttrisol_f32s(PtrL, Ptry3, Ptrx, N_TX);
}
mempool_log_barrier(2, core_id);
mempool_stop_benchmark();
#endif

#if defined(FOLDED) && defined(__XDIVSQRT)
mempool_start_benchmark();
for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
// Inputs
float *PtrH = l1_H + itr * (2 * N_TX * N_RX);
float *PtrSigma = l1_Sigma + itr * N_TX;
float *Ptry = l1_y + itr * (2 * N_RX);
// Intermediate results and outputs
float *PtrG = l1_G + (itr % num_cores) * N_TX +
(itr / num_cores) * (2 * N_TX * N_BANKS);
float *PtrL = l1_L + (itr % num_cores) * N_TX +
(itr / num_cores) * (2 * N_TX * N_BANKS);
float *Ptry2 =
y2 + (itr % num_cores) * N_TX + (itr / num_cores) * (2 * N_BANKS);
float *Ptry3 =
y3 + (itr % num_cores) * N_TX + (itr / num_cores) * (2 * N_BANKS);
float *Ptrx =
l1_x + (itr % num_cores) * N_TX + (itr / num_cores) * (2 * N_BANKS);
mempool_hermitian_f32s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 1, 0);
mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX, 1);
mempool_cholesky_folded_f32s(PtrG, PtrL, N_TX);
mempool_Ltrisol_folded_f32s(PtrL, Ptry2, Ptry3, N_TX);
mempool_Lttrisol_folded_f32s(PtrL, Ptry3, Ptrx, N_TX);
mempool_cholesky_f32s(PtrG, PtrL, N_TX, 0);
mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX, 0, 0);
mempool_Ltrisol_f32s(PtrL, Ptry3, Ptrx, N_TX, 1, 0);
}
mempool_log_barrier(2, core_id);
mempool_stop_benchmark();
Expand Down
Loading

0 comments on commit c7c4ea6

Please sign in to comment.