From 13ef7599e18cc7599bbef69b94224b8d5ec93210 Mon Sep 17 00:00:00 2001
From: mbertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Wed, 16 Oct 2024 08:46:23 +0200
Subject: [PATCH] [software] Clean up folded MIMO-MMSE and solution of
 triangular system

---
 software/apps/baremetal/mimo_mmse_f16/main.c  |  87 +++--
 software/apps/baremetal/mimo_mmse_f32/main.c  | 120 +++----
 software/apps/baremetal/mimo_mmse_f8/main.c   |  67 ++--
 software/apps/baremetal/mimo_mmse_q16/main.c  |   8 +-
 software/data/data_mimo_mmse_f32.h.tpl        |   2 +-
 software/data/generate_mimo_mmse.py           |   4 +-
 software/kernels/baremetal/mempool_checks.h   |  51 +--
 .../kernels/baremetal/mempool_cholesky_f16s.h | 179 ++++------
 .../kernels/baremetal/mempool_cholesky_f32s.h | 109 +-----
 .../baremetal/mempool_linearsolver_f16s.h     | 311 ++++--------------
 .../baremetal/mempool_linearsolver_f32s.h     | 193 ++---------
 .../baremetal/mempool_linearsolver_q16s.h     |  84 +----
 .../baremetal/mempool_mimo_mmse_f16s.h        | 110 +++----
 .../baremetal/mempool_mimo_mmse_f32p.h        |  55 +---
 .../baremetal/mempool_mimo_mmse_f32s.h        |  92 ++----
 .../kernels/baremetal/mempool_mimo_mmse_f8s.h |  31 +-
 16 files changed, 467 insertions(+), 1036 deletions(-)

diff --git a/software/apps/baremetal/mimo_mmse_f16/main.c b/software/apps/baremetal/mimo_mmse_f16/main.c
index 218f4d247..ffd879c91 100644
--- a/software/apps/baremetal/mimo_mmse_f16/main.c
+++ b/software/apps/baremetal/mimo_mmse_f16/main.c
@@ -18,9 +18,9 @@
 #include "baremetal/mempool_mimo_mmse_f16s.h"
 
 #include "data_mimo_mmse_f16.h"
-#define ZF (0) // When asserted use zero-forcing
+#define ZF (0)   // When asserted use zero-forcing
+#define FOLD (0) // When asserted fold matrices in memory
 #define NUM_BANKS (BANKING_FACTOR * NUM_CORES)
-//#define DOUBLE_BUFFERING
 
 /**********************************************************
  **********************************************************
@@ -35,13 +35,21 @@
 
 #ifndef DOUBLE_BUFFERING
 
-__fp16 l1_H[2 * N_TX * N_RX * N_ITR]
+#if FOLD
+#define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS))
+__fp16 l1_G[2 * N_TX * NUM_BANKS * NUM_ROW]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_L[2 * N_TX * NUM_BANKS * NUM_ROW]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+#else
 __fp16 l1_G[2 * N_TX * N_TX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 __fp16 l1_L[2 * N_TX * N_TX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+#endif
 
+__fp16 l1_H[2 * N_TX * N_RX * N_ITR]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 __fp16 l1_S[2 * N_TX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1")));
 __fp16 l1_y[2 * N_RX * N_ITR]
@@ -62,9 +70,9 @@ int main() {
   mempool_barrier_init(core_id); // Initialize barrier and synchronize
 #endif
 
-#ifdef BANSHEE
   /* Initialize matrices */
   if (core_id == 0) {
+#ifdef BANSHEE
     for (uint32_t i = 0; i < N_RX * N_TX * N_ITR; i++) {
       (*(uint32_t *)&l1_H[2 * i]) = *(uint32_t *)&l2_H[2 * i];
     }
@@ -74,14 +82,14 @@ int main() {
     for (uint32_t i = 0; i < N_TX * N_ITR; i++) {
       (*(uint32_t *)&l1_S[2 * i]) = *(uint32_t *)&l2_S[2 * i];
     }
-  }
 #else
-  /* Initialize matrices */
-  if (core_id == 0) {
     dma_memcpy_blocking(l1_H, l2_H, 2 * N_TX * N_RX * N_ITR * sizeof(int16_t));
     dma_memcpy_blocking(l1_y, l2_y, 2 * N_RX * N_ITR * sizeof(int16_t));
     dma_memcpy_blocking(l1_S, l2_S, 2 * N_TX * N_ITR * sizeof(int16_t));
+#endif
+    printf("Data transferred\n");
   }
+#ifndef BANSHEE
   mempool_barrier(num_cores);
 #endif
 
@@ -99,16 +107,16 @@ int main() {
       __fp16 *Ptry3 = y3 + itr * (2 * N_TX);
       __fp16 *Ptrx = l1_x + itr * (2 * N_TX);
 #ifdef VEC
-      mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF);
+      mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, FOLD, ZF);
       mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
-      mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
+      mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, 0);
 #else
-      mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, 0, ZF);
-      mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
-      mempool_cholesky_f16s(PtrG, PtrL, N_TX);
+      mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, FOLD, ZF);
+      mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX);
+      mempool_cholesky_f16s(PtrG, PtrL, N_TX, 0);
 #endif
-      mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
-      mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
+      mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD);
+      mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD);
     }
     mempool_stop_benchmark();
   }
@@ -116,35 +124,51 @@ int main() {
 
 #ifdef PARALLEL
   mempool_start_benchmark();
+  uint32_t time_init = mempool_get_timer();
   // Parallel subcarrier loop
   for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
+
     __fp16 *PtrH = l1_H + itr * (2 * N_TX * N_RX);
     __fp16 *Ptry = l1_y + itr * (2 * N_RX);
     __fp16 *PtrS = l1_S + itr * (2 * N_TX);
     // Auxiliary vectors
+#if FOLD
+    __fp16 *PtrG = l1_G + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) +
+                   (itr / NUM_ROW) * (2 * N_TX);
+    __fp16 *PtrL = l1_L + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) +
+                   (itr / NUM_ROW) * (2 * N_TX);
+    __fp16 *Ptry2 =
+        y2 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX);
+    __fp16 *Ptry3 =
+        y3 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX);
+    __fp16 *Ptrx = l1_x + itr * (2 * N_TX);
+#else
     __fp16 *PtrG = l1_G + itr * (2 * N_TX * N_TX);
     __fp16 *PtrL = l1_L + itr * (2 * N_TX * N_TX);
     __fp16 *Ptry2 = y2 + itr * (2 * N_TX);
     __fp16 *Ptry3 = y3 + itr * (2 * N_TX);
     __fp16 *Ptrx = l1_x + itr * (2 * N_TX);
+#endif
+
 #ifdef VEC
-    mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF);
+    mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD);
     mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
-    mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
+    mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, FOLD);
 #else
-    mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, 0, ZF);
-    mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
-    mempool_cholesky_f16s(PtrG, PtrL, N_TX);
+    mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD);
+    mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX);
+    mempool_cholesky_f16s(PtrG, PtrL, N_TX, FOLD);
 #endif
-    mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
-    mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
+    mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD);
+    mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD);
   }
   mempool_barrier(num_cores);
+  uint32_t time_end = mempool_get_timer();
   mempool_stop_benchmark();
 #endif
 
-  // Check the result
 #ifdef BANSHEE
+  // Check the result
   if (core_id == 0) {
     for (uint32_t i = 0; i < 2 * N_TX * N_ITR; i++) {
       uint32_t x = (*(uint32_t *)&l1_x[i]) & 0x0000FFFF;
@@ -152,6 +176,9 @@ int main() {
     }
   }
 #else
+  if (core_id == 0) {
+    printf("Runtime: %d\n", time_end - time_init);
+  }
   mempool_barrier(num_cores);
 #endif
 
@@ -264,11 +291,11 @@ int main() {
       __fp16 *PtrL = L + itr * (2 * N_TX * N_TX);
       __fp16 *Ptry2 = y2 + itr * (2 * N_TX);
       __fp16 *Ptry3 = y3 + itr * (2 * N_TX);
-      mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF);
+      mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, 0, ZF);
       mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
-      mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
-      mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
-      mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
+      mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, 0);
+      mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, 0);
+      mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, 0);
     }
 #endif
 
@@ -291,7 +318,7 @@ int main() {
       __fp16 *PtrS = cmpt_S + itr * (2 * N_TX);
       __fp16 *PtrG = G + itr * (2 * N_TX * N_TX);
       __fp16 *Ptry2 = y2 + itr * (2 * N_TX);
-      mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF);
+      mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, 0, ZF);
       mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
     }
     mempool_log_barrier(2, core_id);
@@ -313,9 +340,9 @@ int main() {
       __fp16 *PtrL = L + itr * (2 * N_TX * N_TX);
       __fp16 *Ptry2 = y2 + itr * (2 * N_TX);
       __fp16 *Ptry3 = y3 + itr * (2 * N_TX);
-      mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
-      mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
-      mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
+      mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, 0);
+      mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, 0);
+      mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, 0);
     }
 #endif
 
diff --git a/software/apps/baremetal/mimo_mmse_f32/main.c b/software/apps/baremetal/mimo_mmse_f32/main.c
index aa95b1919..194c4c71c 100644
--- a/software/apps/baremetal/mimo_mmse_f32/main.c
+++ b/software/apps/baremetal/mimo_mmse_f32/main.c
@@ -21,25 +21,25 @@
 
 #include "data_mimo_mmse_f32.h"
 
-//#define SINGLE
-//#define JACOBI
-#define PARALLEL
+#define SINGLE
 
 float l1_H[2 * N_TX * N_RX * N_ITR]
-    __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
-                   section(".l1_prio")));
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 float l1_G[2 * N_TX * N_TX * N_ITR]
-    __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
-                   section(".l1_prio")));
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 float l1_L[2 * N_TX * N_TX * N_ITR]
-    __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
-                   section(".l1_prio")));
-float l1_Sigma[N_TX * N_ITR] __attribute__((section(".l1_prio")));
-
-float l1_y[2 * N_RX * N_ITR] __attribute__((section(".l1_prio")));
-float y2[2 * N_TX * N_ITR] __attribute__((section(".l1_prio")));
-float y3[2 * N_TX * N_ITR] __attribute__((section(".l1_prio")));
-float l1_x[2 * N_TX * N_ITR] __attribute__((section(".l1_prio")));
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+float l1_S[2 * N_TX * N_ITR]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1")));
+float l1_y[2 * N_RX * N_ITR]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1")));
+
+float y2[2 * N_TX * N_ITR]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1")));
+float y3[2 * N_TX * N_ITR]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1")));
+float l1_x[2 * N_TX * N_ITR]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1")));
 
 // Driver program
 int main() {
@@ -52,7 +52,7 @@ int main() {
   if (core_id == 0) {
     dma_memcpy_blocking(l1_H, l2_H, 2 * N_ITR * N_RX * N_TX * sizeof(int32_t));
     dma_memcpy_blocking(l1_y, l2_y, 2 * N_ITR * N_RX * sizeof(int32_t));
-    dma_memcpy_blocking(l1_Sigma, l2_Sigma, N_ITR * N_TX * sizeof(int32_t));
+    dma_memcpy_blocking(l1_S, l2_S, 2 * N_ITR * N_TX * sizeof(int32_t));
   }
   mempool_barrier(num_cores);
 
@@ -60,25 +60,15 @@ int main() {
   /* Benchmark */
   if (core_id == 0) {
     mempool_start_benchmark();
-    mempool_hermitian_f32s(l1_H, l1_G, l1_Sigma, N_RX, N_TX, 0, 0);
+    mempool_hermitian_f32s(l1_H, l1_G, l1_S, N_RX, N_TX, 0, 0);
     mempool_MVP_conjtransp_f32s(l1_H, l1_y, y2, N_RX, N_TX, 0);
-    mempool_cholesky_f32s(l1_G, l1_L, N_TX);
-    mempool_Ltrisol_f32s(l1_L, y2, y3, N_TX);
-    mempool_Lttrisol_f32s(l1_L, y3, l1_x, N_TX);
-    mempool_stop_benchmark();
-  }
-  mempool_barrier(num_cores);
-#endif
-
 #ifdef JACOBI
-  /* Benchmark */
-  if (core_id == 0) {
-    mempool_start_benchmark();
-    mempool_hermitian_f32s(l1_H, l1_G, l1_Sigma, N_RX, N_TX, 0, 0);
-    mempool_MVP_conjtransp_f32s(l1_H, l1_y, y2, N_RX, N_TX, 0);
-    mempool_stop_benchmark();
-    mempool_start_benchmark();
     mempool_jacobi_f32s(l1_G, y2, l1_x, N_TX, 0.005f, 20U);
+#else
+    mempool_cholesky_f32s(l1_G, l1_L, N_TX, 0);
+    mempool_Ltrisol_f32s(l1_L, y2, y3, N_TX, 0, 0);
+    mempool_Ltrisol_f32s(l1_L, y3, l1_x, N_TX, 1, 0);
+#endif
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
@@ -88,21 +78,35 @@ int main() {
   // Each iteration is assigned to a processor
   mempool_start_benchmark();
   for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
+
     // Inputs
     float *PtrH = l1_H + itr * (2 * N_TX * N_RX);
-    float *PtrSigma = l1_Sigma + itr * N_TX;
+    float *PtrS = l1_S + itr * (2 * N_TX);
     float *Ptry = l1_y + itr * (2 * N_RX);
     // Intermediate results and outputs
+#if FOLD
+    __fp16 *PtrG = l1_G + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) +
+                   (itr / NUM_ROW) * (2 * N_TX);
+    __fp16 *PtrL = l1_L + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) +
+                   (itr / NUM_ROW) * (2 * N_TX);
+    __fp16 *Ptry2 =
+        y2 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX);
+    __fp16 *Ptry3 =
+        y3 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX);
+    __fp16 *Ptrx = l1_x + itr * (2 * N_TX);
+#else
     float *PtrG = l1_G + itr * (2 * N_TX * N_TX);
     float *PtrL = l1_L + itr * (2 * N_TX * N_TX);
     float *Ptry2 = y2 + itr * (2 * N_TX);
     float *Ptry3 = y3 + itr * (2 * N_TX);
     float *Ptrx = l1_x + itr * (2 * N_TX);
-    mempool_hermitian_f32s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0);
-    mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
-    mempool_cholesky_f32s(PtrG, PtrL, N_TX);
-    mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX);
-    mempool_Lttrisol_f32s(PtrL, Ptry3, Ptrx, N_TX);
+#endif
+
+    mempool_hermitian_f32s(PtrH, PtrG, PtrS, N_RX, N_TX, 0, FOLD);
+    mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX);
+    mempool_cholesky_f32s(PtrG, PtrL, N_TX, FOLD);
+    mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD);
+    mempool_Ltrisol_f32s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD);
   }
   mempool_log_barrier(2, core_id);
   mempool_stop_benchmark();
@@ -118,9 +122,9 @@ int main() {
   for (uint32_t itr = pool_id; itr < N_ITR; itr += num_pools) {
     float *PtrH = l1_H + itr * (2 * N_TX * N_RX);
     float *PtrG = l1_G + itr * (2 * N_TX * N_TX);
-    float *PtrSigma = l1_Sigma + itr * N_TX;
-    mempool_hermitian_f32p(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0,
-                           core_id % N_TX, N_TX);
+    float *PtrS = l1_S + itr * N_TX;
+    mempool_hermitian_f32p(PtrH, PtrG, PtrS, N_RX, N_TX, 0, 0, core_id % N_TX,
+                           N_TX);
   }
   mempool_stop_benchmark();
   mempool_start_benchmark();
@@ -135,37 +139,9 @@ int main() {
     float *Ptry3 = y3 + itr * (2 * N_TX);
     float *Ptrx = l1_x + itr * (2 * N_TX);
     mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
-    mempool_cholesky_f32s(PtrG, PtrL, N_TX);
-    mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX);
-    mempool_Lttrisol_f32s(PtrL, Ptry3, Ptrx, N_TX);
-  }
-  mempool_log_barrier(2, core_id);
-  mempool_stop_benchmark();
-#endif
-
-#if defined(FOLDED) && defined(__XDIVSQRT)
-  mempool_start_benchmark();
-  for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
-    // Inputs
-    float *PtrH = l1_H + itr * (2 * N_TX * N_RX);
-    float *PtrSigma = l1_Sigma + itr * N_TX;
-    float *Ptry = l1_y + itr * (2 * N_RX);
-    // Intermediate results and outputs
-    float *PtrG = l1_G + (itr % num_cores) * N_TX +
-                  (itr / num_cores) * (2 * N_TX * N_BANKS);
-    float *PtrL = l1_L + (itr % num_cores) * N_TX +
-                  (itr / num_cores) * (2 * N_TX * N_BANKS);
-    float *Ptry2 =
-        y2 + (itr % num_cores) * N_TX + (itr / num_cores) * (2 * N_BANKS);
-    float *Ptry3 =
-        y3 + (itr % num_cores) * N_TX + (itr / num_cores) * (2 * N_BANKS);
-    float *Ptrx =
-        l1_x + (itr % num_cores) * N_TX + (itr / num_cores) * (2 * N_BANKS);
-    mempool_hermitian_f32s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 1, 0);
-    mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX, 1);
-    mempool_cholesky_folded_f32s(PtrG, PtrL, N_TX);
-    mempool_Ltrisol_folded_f32s(PtrL, Ptry2, Ptry3, N_TX);
-    mempool_Lttrisol_folded_f32s(PtrL, Ptry3, Ptrx, N_TX);
+    mempool_cholesky_f32s(PtrG, PtrL, N_TX, 0);
+    mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX, 0, 0);
+    mempool_Ltrisol_f32s(PtrL, Ptry3, Ptrx, N_TX, 1, 0);
   }
   mempool_log_barrier(2, core_id);
   mempool_stop_benchmark();
diff --git a/software/apps/baremetal/mimo_mmse_f8/main.c b/software/apps/baremetal/mimo_mmse_f8/main.c
index cd6258ad6..294e917bc 100644
--- a/software/apps/baremetal/mimo_mmse_f8/main.c
+++ b/software/apps/baremetal/mimo_mmse_f8/main.c
@@ -14,26 +14,34 @@
 
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cholesky_f16s.h"
-#include "baremetal/mempool_cholesky_f8s.h"
 #include "baremetal/mempool_linearsolver_f16s.h"
-#include "baremetal/mempool_linearsolver_f8s.h"
 #include "baremetal/mempool_mimo_mmse_f8s.h"
 
 #include "data_mimo_mmse_f8.h"
-#define ZF (0) // When asserted use zero-forcing
+#define ZF (0)   // When asserted use zero-forcing
+#define FOLD (0) // When asserted fold matrixes in memory
 #define NUM_BANKS (BANKING_FACTOR * NUM_CORES)
 
+#if FOLD
+
+#define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS))
+__fp16 l1_G[2 * N_TX * NUM_BANKS * NUM_ROW]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_L[2 * N_TX * NUM_BANKS * NUM_ROW]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+#else
+__fp16 l1_G[2 * N_TX * N_TX * N_ITR]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_L[2 * N_TX * N_TX * N_ITR]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+#endif
+
 __fp8 l1_H[2 * N_TX * N_RX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 __fp8 l1_S[2 * N_TX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1")));
 __fp8 l1_y[2 * N_RX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1")));
-
-__fp16 l1_G[2 * N_TX * N_TX * N_ITR]
-    __attribute__((aligned(NUM_BANKS * sizeof(int32_t)), section(".l1_prio")));
-__fp16 l1_L[2 * N_TX * N_TX * N_ITR]
-    __attribute__((aligned(NUM_BANKS * sizeof(int32_t)), section(".l1_prio")));
 __fp16 y2[2 * N_TX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1")));
 __fp16 y3[2 * N_TX * N_ITR]
@@ -90,17 +98,17 @@ int main() {
       __fp16 *Ptrx = l1_x + itr * (2 * N_TX);
       // 8b
 #ifdef VEC
-      mempool_hermitian_f8vecs(PtrH, PtrS, PtrG, N_RX, N_TX, ZF);
+      mempool_hermitian_f8vecs(PtrH, PtrS, PtrG, N_RX, N_TX, ZF, 0);
       mempool_MVP_conjtransp_f8vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
-      mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
+      mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, 0);
 #else
-      mempool_hermitian_f8s(PtrH, PtrS, PtrG, N_RX, N_TX, ZF);
+      mempool_hermitian_f8s(PtrH, PtrS, PtrG, N_RX, N_TX, ZF, 0);
       mempool_MVP_conjtransp_f8s(PtrH, Ptry, Ptry2, N_RX, N_TX);
-      mempool_cholesky_f16s(PtrG, PtrL, N_TX);
+      mempool_cholesky_f16s(PtrG, PtrL, N_TX, 0);
 #endif
       // 16b
-      mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
-      mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
+      mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, 0);
+      mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, 0);
     }
     mempool_stop_benchmark();
   }
@@ -108,31 +116,47 @@ int main() {
 
 #ifdef PARALLEL
   mempool_start_benchmark();
+  uint32_t time_init = mempool_get_timer();
   // Parallel subcarrier loop
   for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
+
     __fp8 *PtrH = l1_H + itr * (2 * N_TX * N_RX);
     __fp8 *Ptry = l1_y + itr * (2 * N_RX);
     __fp8 *PtrS = l1_S + itr * (2 * N_TX);
     // Auxiliary vectors
+#if FOLD
+    __fp16 *PtrG = l1_G + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) +
+                   (itr / NUM_ROW) * (2 * N_TX);
+    __fp16 *PtrL = l1_L + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) +
+                   (itr / NUM_ROW) * (2 * N_TX);
+    __fp16 *Ptry2 =
+        y2 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX);
+    __fp16 *Ptry3 =
+        y3 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX);
+    __fp16 *Ptrx = l1_x + itr * (2 * N_TX);
+#else
     __fp16 *PtrG = l1_G + itr * (2 * N_TX * N_TX);
     __fp16 *PtrL = l1_L + itr * (2 * N_TX * N_TX);
     __fp16 *Ptry2 = y2 + itr * (2 * N_TX);
     __fp16 *Ptry3 = y3 + itr * (2 * N_TX);
     __fp16 *Ptrx = l1_x + itr * (2 * N_TX);
+#endif
+
 #ifdef VEC
-    mempool_hermitian_f8vecs(PtrH, PtrS, PtrG, N_RX, N_TX, ZF);
+    mempool_hermitian_f8vecs(PtrH, PtrS, PtrG, N_RX, N_TX, ZF, FOLD);
     mempool_MVP_conjtransp_f8vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
-    mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
+    mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, FOLD);
 #else
-    mempool_hermitian_f8s(PtrH, PtrS, PtrG, N_RX, N_TX, ZF);
+    mempool_hermitian_f8s(PtrH, PtrS, PtrG, N_RX, N_TX, ZF, FOLD);
     mempool_MVP_conjtransp_f8s(PtrH, Ptry, Ptry2, N_RX, N_TX);
-    mempool_cholesky_f16s(PtrG, PtrL, N_TX);
+    mempool_cholesky_f16s(PtrG, PtrL, N_TX, FOLD);
 #endif
     // 16b
-    mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
-    mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
+    mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD);
+    mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD);
   }
   mempool_barrier(num_cores);
+  uint32_t time_end = mempool_get_timer();
   mempool_stop_benchmark();
 #endif
 
@@ -145,6 +169,9 @@ int main() {
     }
   }
 #else
+  if (core_id == 0) {
+    printf("Runtime: %d\n", time_end - time_init);
+  }
   mempool_barrier(num_cores);
 #endif
 
diff --git a/software/apps/baremetal/mimo_mmse_q16/main.c b/software/apps/baremetal/mimo_mmse_q16/main.c
index c7dcda78d..24fd9e44d 100644
--- a/software/apps/baremetal/mimo_mmse_q16/main.c
+++ b/software/apps/baremetal/mimo_mmse_q16/main.c
@@ -64,8 +64,8 @@ int main() {
     mempool_MVP_conjtransp_q16vecs((v2s *)l1_H, (v2s *)l1_y, (v2s *)y2, N_RX,
                                    N_TX, 0);
     mempool_cholesky_q16vecs(l1_G, l1_L, N_TX);
-    mempool_Ltrisol_q16vecs(l1_L, y2, y3, N_TX);
-    mempool_Lttrisol_q16vecs(l1_L, y3, l1_x, N_TX);
+    mempool_Ltrisol_q16vecs(l1_L, y2, y3, N_TX, 0);
+    mempool_Ltrisol_q16vecs(l1_L, y3, l1_x, N_TX, 1);
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
@@ -92,8 +92,8 @@ int main() {
     mempool_MVP_conjtransp_q16vecs((v2s *)PtrH, (v2s *)Ptry, (v2s *)Ptry2, N_RX,
                                    N_TX, 0);
     mempool_cholesky_q16vecs(PtrG, PtrL, N_TX);
-    mempool_Ltrisol_q16vecs(PtrL, Ptry2, Ptry3, N_TX);
-    mempool_Lttrisol_q16vecs(PtrL, Ptry3, Ptrx, N_TX);
+    mempool_Ltrisol_q16vecs(PtrL, Ptry2, Ptry3, N_TX, 0);
+    mempool_Ltrisol_q16vecs(PtrL, Ptry3, Ptrx, N_TX, 1);
   }
   mempool_log_barrier(2, core_id);
   mempool_stop_benchmark();
diff --git a/software/data/data_mimo_mmse_f32.h.tpl b/software/data/data_mimo_mmse_f32.h.tpl
index 858deb254..c7bed1889 100644
--- a/software/data/data_mimo_mmse_f32.h.tpl
+++ b/software/data/data_mimo_mmse_f32.h.tpl
@@ -27,7 +27,7 @@ float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_G[${2 * N_tx
 
 float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_y[${2 * N_rx * N_itr}] = ${array_to_cstr(y)};
 
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Sigma[${N_tx * N_itr}] = ${array_to_cstr(N)};
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_S[${2 * N_tx * N_itr}] = ${array_to_cstr(N)};
 
 // Outputs
 
diff --git a/software/data/generate_mimo_mmse.py b/software/data/generate_mimo_mmse.py
index e4dd80932..f8918f561 100644
--- a/software/data/generate_mimo_mmse.py
+++ b/software/data/generate_mimo_mmse.py
@@ -52,13 +52,15 @@ def generate_fmmse(N_tx, N_rx, N_itr, my_type):
         G = np.matmul(H_h, H) + N * np.eye(H.shape[1])
         N = N * np.ones(N_tx)
 
-         # Cholesky decomposition
+        # Cholesky decomposition
         L = np.linalg.cholesky(G)
         # Linear system solution
         y1 = np.transpose(np.dot(H_h, y))
         y2 = solve_triangular(L, y1, lower=True)
         x = solve_triangular(np.asmatrix(L).H, y2)
 
+        H = np.reshape(np.asarray(H), (N_tx * N_rx), order='C')
+        G = np.reshape(np.asarray(G), (N_tx * N_tx), order='C')
         N = np.column_stack((N.real, N.imag)).astype(my_type).flatten()
         H = np.column_stack((H.real, H.imag)).astype(my_type).flatten()
         G = np.column_stack((G.real, G.imag)).astype(my_type).flatten()
diff --git a/software/kernels/baremetal/mempool_checks.h b/software/kernels/baremetal/mempool_checks.h
index d680764c1..700354436 100644
--- a/software/kernels/baremetal/mempool_checks.h
+++ b/software/kernels/baremetal/mempool_checks.h
@@ -15,17 +15,19 @@
 void mempool_check_q32(int32_t *__restrict__ pRes, int32_t *__restrict__ pExp,
                        uint32_t NEL, int32_t TOL, bool verbose) {
   uint32_t core_id = mempool_get_core_id();
-  int32_t error;
+
   if (core_id == 0) {
+
     uint32_t ERRORS = 0;
     for (uint32_t i = 0; i < NEL; i++) {
       int32_t exp = pExp[i];
       int32_t res = pRes[i];
-      error = exp - res;
-      bool print = ((error > TOL) || (error < (-TOL))) || verbose;
+      int32_t diff = exp - res;
+      uint32_t error = ((diff > TOL) || (diff < (-TOL))) ? 1 : 0;
+      uint32_t print = error || verbose;
+      ERRORS += error;
       if (print) {
         printf("CHECK(%d): EXP = %08X - RESP = %08X\n", i, exp, res);
-        ERRORS++;
       }
     }
     printf("%d ERRORS out of %d CHECKS\n", ERRORS, NEL);
@@ -44,17 +46,19 @@ void mempool_check_q32(int32_t *__restrict__ pRes, int32_t *__restrict__ pExp,
 void mempool_check_q16(int16_t *__restrict__ pRes, int16_t *__restrict__ pExp,
                        uint32_t NEL, int16_t TOL, bool verbose) {
   uint32_t core_id = mempool_get_core_id();
-  int16_t error;
+
   if (core_id == 0) {
+
     uint32_t ERRORS = 0;
     for (uint32_t i = 0; i < NEL; i++) {
       int16_t exp = (int16_t)pExp[i];
       int16_t res = (int16_t)pRes[i];
-      error = (int16_t)(exp - res);
-      bool print = ((error > TOL) || (error < (-TOL))) | verbose;
+      int16_t diff = (int16_t)(exp - res);
+      uint32_t error = ((diff > TOL) || (diff < (-TOL))) ? 1 : 0;
+      uint32_t print = error || verbose;
+      ERRORS += error;
       if (print) {
         printf("CHECK(%d): EXP = %08X - RESP = %08X\n", i, exp, res);
-        ERRORS++;
       }
     }
     printf("%d ERRORS out of %d CHECKS\n", ERRORS, NEL);
@@ -74,20 +78,24 @@ void mempool_check_q16(int16_t *__restrict__ pRes, int16_t *__restrict__ pExp,
 void mempool_check_f32(float *__restrict__ pRes, float *__restrict__ pExp,
                        uint32_t NEL, float TOL, bool verbose) {
   uint32_t core_id = mempool_get_core_id();
-  float error = 0.0f;
+
   if (core_id == 0) {
+
     uint32_t ERRORS = 0;
     for (uint32_t i = 0; i < NEL; i++) {
       float exp = pExp[i];
       float res = pRes[i];
-      asm volatile("fsub.s %[error], %[res], %[exp];"
-                   : [error] "+&r"(error)
+      float diff;
+      asm volatile("fsub.s %[diff], %[res], %[exp];"
+                   : [diff] "+&r"(diff)
                    : [res] "r"(res), [exp] "r"(exp)
                    :);
-      bool print = ((error > TOL) || (error < (-TOL))) || verbose;
+      uint32_t error = ((diff > TOL) || (diff < (-TOL))) ? 1 : 0;
+      uint32_t print = error || verbose;
+      ERRORS += error;
       if (print) {
-        printf("CHECK(%d): EXP = %08X - RESP = %08X\n", i, exp, res);
-        ERRORS++;
+        printf("CHECK(%d): EXP = %08X - RESP = %08X\n", i, *(int32_t *)&exp,
+               *(int32_t *)&res);
       }
     }
     printf("%d ERRORS out of %d CHECKS\n", ERRORS, NEL);
@@ -106,22 +114,25 @@ void mempool_check_f32(float *__restrict__ pRes, float *__restrict__ pExp,
 void mempool_check_f16(__fp16 *__restrict__ pRes, __fp16 *__restrict__ pExp,
                        uint32_t NEL, float TOL, bool verbose) {
   uint32_t core_id = mempool_get_core_id();
-  float error;
+
   if (core_id == 0) {
     uint32_t ERRORS = 0;
     for (uint32_t i = 0; i < NEL; i++) {
       __fp16 exp = pExp[i];
       __fp16 res = pRes[i];
-      asm volatile("fsub.h %[error], %[res], %[exp];"
-                   "fcvt.s.h %[error], %[error];"
-                   : [error] "+&r"(error)
+      float diff;
+      asm volatile("fsub.h %[diff], %[res], %[exp];"
+                   "fcvt.s.h %[diff], %[diff];"
+                   : [diff] "+&r"(diff)
                    : [res] "r"(res), [exp] "r"(exp)
                    :);
-      bool print = ((error > TOL) || (error < (-TOL))) || verbose;
+
+      uint32_t error = ((diff > TOL) || (diff < (-TOL))) ? 1 : 0;
+      uint32_t print = error || verbose;
+      ERRORS += error;
       if (print) {
         printf("CHECK(%d): EXP = %08X - RESP = %08X\n", i, *(int32_t *)&exp,
                *(int32_t *)&res);
-        ERRORS++;
       }
     }
     printf("%d ERRORS out of %d CHECKS\n", ERRORS, NEL);
diff --git a/software/kernels/baremetal/mempool_cholesky_f16s.h b/software/kernels/baremetal/mempool_cholesky_f16s.h
index 56483490e..121267545 100644
--- a/software/kernels/baremetal/mempool_cholesky_f16s.h
+++ b/software/kernels/baremetal/mempool_cholesky_f16s.h
@@ -16,9 +16,11 @@
   @param[in]     pSrc  points to input matrix
   @param[in]     pL points to output lower triangular matrix
   @param[in]     n dimension of the input data
+  @param[in]     folded matrices are folded row-wise in memory
   @return        none
 */
-void mempool_cholesky_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n) {
+void mempool_cholesky_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n,
+                           const uint32_t folded) {
 
   __fp16 sum;
   __fp16 a, b;
@@ -27,91 +29,15 @@ void mempool_cholesky_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n) {
   __fp16 ap, bp; // Pivot element
   __fp16 as, bs; // Sum element
   uint32_t i, j, k;
+  const uint32_t offset = folded ? N_BANKS : n;
 
   for (j = 0; j < n; j++) {
     // Elements on diagonal (input matrix is positive-definite)
-    ap = pSrc[2U * (j * n + j)];
-    sum = (__fp16)0.0f;
-    for (k = 0; k < j; k++) {
-      a = pL[2U * (j * n + k)];
-      b = pL[2U * (j * n + k) + 1];
-      asm volatile("fmadd.h %[sum], %[a], %[a], %[sum];"
-                   "fmadd.h %[sum], %[b], %[b], %[sum];"
-                   : [sum] "+&r"(sum)
-                   : [a] "r"(a), [b] "r"(b)
-                   :);
-    }
-    asm volatile("fsub.h %[ap], %[ap], %[sum];"
-                 "fsqrt.h  %[ap], %[ap];"
-                 : [ap] "+&r"(ap)
-                 : [sum] "r"(sum)
-                 :);
-    pL[2U * (j * n + j)] = ap;
-
-    // Elements on rows
-    for (i = j + 1; i < n; i++) {
-      // Pivot
-      ap = pSrc[2U * (i * n + j)];
-      bp = pSrc[2U * (i * n + j) + 1];
-      // Diag
-      diag = pL[2U * (j * n + j)];
-      // Sum -> s = s + (ac + bd) + j*(bc - ad)
-      as = (__fp16)0.0f;
-      bs = (__fp16)0.0f;
-      for (k = 0; k < j; k++) {
-        a = pL[2U * (i * n + k)];
-        b = pL[2U * (i * n + k) + 1];
-        c = pL[2U * (j * n + k)];
-        d = pL[2U * (j * n + k) + 1];
-        asm volatile("fmadd.h %[as], %[a], %[c], %[as];"
-                     "fmadd.h %[as], %[b], %[d], %[as];"
-                     "fmadd.h %[bs], %[b], %[c], %[bs];"
-                     "fnmsub.h %[bs], %[a], %[d], %[bs];"
-                     : [as] "+&r"(as), [bs] "+&r"(bs)
-                     : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
-                     :);
-      }
-      asm volatile("fsub.h %[ap], %[ap], %[as];"
-                   "fsub.h %[bp], %[bp], %[bs];"
-                   "fdiv.h %[ap], %[ap], %[diag];"
-                   "fdiv.h %[bp], %[bp], %[diag];"
-                   : [ap] "+&r"(ap), [bp] "+&r"(bp)
-                   : [diag] "r"(diag), [as] "r"(as), [bs] "r"(bs)
-                   :);
-      pL[2U * (i * n + j)] = ap;
-      pL[2U * (i * n + j) + 1] = bp;
-    }
-  }
-  return;
-}
-
-/**
-  @brief         Cholesky decomposition with Crout algorithm.
-  Output data is folded to the core's local memory.
-  @param[in]     pSrc  points to input matrix
-  @param[in]     pL points to output lower triangular matrix
-  @param[in]     n dimension of the input data
-  @return        none
-*/
-void mempool_cholesky_folded_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n) {
-
-  register __fp16 sum;
-  __fp16 a, b;
-  __fp16 c, d;
-  __fp16 diag;   // Diagonal element
-  __fp16 ap, bp; // Pivot element
-  __fp16 as, bs; // Sum element
-
-  uint32_t i, j, k;
-
-  for (j = 0; j < n; j++) {
-
-    // Elements on diagonal (input matrix is positive-definite)
-    ap = pSrc[2U * (j * N_BANKS + j)];
+    ap = pSrc[2U * (j * offset + j)];
     sum = (__fp16)0.0f;
     for (k = 0; k < j; k++) {
-      a = pL[2U * (j * N_BANKS + k)];
-      b = pL[2U * (j * N_BANKS + k) + 1];
+      a = pL[2U * (j * offset + k)];
+      b = pL[2U * (j * offset + k) + 1];
       asm volatile("fmadd.h %[sum], %[a], %[a], %[sum];"
                    "fmadd.h %[sum], %[b], %[b], %[sum];"
                    : [sum] "+&r"(sum)
@@ -123,23 +49,23 @@ void mempool_cholesky_folded_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n) {
                  : [ap] "+&r"(ap)
                  : [sum] "r"(sum)
                  :);
-    pL[2U * (j * N_BANKS + j)] = ap;
+    pL[2U * (j * offset + j)] = ap;
 
     // Elements on rows
     for (i = j + 1; i < n; i++) {
       // Pivot
-      ap = pSrc[2U * (i * N_BANKS + j)];
-      bp = pSrc[2U * (i * N_BANKS + j) + 1];
+      ap = pSrc[2U * (i * offset + j)];
+      bp = pSrc[2U * (i * offset + j) + 1];
       // Diag
-      diag = pL[2U * (j * N_BANKS + j)];
+      diag = pL[2U * (j * offset + j)];
       // Sum -> s = s + (ac + bd) + j*(bc - ad)
       as = (__fp16)0.0f;
       bs = (__fp16)0.0f;
       for (k = 0; k < j; k++) {
-        a = pL[2U * (i * N_BANKS + k)];
-        b = pL[2U * (i * N_BANKS + k) + 1];
-        c = pL[2U * (j * N_BANKS + k)];
-        d = pL[2U * (j * N_BANKS + k) + 1];
+        a = pL[2U * (i * offset + k)];
+        b = pL[2U * (i * offset + k) + 1];
+        c = pL[2U * (j * offset + k)];
+        d = pL[2U * (j * offset + k) + 1];
         asm volatile("fmadd.h %[as], %[a], %[c], %[as];"
                      "fmadd.h %[as], %[b], %[d], %[as];"
                      "fmadd.h %[bs], %[b], %[c], %[bs];"
@@ -155,8 +81,8 @@ void mempool_cholesky_folded_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n) {
                    : [ap] "+&r"(ap), [bp] "+&r"(bp)
                    : [diag] "r"(diag), [as] "r"(as), [bs] "r"(bs)
                    :);
-      pL[2U * (i * N_BANKS + j)] = ap;
-      pL[2U * (i * N_BANKS + j) + 1] = bp;
+      pL[2U * (i * offset + j)] = ap;
+      pL[2U * (i * offset + j) + 1] = bp;
     }
   }
   return;
@@ -170,55 +96,64 @@ void mempool_cholesky_folded_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n) {
   @param[in]     n dimension of the input data
   @return        none
 */
-void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) {
+void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n,
+                              const uint32_t folded) {
 
   __fp16 diag;
   v2h apbp, dgdg;
   v2h ab, cd;
-
   uint32_t i, j, k;
+  const uint32_t offset = folded ? N_BANKS : n;
+
   for (j = 0; j < n; j++) {
 
     // Elements on diagonal (input matrix is positive-definite)
-    __fp16 ap = pSrc[2U * (j * n + j)];
+    __fp16 ap = pSrc[2U * (j * offset + j)];
     float sum = 0.0f;
     for (k = 0; k < j; k++) {
-      ab = (*(v2h *)&pL[2U * (j * n + k)]);
+      ab = (*(v2h *)&pL[2U * (j * offset + k)]);
       asm volatile("vfdotpex.s.h %0, %1, %1;" : "+&r"(sum) : "r"(ab) :);
     }
     asm volatile("fcvt.h.s %0, %0;" : "+&r"(sum) :);
     asm volatile("fsub.h   %0, %0, %1;" : "+&r"(ap) : "r"(sum));
     asm volatile("fsqrt.h  %0, %0;" : "+&r"(ap) :);
-    pL[2U * (j * n + j)] = ap;
+    pL[2U * (j * offset + j)] = ap;
 
     // Elements on rows
-    uint32_t volatile __srt_iloop__ = (j + 1);
-    uint32_t volatile __end_kloop__ = j;
+    uint32_t __srt_iloop__ = (j + 1);
+    uint32_t __end_kloop__ = j;
 #ifdef __CDOTP
     for (i = __srt_iloop__; i < n; i++) {
-      apbp = (*(v2h *)&pSrc[2U * (i * n + j)]); // Pivot
-      diag = pL[2U * (j * n + j)];              // Diag
+      apbp = (*(v2h *)&pSrc[2U * (i * offset + j)]); // Pivot
+      diag = pL[2U * (j * offset + j)];              // Diag
+      v2h asbs = (v2h)0.0f;
+      float as = 0.0f;
+      float bs = 0.0f;
       for (k = 0; k < __end_kloop__; k++) {
-        ab = (*(v2h *)&pL[2U * (i * n + k)]);
-        cd = (*(v2h *)&pL[2U * (j * n + k)]);
-        asm volatile("fcndotpex.s.h  %0, %1, %2;"
-                     : "+&r"(apbp)
-                     : "r"(ab), "r"(cd));
+        ab = (*(v2h *)&pL[2U * (i * offset + k)]);
+        cd = (*(v2h *)&pL[2U * (j * offset + k)]);
+        asm volatile("fccdotpex.s.h  %0, %1, %2;"
+                     : "+&r"(asbs)
+                     : "r"(cd), "r"(ab));
       }
+      asm volatile("pv.shuffle2.h %0, %0, %[mask];"
+                   : "+&r"(asbs)
+                   : [mask] "r"(0x00020003));
       asm volatile("pv.pack %0, %1, %1;" : "+&r"(dgdg) : "r"(diag));
+      asm volatile("vfsub.h %0, %0, %1;" : "+&r"(apbp) : "r"(asbs));
       asm volatile("vfdiv.h %0, %0, %1;" : "+&r"(apbp) : "r"(dgdg));
-      (*(v2h *)&pL[2U * (i * n + j)]) = apbp;
+      (*(v2h *)&pL[2U * (i * offset + j)]) = apbp;
     }
 #else
     for (i = __srt_iloop__; i < n; i++) {
-      apbp = (*(v2h *)&pSrc[2U * (i * n + j)]); // Pivot
-      diag = pL[2U * (j * n + j)];              // Diag
+      apbp = (*(v2h *)&pSrc[2U * (i * offset + j)]); // Pivot
+      diag = pL[2U * (j * offset + j)];              // Diag
       float as = 0.0f;
       float bs = 0.0f;
       v2h asbs;
       for (k = 0; k < __end_kloop__; k++) {
-        ab = (*(v2h *)&pL[2U * (i * n + k)]);
-        cd = (*(v2h *)&pL[2U * (j * n + k)]);
+        ab = (*(v2h *)&pL[2U * (i * offset + k)]);
+        cd = (*(v2h *)&pL[2U * (j * offset + k)]);
         v2h ndc;
         const uint32_t neg_mask = 0x00008000;
         const uint32_t shuffle_mask = 0x00020003;
@@ -237,7 +172,7 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) {
       asm volatile("pv.pack    %0, %1, %1;" : "+&r"(dgdg) : "r"(diag));
       asm volatile("vfsub.h    %0, %0, %1;" : "+&r"(apbp) : "r"(asbs));
       asm volatile("vfdiv.h    %0, %0, %1;" : "+&r"(apbp) : "r"(dgdg));
-      (*(v2h *)&pL[2U * (i * n + j)]) = apbp;
+      (*(v2h *)&pL[2U * (i * offset + j)]) = apbp;
     }
 #endif
   }
@@ -430,7 +365,8 @@ void mempool_cholesky_f16vecs_unroll4(__fp16 *pSrc, __fp16 *pL,
   @param[in]     n dimension of the input data
   @return        none
 */
-void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) {
+void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n,
+                              const uint32_t folded) {
 
   float sum;            // register float sum
   float as, bs;         // real and imaginary sums
@@ -444,14 +380,15 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) {
   v2h ab, cd, ndc;
 
   uint32_t i, j, k;
+  const uint32_t offset = folded ? N_BANKS : n;
 
   for (j = 0; j < n; j++) {
 
     // Elements on diagonal (input matrix is positive-definite)
-    ap = pSrc[2U * (j * n + j)];
+    ap = pSrc[2U * (j * offset + j)];
     asm volatile("fcvt.s.h %0, %1;" : "=r"(sum) : "r"(ap) :);
     for (k = 0; k < j; k++) {
-      ab = (*(v2h *)&pL[2U * (j * n + k)]);
+      ab = (*(v2h *)&pL[2U * (j * offset + k)]);
       asm volatile("vfndotpex.s.h %[sum], %[ab], %[ab];"
                    : [sum] "+&r"(sum)
                    : [ab] "r"(ab)
@@ -459,15 +396,15 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) {
     }
     sum = (float)sqrt(sum);
     asm volatile("fcvt.h.s %0, %1;" : "=r"(ap) : "r"(sum) :);
-    pL[2U * (j * n + j)] = ap;
+    pL[2U * (j * offset + j)] = ap;
 
     // Elements on rows
     for (i = j + 1; i < n; i++) {
       // Pivot
-      ap = pSrc[2U * (i * n + j)];
-      bp = pSrc[2U * (i * n + j + 1)];
+      ap = pSrc[2U * (i * offset + j)];
+      bp = pSrc[2U * (i * offset + j + 1)];
       // Diag
-      diag = pL[2U * (j * n + j)];
+      diag = pL[2U * (j * offset + j)];
 
       // Sum -> s = s + (ac + bd) + j*(bc - ad)
       as = (float)0.0f;
@@ -476,8 +413,8 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) {
       asm volatile("fcvt.s.h %0, %1;" : "=r"(bp_f32) : "r"(bp) :);
       asm volatile("fcvt.s.h %0, %1;" : "=r"(diag_f32) : "r"(diag) :);
       for (k = 0; k < j; k++) {
-        ab = (*(v2h *)&pL[2U * (i * n + k)]);
-        cd = (*(v2h *)&pL[2U * (j * n + k)]);
+        ab = (*(v2h *)&pL[2U * (i * offset + k)]);
+        cd = (*(v2h *)&pL[2U * (j * offset + k)]);
         const uint32_t neg_mask = 0x00008000;
         const uint32_t shuffle_mask = 0x00020003;
         asm volatile(
@@ -494,7 +431,7 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n) {
       as = (ap_f32 - as) / diag_f32;
       bs = (bp_f32 - bs) / diag_f32;
       asm volatile("vfcpka.h.s %0, %1, %2;" : "=r"(asbs) : "r"(as), "r"(bs) :);
-      (*(v2h *)&pL[2U * (i * n + j)]) = asbs;
+      (*(v2h *)&pL[2U * (i * offset + j)]) = asbs;
     }
   }
   return;
diff --git a/software/kernels/baremetal/mempool_cholesky_f32s.h b/software/kernels/baremetal/mempool_cholesky_f32s.h
index 2bad891f5..63fd878dc 100644
--- a/software/kernels/baremetal/mempool_cholesky_f32s.h
+++ b/software/kernels/baremetal/mempool_cholesky_f32s.h
@@ -16,7 +16,8 @@
   @param[in]     n dimension of the input data
   @return        none
 */
-void mempool_cholesky_f32s(float *pSrc, float *pL, const uint32_t n) {
+void mempool_cholesky_f32s(float *pSrc, float *pL, const uint32_t n,
+                           const uint32_t folded) {
 
   register float sum;
   float a, b;
@@ -24,17 +25,17 @@ void mempool_cholesky_f32s(float *pSrc, float *pL, const uint32_t n) {
   float diag;   // Diagonal element
   float ap, bp; // Pivot element
   float as, bs; // Sum element
-
   uint32_t i, j, k;
+  const uint32_t offset = folded ? N_BANKS : n;
 
   for (j = 0; j < n; j++) {
 
     // Elements on diagonal (input matrix is positive-definite)
-    ap = pSrc[2U * (j * n + j)];
+    ap = pSrc[2U * (j * offset + j)];
     sum = 0.0f;
     for (k = 0; k < j; k++) {
-      a = pL[2U * (j * n + k)];
-      b = pL[2U * (j * n + k) + 1];
+      a = pL[2U * (j * offset + k)];
+      b = pL[2U * (j * offset + k) + 1];
       asm volatile("fmadd.s %[sum], %[a], %[a], %[sum];"
                    "fmadd.s %[sum], %[b], %[b], %[sum];"
                    : [sum] "+&r"(sum)
@@ -51,98 +52,18 @@ void mempool_cholesky_f32s(float *pSrc, float *pL, const uint32_t n) {
     // Elements on rows
     for (i = j + 1; i < n; i++) {
       // Pivot
-      ap = pSrc[2U * (i * n + j)];
-      bp = pSrc[2U * (i * n + j) + 1];
-      // Diag
-      diag = pL[2U * (j * n + j)];
-      // Sum -> s = s + (ac + bd) + j*(bc - ad)
-      as = 0.0f;
-      bs = 0.0f;
-      for (k = 0; k < j; k++) {
-        a = pL[2U * (i * n + k)];
-        b = pL[2U * (i * n + k) + 1];
-        c = pL[2U * (j * n + k)];
-        d = pL[2U * (j * n + k) + 1];
-        asm volatile("fmadd.s %[as], %[a], %[c], %[as];"
-                     "fmadd.s %[as], %[b], %[d], %[as];"
-                     "fmadd.s %[bs], %[b], %[c], %[bs];"
-                     "fnmsub.s %[bs], %[a], %[d], %[bs];"
-                     : [as] "+&r"(as), [bs] "+&r"(bs)
-                     : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
-                     :);
-      }
-      asm volatile("fsub.s %[ap], %[ap], %[as];"
-                   "fsub.s %[bp], %[bp], %[bs];"
-                   "fdiv.s %[ap], %[ap], %[diag];"
-                   "fdiv.s %[bp], %[bp], %[diag];"
-                   : [ap] "+&r"(ap), [bp] "+&r"(bp)
-                   : [diag] "r"(diag), [as] "r"(as), [bs] "r"(bs)
-                   :);
-      pL[2U * (i * n + j)] = ap;
-      pL[2U * (i * n + j) + 1] = bp;
-    }
-  }
-  return;
-}
-
-void mempool_cholesky_folded_f32s(float *pSrc, float *pL, const uint32_t n) {
-
-  register float sum;
-
-  // first matrix row:
-  //           item[0-2] item[1-3]
-  // memrow[0]    x x       x x
-  // memrow[1]    x x       x x
-  // second matrix row:
-  // memrow[2]    x x       x x
-  // memrow[3]    x x       x x
-  // third matrix row:
-  // memrow[4]    x x       x x
-  // memrow[5]    x x       x x
-  // i * memrow_xrow * N_BANKS + (j / local_items) * N_BANKS + j % local_items
-
-  float a, b;
-  float c, d;
-  float diag;   // Diagonal element
-  float ap, bp; // Pivot element
-  float as, bs; // Sum element
-  uint32_t banks_per_row = (n / 2) * N_BANKS;
-
-  uint32_t i, j, k;
-  for (j = 0; j < n; j++) {
-    // Elements on diagonal (input matrix is positive-definite)
-    ap = pSrc[j * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2)];
-    sum = 0.0f;
-    for (k = 0; k < j; k++) {
-      a = pL[j * banks_per_row + (k / 2) * N_BANKS + 2 * (k % 2)];
-      b = pL[j * banks_per_row + (k / 2) * N_BANKS + 2 * (k % 2) + 1];
-      asm volatile("fmadd.s %[sum], %[a], %[a], %[sum];"
-                   "fmadd.s %[sum], %[b], %[b], %[sum];"
-                   : [sum] "+&r"(sum)
-                   : [a] "r"(a), [b] "r"(b)
-                   :);
-    }
-    asm volatile("fsub.s %[ap], %[ap], %[sum];"
-                 "fsqrt.s  %[ap], %[ap];"
-                 : [ap] "+&r"(ap)
-                 : [sum] "r"(sum)
-                 :);
-    pL[j * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2)] = ap;
-    // Elements on rows
-    for (i = j + 1; i < n; i++) {
-      // Pivot
-      ap = pSrc[i * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2)];
-      bp = pSrc[i * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2) + 1];
+      ap = pSrc[2U * (i * offset + j)];
+      bp = pSrc[2U * (i * offset + j) + 1];
       // Diag
-      diag = pL[j * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2)];
+      diag = pL[2U * (j * offset + j)];
       // Sum -> s = s + (ac + bd) + j*(bc - ad)
       as = 0.0f;
       bs = 0.0f;
       for (k = 0; k < j; k++) {
-        a = pL[i * banks_per_row + (k / 2) * N_BANKS + 2 * (k % 2)];
-        b = pL[i * banks_per_row + (k / 2) * N_BANKS + 2 * (k % 2) + 1];
-        c = pL[j * banks_per_row + (k / 2) * N_BANKS + 2 * (k % 2)];
-        d = pL[j * banks_per_row + (k / 2) * N_BANKS + 2 * (k % 2) + 1];
+        a = pL[2U * (i * offset + k)];
+        b = pL[2U * (i * offset + k) + 1];
+        c = pL[2U * (j * offset + k)];
+        d = pL[2U * (j * offset + k) + 1];
         asm volatile("fmadd.s %[as], %[a], %[c], %[as];"
                      "fmadd.s %[as], %[b], %[d], %[as];"
                      "fmadd.s %[bs], %[b], %[c], %[bs];"
@@ -158,8 +79,8 @@ void mempool_cholesky_folded_f32s(float *pSrc, float *pL, const uint32_t n) {
                    : [ap] "+&r"(ap), [bp] "+&r"(bp)
                    : [diag] "r"(diag), [as] "r"(as), [bs] "r"(bs)
                    :);
-      pL[i * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2)] = ap;
-      pL[i * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2) + 1] = bp;
+      pL[2U * (i * offset + j)] = ap;
+      pL[2U * (i * offset + j) + 1] = bp;
     }
   }
   return;
diff --git a/software/kernels/baremetal/mempool_linearsolver_f16s.h b/software/kernels/baremetal/mempool_linearsolver_f16s.h
index 32ab942ca..c4e134527 100644
--- a/software/kernels/baremetal/mempool_linearsolver_f16s.h
+++ b/software/kernels/baremetal/mempool_linearsolver_f16s.h
@@ -15,10 +15,13 @@
   @param[in]     in known variables vector
   @param[in]     x unknown solutions vector
   @param[in]     n dimension of the system
+  @param[in]     transposed solve transposed system
+  @param[in]     folded matrix L is folded row-wise in memory
   @return        none
 */
 
-void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n) {
+void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n,
+                          const uint32_t transposed, const uint32_t folded) {
 
   uint32_t i, j;
   __fp16 a, b;
@@ -27,77 +30,41 @@ void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n) {
   __fp16 as, bs;
   __fp16 ax, bx;
   __fp16 diag;
+  const uint32_t offset = folded ? N_BANKS : n;
 
   // Solve for each variable x_i in turn
   for (i = 0; i < n; i++) {
-    diag = pL[2 * (i * n + i)];
-    as = (__fp16)in[2U * i];
-    bs = (__fp16)in[2U * i + 1];
+    uint32_t ridx = transposed ? (n - i - 1) : i;
+    diag = pL[2U * (ridx * offset + ridx)];
+    as = (__fp16)in[2U * ridx];
+    bs = (__fp16)in[2U * ridx + 1];
     // Use the previously solved variables to calculate the sum
     for (j = 0; j < i; j++) {
-      a = pL[2U * (i * n + j)];
-      b = pL[2U * (i * n + j) + 1];
-      c = x[2U * j];
-      d = x[2U * j + 1];
-      asm volatile("fnmsub.h  %[as], %[a], %[c], %[as];"
-                   "fnmsub.h  %[bs], %[a], %[d], %[bs];"
-                   "fmadd.h   %[as], %[b], %[d], %[as];"
-                   "fnmsub.h  %[bs], %[b], %[c], %[bs];"
-                   : [as] "+&r"(as), [bs] "+&r"(bs)
-                   : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
-                   :);
-    }
-    // Subtract the sum from b_i and divide by the diagonal element L[i][i]
-    asm volatile("fdiv.h %[ax], %[as], %[diag];"
-                 "fdiv.h %[bx], %[bs], %[diag];"
-                 : [ax] "+&r"(ax), [bx] "+&r"(bx)
-                 : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag)
-                 :);
-    x[2U * i] = ax;
-    x[2U * i + 1] = bx;
-  }
-  return;
-}
-
-/**
-  @brief         Single-core solution of upper triangular system
-                 (from transposed lower triangular system)
-  @param[in]     pL input triangular matrix to be transposed
-  @param[in]     in known variables vector
-  @param[in]     x unknown solutions vector
-  @param[in]     n dimension of the system
-  @return        none
-*/
-
-void mempool_Lttrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x,
-                           const uint32_t n) {
-
-  uint32_t i, j;
-  __fp16 a, b;
-  __fp16 c, d;
 
-  __fp16 as, bs;
-  __fp16 ax, bx;
-  __fp16 diag;
-
-  // Solve for each variable x_i in turn
-  for (i = 0; i < n; i++) {
-    diag = pL[2 * ((n - 1 - i) * n + (n - 1 - i))];
-    as = (__fp16)in[2 * (n - i - 1)];
-    bs = (__fp16)in[2 * (n - i - 1) + 1];
-    // Use the previously solved variables to calculate the sum
-    for (j = 0; j < i; j++) {
-      a = pL[2U * ((n - 1 - j) * n + (n - 1 - i))];
-      b = pL[2U * ((n - 1 - j) * n + (n - 1 - i)) + 1];
-      c = x[2U * (n - 1 - j)];
-      d = x[2U * (n - 1 - j) + 1];
-      asm volatile("fnmsub.h  %[as], %[a], %[c], %[as];"
-                   "fnmsub.h  %[as], %[b], %[d], %[as];"
-                   "fnmsub.h  %[bs], %[a], %[d], %[bs];"
-                   "fmadd.h   %[bs], %[b], %[c], %[bs];"
-                   : [as] "+&r"(as), [bs] "+&r"(bs)
-                   : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
-                   :);
+      uint32_t cidx = transposed ? (n - j - 1) : j;
+      c = x[2U * cidx];
+      d = x[2U * cidx + 1];
+      if (transposed) {
+        a = pL[2U * (cidx * offset + ridx)];
+        b = pL[2U * (cidx * offset + ridx) + 1];
+        asm volatile("fnmsub.h  %[as], %[a], %[c], %[as];"
+                     "fnmsub.h  %[as], %[b], %[d], %[as];"
+                     "fnmsub.h  %[bs], %[a], %[d], %[bs];"
+                     "fmadd.h   %[bs], %[b], %[c], %[bs];"
+                     : [as] "+&r"(as), [bs] "+&r"(bs)
+                     : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
+                     :);
+      } else {
+        a = pL[2U * (ridx * offset + cidx)];
+        b = pL[2U * (ridx * offset + cidx) + 1];
+        asm volatile("fnmsub.h  %[as], %[a], %[c], %[as];"
+                     "fnmsub.h  %[bs], %[a], %[d], %[bs];"
+                     "fmadd.h   %[as], %[b], %[d], %[as];"
+                     "fnmsub.h  %[bs], %[b], %[c], %[bs];"
+                     : [as] "+&r"(as), [bs] "+&r"(bs)
+                     : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
+                     :);
+      }
     }
     // Subtract the sum from b_i and divide by the diagonal element L[i][i]
     asm volatile("fdiv.h %[ax], %[as], %[diag];"
@@ -105,115 +72,8 @@ void mempool_Lttrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x,
                  : [ax] "+&r"(ax), [bx] "+&r"(bx)
                  : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag)
                  :);
-    x[2U * (n - i - 1)] = ax;
-    x[2U * (n - i - 1) + 1] = bx;
-  }
-  return;
-}
-
-/**
-  @brief        Single-core solution of lower triangular system
-                (from transposed lower triangular system)
-  @param[in]    pL input triangular matrix
-  @param[in]    in known variables vector
-  @param[in]    x unknown solutions vector
-  @param[in]    n dimension of the system
-  @return       none
-*/
-
-void mempool_Ltrisol_folded_f16s(__fp16 *pL, __fp16 *in, __fp16 *x,
-                                 const uint32_t n) {
-
-  uint32_t i, j;
-  __fp16 a, b;
-  __fp16 c, d;
-
-  __fp16 as, bs;
-  __fp16 ax, bx;
-  __fp16 diag;
-
-  // Solve for each variable x_i in turn
-  for (i = 0; i < n; i++) {
-    diag = pL[i * N_BANKS + 2 * i];
-    as = (__fp16)in[2U * i];
-    bs = (__fp16)in[2U * i + 1];
-    // Use the previously solved variables to calculate the sum
-    for (j = 0; j < i; j++) {
-      a = pL[i * N_BANKS + 2 * j];
-      b = pL[i * N_BANKS + 2 * j + 1];
-      c = x[2U * j];
-      d = x[2U * j + 1];
-      asm volatile("fnmsub.h  %[as], %[a], %[c], %[as];"
-                   "fnmsub.h  %[bs], %[a], %[d], %[bs];"
-                   "fmadd.h   %[as], %[b], %[d], %[as];"
-                   "fnmsub.h  %[bs], %[b], %[c], %[bs];"
-                   : [as] "+&r"(as), [bs] "+&r"(bs)
-                   : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
-                   :);
-    }
-    // Subtract the sum from b_i and divide by the diagonal element L[i][i]
-    asm volatile("fdiv.h %[ax], %[as], %[diag];"
-                 "fdiv.h %[bx], %[bs], %[diag];"
-                 : [ax] "+&r"(ax), [bx] "+&r"(bx)
-                 : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag)
-                 :);
-    x[2U * i] = ax;
-    x[2U * i + 1] = bx;
-  }
-  return;
-}
-
-/**
-  @brief        Single-core solution of upper triangular system
-                Output data is folded to the core's local memory.
-                (from transposed lower triangular system)
-  @param[in]    pL input triangular matrix to be transposed
-  @param[in]    in known variables vector
-  @param[in]    x unknown solutions vector
-  @param[in]    n dimension of the system
-  @return       none
-*/
-
-void mempool_Lttrisol_folded_f16s(__fp16 *pL, __fp16 *in, __fp16 *x,
-                                  const uint32_t n) {
-
-  uint32_t i, j;
-  __fp16 a, b;
-  __fp16 c, d;
-
-  __fp16 as, bs;
-  __fp16 ax, bx;
-  __fp16 diag;
-
-  // Solve for each variable x_i in turn
-  for (i = 0; i < n; i++) {
-    uint32_t rev_i = (n - 1 - i);
-    diag = pL[rev_i * N_BANKS + 2 * rev_i];
-    as = (__fp16)in[2 * rev_i];
-    bs = (__fp16)in[2 * rev_i + 1];
-    // Use the previously solved variables to calculate the sum
-    for (j = 0; j < i; j++) {
-      uint32_t rev_j = (n - 1 - j);
-      a = pL[rev_j * N_BANKS + 2 * rev_i];
-      b = pL[rev_j * N_BANKS + 2 * rev_i + 1];
-      c = x[2U * rev_j];
-      d = x[2U * rev_j + 1];
-      asm volatile("fnmsub.h  %[as], %[a], %[c], %[as];"
-                   "fnmsub.h  %[as], %[b], %[d], %[as];"
-                   "fnmsub.h  %[bs], %[a], %[d], %[bs];"
-                   "fmadd.h   %[bs], %[b], %[c], %[bs];"
-                   : [as] "+&r"(as), [bs] "+&r"(bs)
-                   : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
-                   :);
-    }
-    // Subtract the sum from b_i and divide by the diagonal element L[i][i]
-    asm volatile("fdiv.h %[ax], %[as], %[diag];"
-                 "fdiv.h %[bx], %[bs], %[diag];"
-                 : [ax] "+&r"(ax), [bx] "+&r"(bx)
-                 : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag)
-                 :);
-    x[2U * rev_i] = ax;
-    x[2U * rev_i + 1] = bx;
+    x[2U * ridx] = ax;
+    x[2U * ridx + 1] = bx;
   }
   return;
 }
@@ -229,7 +89,8 @@ void mempool_Lttrisol_folded_f16s(__fp16 *pL, __fp16 *in, __fp16 *x,
   @return        none
 */
 
-void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n) {
+void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n,
+                          const uint32_t transposed, const uint32_t folded) {
 
   uint32_t i, j;
   __fp16 a, b;
@@ -237,82 +98,44 @@ void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n) {
 
   __fp16 as, bs;
   __fp16 diag;
+  const uint32_t offset = folded ? N_BANKS : n;
 
   float ax, bx, diag_f32;
   v2h res;
 
   // Solve for each variable x_i in turn
   for (i = 0; i < n; i++) {
-    diag = pL[2 * (i * n + i)];
-    as = (__fp16)in[2U * i];
-    bs = (__fp16)in[2U * i + 1];
+    uint32_t ridx = transposed ? (n - i - 1) : i;
+    diag = pL[2U * (ridx * offset + ridx)];
+    as = (__fp16)in[2U * ridx];
+    bs = (__fp16)in[2U * ridx + 1];
     // Use the previously solved variables to calculate the sum
     for (j = 0; j < i; j++) {
-      a = pL[2U * (i * n + j)];
-      b = pL[2U * (i * n + j) + 1];
-      c = x[2U * j];
-      d = x[2U * j + 1];
-      asm volatile("fnmsub.h  %[as], %[a], %[c], %[as];"
-                   "fnmsub.h  %[bs], %[a], %[d], %[bs];"
-                   "fmadd.h   %[as], %[b], %[d], %[as];"
-                   "fnmsub.h  %[bs], %[b], %[c], %[bs];"
-                   : [as] "+&r"(as), [bs] "+&r"(bs)
-                   : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
-                   :);
-    }
-    // Subtract the sum from b_i and divide by the diagonal element L[i][i]
-    asm volatile("fcvt.s.h %0, %1;" : "=r"(diag_f32) : "r"(diag) :);
-    asm volatile("fcvt.s.h %0, %1;" : "=r"(ax) : "r"(as) :);
-    asm volatile("fcvt.s.h %0, %1;" : "=r"(bx) : "r"(bs) :);
-    ax = ax / diag_f32;
-    bx = bx / diag_f32;
-    asm volatile("vfcpka.h.s %0, %1, %2;" : "=r"(res) : "r"(ax), "r"(bx) :);
-    (*(v2h *)&x[2U * i]) = res;
-  }
-  return;
-}
-
-/**
-  @brief         Single-core solution of upper triangular system
-                 (from transposed lower triangular system)
-  @param[in]     pL input triangular matrix to be transposed
-  @param[in]     in known variables vector
-  @param[in]     x unknown solutions vector
-  @param[in]     n dimension of the system
-  @return        none
-*/
-
-void mempool_Lttrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x,
-                           const uint32_t n) {
 
-  uint32_t i, j;
-  __fp16 a, b;
-  __fp16 c, d;
-
-  __fp16 as, bs;
-  __fp16 diag;
-
-  float ax, bx, diag_f32;
-  v2h res;
-
-  // Solve for each variable x_i in turn
-  for (i = 0; i < n; i++) {
-    diag = pL[2 * ((n - 1 - i) * n + (n - 1 - i))];
-    as = (__fp16)in[2 * (n - i - 1)];
-    bs = (__fp16)in[2 * (n - i - 1) + 1];
-    // Use the previously solved variables to calculate the sum
-    for (j = 0; j < i; j++) {
-      a = pL[2U * ((n - 1 - j) * n + (n - 1 - i))];
-      b = pL[2U * ((n - 1 - j) * n + (n - 1 - i)) + 1];
-      c = x[2U * (n - 1 - j)];
-      d = x[2U * (n - 1 - j) + 1];
-      asm volatile("fnmsub.h  %[as], %[a], %[c], %[as];"
-                   "fnmsub.h  %[as], %[b], %[d], %[as];"
-                   "fnmsub.h  %[bs], %[a], %[d], %[bs];"
-                   "fmadd.h   %[bs], %[b], %[c], %[bs];"
-                   : [as] "+&r"(as), [bs] "+&r"(bs)
-                   : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
-                   :);
+      uint32_t volatile cidx = transposed ? (n - j - 1) : j;
+      c = x[2U * cidx];
+      d = x[2U * cidx + 1];
+      if (transposed) {
+        a = pL[2U * (cidx * offset + ridx)];
+        b = pL[2U * (cidx * offset + ridx) + 1];
+        asm volatile("fnmsub.h  %[as], %[a], %[c], %[as];"
+                     "fnmsub.h  %[as], %[b], %[d], %[as];"
+                     "fnmsub.h  %[bs], %[a], %[d], %[bs];"
+                     "fmadd.h   %[bs], %[b], %[c], %[bs];"
+                     : [as] "+&r"(as), [bs] "+&r"(bs)
+                     : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
+                     :);
+      } else {
+        a = pL[2U * (ridx * offset + cidx)];
+        b = pL[2U * (ridx * offset + cidx) + 1];
+        asm volatile("fnmsub.h  %[as], %[a], %[c], %[as];"
+                     "fnmsub.h  %[bs], %[a], %[d], %[bs];"
+                     "fmadd.h   %[as], %[b], %[d], %[as];"
+                     "fnmsub.h  %[bs], %[b], %[c], %[bs];"
+                     : [as] "+&r"(as), [bs] "+&r"(bs)
+                     : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
+                     :);
+      }
     }
     // Subtract the sum from b_i and divide by the diagonal element L[i][i]
     asm volatile("fcvt.s.h %0, %1;" : "=r"(diag_f32) : "r"(diag) :);
@@ -321,7 +144,7 @@ void mempool_Lttrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x,
     ax = ax / diag_f32;
     bx = bx / diag_f32;
     asm volatile("vfcpka.h.s %0, %1, %2;" : "=r"(res) : "r"(ax), "r"(bx) :);
-    (*(v2h *)&x[2U * (n - i - 1)]) = res;
+    (*(v2h *)&x[2U * ridx]) = res;
   }
   return;
 }
diff --git a/software/kernels/baremetal/mempool_linearsolver_f32s.h b/software/kernels/baremetal/mempool_linearsolver_f32s.h
index c3f3b6ce1..d3297397c 100644
--- a/software/kernels/baremetal/mempool_linearsolver_f32s.h
+++ b/software/kernels/baremetal/mempool_linearsolver_f32s.h
@@ -15,10 +15,12 @@
   @param[in]     in known variables vector
   @param[in]     x unknown solutions vector
   @param[in]     n dimension of the system
+  @param[in]     transposed solve transposed system
   @return        none
 */
 
-void mempool_Ltrisol_f32s(float *pL, float *in, float *x, const uint32_t n) {
+void mempool_Ltrisol_f32s(float *pL, float *in, float *x, const uint32_t n,
+                          const uint32_t transposed, const uint32_t folded) {
 
   uint32_t i, j;
   float a, b;
@@ -27,122 +29,28 @@ void mempool_Ltrisol_f32s(float *pL, float *in, float *x, const uint32_t n) {
   float as, bs;
   float ax, bx;
   float diag;
+  const uint32_t offset = folded ? N_BANKS : n;
 
   // Solve for each variable x_i in turn
   for (i = 0; i < n; i++) {
-    as = in[2U * i];
-    bs = in[2U * i + 1];
-    diag = pL[2 * (i * n + i)];
+    uint32_t ridx = transposed ? (n - i - 1) : i;
+    diag = pL[2U * (ridx * offset + ridx)];
+    as = in[2U * ridx];
+    bs = in[2U * ridx + 1];
     // Use the previously solved variables to calculate the sum
     for (j = 0; j < i; j++) {
-      a = pL[2U * (i * n + j)];
-      b = pL[2U * (i * n + j) + 1];
-      c = x[2U * j];
-      d = x[2U * j + 1];
-      asm volatile("fnmsub.s %[as], %[a], %[c], %[as];"
-                   "fnmsub.s %[bs], %[a], %[d], %[bs];"
-                   "fmadd.s  %[as], %[b], %[d], %[as];"
-                   "fnmsub.s %[bs], %[b], %[c], %[bs];"
-                   : [as] "+&r"(as), [bs] "+&r"(bs)
-                   : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
-                   :);
-    }
-    // Subtract the sum from b_i and divide by the diagonal element L[i][i]
-    asm volatile("fdiv.s %[ax], %[as], %[diag];"
-                 "fdiv.s %[bx], %[bs], %[diag];"
-                 : [ax] "+&r"(ax), [bx] "+&r"(bx)
-                 : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag)
-                 :);
-    x[2U * i] = ax;
-    x[2U * i + 1] = bx;
-  }
-  return;
-}
-
-/**
-  @brief         Single-core solution of upper triangular system
-                 (from transposed lower triangular system)
-  @param[in]     pL input triangular matrix to be transposed
-  @param[in]     in known variables vector
-  @param[in]     x unknown solutions vector
-  @param[in]     n dimension of the system
-  @return        none
-*/
-
-void mempool_Lttrisol_f32s(float *pL, float *in, float *x, const uint32_t n) {
 
-  uint32_t i, j;
-  float a, b;
-  float c, d;
-
-  float as, bs;
-  float ax, bx;
-  float diag;
-
-  // Solve for each variable x_i in turn
-  for (i = 0; i < n; i++) {
-    diag = pL[2 * ((n - 1 - i) * n + (n - 1 - i))];
-    as = in[2 * (n - i - 1)];
-    bs = in[2 * (n - i - 1) + 1];
-    // Use the previously solved variables to calculate the sum
-    for (j = 0; j < i; j++) {
-      a = pL[2U * ((n - 1 - j) * n + (n - 1 - i))];
-      b = pL[2U * ((n - 1 - j) * n + (n - 1 - i)) + 1];
-      c = x[2U * (n - 1 - j)];
-      d = x[2U * (n - 1 - j) + 1];
-      asm volatile("fnmsub.s  %[as], %[a], %[c], %[as];"
-                   "fnmsub.s %[as], %[b], %[d], %[as];"
-                   "fnmsub.s  %[bs], %[a], %[d], %[bs];"
-                   "fmadd.s  %[bs], %[b], %[c], %[bs];"
-                   : [as] "+&r"(as), [bs] "+&r"(bs)
-                   : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
-                   :);
-    }
-    // Subtract the sum from b_i and divide by the diagonal element L[i][i]
-    asm volatile("fdiv.s %[ax], %[as], %[diag];"
-                 "fdiv.s %[bx], %[bs], %[diag];"
-                 : [ax] "+&r"(ax), [bx] "+&r"(bx)
-                 : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag)
-                 :);
-    x[2U * (n - i - 1)] = ax;
-    x[2U * (n - i - 1) + 1] = bx;
-  }
-  return;
-}
-
-/**
-  @brief        Single-core solution of lower triangular system
-                (from transposed lower triangular system)
-  @param[in]    pL input triangular matrix
-  @param[in]    in known variables vector
-  @param[in]    x unknown solutions vector
-  @param[in]    n dimension of the system
-  @return       none
-*/
-
-void mempool_Ltrisol_folded_f32s(float *pL, float *in, float *x,
-                                 const uint32_t n) {
-
-  uint32_t i, j;
-  float a, b;
-  float c, d;
-
-  float as, bs;
-  float ax, bx;
-  float diag;
-  uint32_t banks_per_row = (n / 2) * N_BANKS;
-
-  // Solve for each variable x_i in turn
-  for (i = 0; i < n; i++) {
-    diag = pL[i * banks_per_row + (i / 2) * N_BANKS + 2 * (i % 2)];
-    as = in[(i / 2) * N_BANKS + 2 * (i % 2)];
-    bs = in[(i / 2) * N_BANKS + 2 * (i % 2) + 1];
-    // Use the previously solved variables to calculate the sum
-    for (j = 0; j < i; j++) {
-      a = pL[i * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2)];
-      b = pL[i * banks_per_row + (j / 2) * N_BANKS + 2 * (j % 2) + 1];
-      c = x[(j / 2) * N_BANKS + 2 * (j % 2)];
-      d = x[(j / 2) * N_BANKS + 2 * (j % 2) + 1];
+      uint32_t cidx = transposed ? (n - j - 1) : j;
+      if (!transposed) {
+        a = pL[2U * (ridx * offset + cidx)];
+        b = pL[2U * (ridx * offset + cidx) + 1];
+      } else {
+        a = pL[2U * (cidx * offset + ridx)];
+        b = pL[2U * (cidx * offset + ridx) + 1];
+      }
+
+      c = x[2U * cidx];
+      d = x[2U * cidx + 1];
       asm volatile("fnmsub.s %[as], %[a], %[c], %[as];"
                    "fnmsub.s %[bs], %[a], %[d], %[bs];"
                    "fmadd.s  %[as], %[b], %[d], %[as];"
@@ -157,67 +65,8 @@ void mempool_Ltrisol_folded_f32s(float *pL, float *in, float *x,
                  : [ax] "+&r"(ax), [bx] "+&r"(bx)
                  : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag)
                  :);
-    x[(i / 2) * N_BANKS + 2 * (i % 2)] = ax;
-    x[(i / 2) * N_BANKS + 2 * (i % 2) + 1] = bx;
-  }
-  return;
-}
-
-/**
-  @brief        Single-core solution of upper triangular system
-                Output data is folded to the core's local memory.
-                (from transposed lower triangular system)
-  @param[in]    pL input triangular matrix to be transposed
-  @param[in]    in known variables vector
-  @param[in]    x unknown solutions vector
-  @param[in]    n dimension of the system
-  @return       none
-*/
-
-void mempool_Lttrisol_folded_f32s(float *pL, float *in, float *x,
-                                  const uint32_t n) {
-
-  uint32_t i, j;
-  float a, b;
-  float c, d;
-
-  float as, bs;
-  float ax, bx;
-  float diag;
-  uint32_t banks_per_row = (n / 2) * N_BANKS;
-
-  // Solve for each variable x_i in turn
-  for (i = 0; i < n; i++) {
-    // reversed i index
-    uint32_t rev_i = (n - 1 - i);
-    diag = pL[rev_i * banks_per_row + (rev_i / 2) * N_BANKS + 2 * (rev_i % 2)];
-    as = in[(rev_i / 2) * N_BANKS + 2 * (rev_i % 2)];
-    bs = in[(rev_i / 2) * N_BANKS + 2 * (rev_i % 2) + 1];
-    // Use the previously solved variables to calculate the sum
-    for (j = 0; j < i; j++) {
-      // reversed j index
-      uint32_t rev_j = (n - 1 - j);
-      a = pL[rev_j * banks_per_row + (rev_i / 2) * N_BANKS + 2 * (rev_i % 2)];
-      b = pL[rev_j * banks_per_row + (rev_i / 2) * N_BANKS + 2 * (rev_i % 2) +
-             1];
-      c = x[2U * rev_j];
-      d = x[2U * rev_j + 1];
-      asm volatile("fnmsub.s  %[as], %[a], %[c], %[as];"
-                   "fnmsub.s %[as], %[b], %[d], %[as];"
-                   "fnmsub.s  %[bs], %[a], %[d], %[bs];"
-                   "fmadd.s  %[bs], %[b], %[c], %[bs];"
-                   : [as] "+&r"(as), [bs] "+&r"(bs)
-                   : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
-                   :);
-    }
-    // Subtract the sum from b_i and divide by the diagonal element L[i][i]
-    asm volatile("fdiv.s %[ax], %[as], %[diag];"
-                 "fdiv.s %[bx], %[bs], %[diag];"
-                 : [ax] "+&r"(ax), [bx] "+&r"(bx)
-                 : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag)
-                 :);
-    x[2U * rev_i] = ax;
-    x[2U * rev_i + 1] = bx;
+    x[2U * ridx] = ax;
+    x[2U * ridx + 1] = bx;
   }
   return;
 }
diff --git a/software/kernels/baremetal/mempool_linearsolver_q16s.h b/software/kernels/baremetal/mempool_linearsolver_q16s.h
index 4c3c2ad4a..cd9134968 100644
--- a/software/kernels/baremetal/mempool_linearsolver_q16s.h
+++ b/software/kernels/baremetal/mempool_linearsolver_q16s.h
@@ -5,7 +5,6 @@
 // Author: Marco Bertuletti, ETH Zurich
 
 #pragma once
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 /**
   @brief         Single-core solution of lower triangular system
@@ -13,11 +12,12 @@
   @param[in]     y known variables vector
   @param[in]     x unknown solutions vector
   @param[in]     n dimension of the system
+  @param[in]     transposed solve transposed system
   @return        none
 */
 
 void mempool_Ltrisol_q16vecs(int16_t *pL, int16_t *y, int16_t *x,
-                             const uint32_t n) {
+                             const uint32_t n, const uint32_t transposed) {
 
   uint32_t i, j;
   int32_t as, bs, diag;
@@ -27,15 +27,20 @@ void mempool_Ltrisol_q16vecs(int16_t *pL, int16_t *y, int16_t *x,
 
   // Solve for each variable x[i] in loop
   for (i = 0; i < n; i++) {
-    // Pre-load the diagonal element
-    diag = pL[2U * (i * n + i)];
+    uint32_t ridx = transposed ? (n - i - 1) : i;
+    diag = pL[2U * (ridx * offset + ridx)];
     // Initialize the sums
     as = 0;
     bs = 0;
     // Use the previously solved variables to compute the sum
     for (j = 0; j < i; j++) {
-      ab = *(v2s *)&pL[2U * (i * n + j)];
-      cd = *(v2s *)&x[2U * j];
+      uint32_t cidx = transposed ? (n - j - 1) : j;
+      if (!transposed) {
+        ab = *(v2s *)&pL[2U * (ridx * n + cidx)];
+      } else {
+        ab = *(v2s *)&pL[2U * (cidx * n + ridx)];
+      }
+      cd = *(v2s *)&x[2U * cidx];
       const uint32_t shuffle_mask = 0x00020003;
       asm volatile(
           // s = s + (ac + bd) + j(bc - ad)
@@ -53,76 +58,15 @@ void mempool_Ltrisol_q16vecs(int16_t *pL, int16_t *y, int16_t *x,
           :);
     }
     // Subtract the sum from y[i] and divide by the diagonal element L[i][i]
-    as = as - (int32_t)y[2U * i];
-    bs = bs - (int32_t)y[2U * i + 1];
+    as = as - (int32_t)y[2U * ridx];
+    bs = bs - (int32_t)y[2U * ridx + 1];
     asm volatile("div       %[as], %[as], %[diag];"
                  "div       %[bs], %[bs], %[diag];"
                  "pv.pack   %[res], %[as], %[bs];"
                  : [as] "+&r"(as), [bs] "+&r"(bs), [res] "+&r"(res)
                  : [diag] "r"(diag)
                  :);
-    (*(v2s *)&x[2U * i]) = res;
-  }
-
-  return;
-}
-
-/**
-  @brief         Single-core solution of upper triangular system
-                 (from transposed lower triangular system)
-  @param[in]     pL input triangular matrix to be transposed
-  @param[in]     y known variables vector
-  @param[in]     x unknown solutions vector
-  @param[in]     n dimension of the system
-  @return        none
-*/
-
-void mempool_Lttrisol_q16vecs(int16_t *pL, int16_t *y, int16_t *x,
-                              const uint32_t n) {
-
-  uint32_t i, j;
-  int32_t as, bs, diag;
-  v2s ab, cd;
-  v2s res = (v2s){0, 0};
-  v2s ndc = (v2s){0, 0};
-
-  // Solve for each variable x[i] in loop
-  for (i = 0; i < n; i++) {
-    // Pre-load the diagonal element
-    diag = pL[2 * ((n - 1 - i) * n + (n - 1 - i))];
-    // Initialize the sums
-    as = 0;
-    bs = 0;
-    // Use the previously solved variables to compute the sum
-    for (j = 0; j < i; j++) {
-      ab = *(v2s *)&pL[2U * ((n - 1 - j) * n + (n - 1 - i))];
-      cd = *(v2s *)&x[2U * (n - 1 - j)];
-      const uint32_t shuffle_mask = 0x00020003;
-      asm volatile(
-          // s = s + (ac + bd) + j(bc - ad)
-          "pv.dotsp.h     %[as],  %[ab],   %[cd];"
-          "pv.shuffle2.h  %[ndc], %[cd],   %[shuffle_mask];"
-          "pv.sub.h       %[ndc], %[zero], %[ndc];"
-          "pv.dotsp.h     %[bs],  %[ab],   %[ndc];"
-          "srai           %[as],  %[as],   0x8;"
-          "srai           %[bs],  %[bs],   0x8;"
-          "p.clip         %[bs],  %[bs],   0x16;"
-          "p.clip         %[as],  %[as],   0x16;"
-          : [as] "+&r"(as), [bs] "+&r"(bs), [ndc] "+r"(ndc)
-          : [ab] "r"(ab), [cd] "r"(cd), [shuffle_mask] "r"(shuffle_mask),
-            [zero] "r"((v2s){0, 0})
-          :);
-    }
-    // Subtract the sum from y[i] and divide by the diagonal element L[i][i]
-    as = as - (int32_t)y[2 * (n - i - 1)];
-    bs = bs - (int32_t)y[2 * (n - i - 1) + 1];
-    asm volatile("div       %[as], %[as], %[diag];"
-                 "div       %[bs], %[bs], %[diag];"
-                 "pv.pack   %[res], %[as], %[bs];"
-                 : [as] "+&r"(as), [bs] "+&r"(bs), [res] "+&r"(res)
-                 : [diag] "r"(diag)
-                 :);
-    (*(v2s *)&x[2U * (n - i - 1)]) = res;
+    (*(v2s *)&x[2U * ridx]) = res;
   }
 
   return;
diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h
index 37834107d..45076f9fe 100644
--- a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h
+++ b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h
@@ -31,7 +31,7 @@
 */
 void mempool_hermitian_f16s(__fp16 *pH, __fp16 *pG, __fp16 *pS,
                             const uint32_t n_rx, const uint32_t n_tx,
-                            const uint32_t folded, const uint32_t zf) {
+                            const uint32_t zf, const uint32_t folded) {
 
   uint32_t i, j, k;
   __fp16 a;
@@ -112,29 +112,16 @@ void mempool_hermitian_f16s(__fp16 *pH, __fp16 *pG, __fp16 *pS,
           bs3 = (__fp16)0.0f;
         }
       }
-      if (!folded) {
-        // Store
-        pG[2 * (i * n_tx + j)] = as0;
-        pG[2 * (i * n_tx + j + 1U)] = as1;
-        pG[2 * (i * n_tx + j + 2U)] = as2;
-        pG[2 * (i * n_tx + j + 3U)] = as3;
-        pG[2 * (i * n_tx + j) + 1U] = bs0;
-        pG[2 * (i * n_tx + j + 1U) + 1U] = bs1;
-        pG[2 * (i * n_tx + j + 2U) + 1U] = bs2;
-        pG[2 * (i * n_tx + j + 3U) + 1U] = bs3;
-      } else {
-        // uint32_t addr = i * (n_tx / 4) * N_BANKS + (j / 4) * N_BANKS;
-        uint32_t addr = i * N_BANKS;
-        // Store
-        pG[addr] = as0;
-        pG[addr + 1U] = bs0;
-        pG[addr + 2U] = as1;
-        pG[addr + 3U] = bs1;
-        pG[addr + 4U] = as2;
-        pG[addr + 5U] = bs2;
-        pG[addr + 6U] = as3;
-        pG[addr + 7U] = bs3;
-      }
+      uint32_t const offset = folded ? N_BANKS : n_tx;
+      // Store
+      pG[2 * (i * offset + j)] = as0;
+      pG[2 * (i * offset + j + 1U)] = as1;
+      pG[2 * (i * offset + j + 2U)] = as2;
+      pG[2 * (i * offset + j + 3U)] = as3;
+      pG[2 * (i * offset + j) + 1U] = bs0;
+      pG[2 * (i * offset + j + 1U) + 1U] = bs1;
+      pG[2 * (i * offset + j + 2U) + 1U] = bs2;
+      pG[2 * (i * offset + j + 3U) + 1U] = bs3;
     }
   }
   return;
@@ -151,8 +138,7 @@ void mempool_hermitian_f16s(__fp16 *pH, __fp16 *pG, __fp16 *pS,
   @return        none
 */
 void mempool_MVP_conjtransp_f16s(__fp16 *pH, __fp16 *px, __fp16 *py,
-                                 const uint32_t n_rx, const uint32_t n_tx,
-                                 const uint32_t folded) {
+                                 const uint32_t n_rx, const uint32_t n_tx) {
 
   uint32_t i, j;
   __fp16 a0, a1, a2, a3;
@@ -215,31 +201,16 @@ void mempool_MVP_conjtransp_f16s(__fp16 *pH, __fp16 *px, __fp16 *py,
             [a3] "r"(a3), [b0] "r"(b0), [b1] "r"(b1), [b2] "r"(b2), [b3] "r"(b3)
           :);
     }
-    if (!folded) {
-      // Store
-      py[2U * i] = as0;
-      py[2U * (i + 1U)] = as1;
-      py[2U * (i + 2U)] = as2;
-      py[2U * (i + 3U)] = as3;
-      py[2U * i + 1U] = bs0;
-      py[2U * (i + 1U) + 1U] = bs1;
-      py[2U * (i + 2U) + 1U] = bs2;
-      py[2U * (i + 3U) + 1U] = bs3;
-      i += 4;
-    } else {
-      // Store
-      uint32_t addr = (i / 4) * N_BANKS;
-      py[addr] = as0;
-      py[addr + 1U] = bs0;
-      py[addr + 2U] = as1;
-      py[addr + 3U] = bs1;
-      py[addr + 4U] = as2;
-      py[addr + 5U] = bs2;
-      py[addr + 6U] = as3;
-      py[addr + 7U] = bs3;
-      i += 4;
-    }
-
+    // Store
+    py[2U * i] = as0;
+    py[2U * (i + 1U)] = as1;
+    py[2U * (i + 2U)] = as2;
+    py[2U * (i + 3U)] = as3;
+    py[2U * i + 1U] = bs0;
+    py[2U * (i + 1U) + 1U] = bs1;
+    py[2U * (i + 2U) + 1U] = bs2;
+    py[2U * (i + 3U) + 1U] = bs3;
+    i += 4;
   } while (i < n_tx);
   return;
 }
@@ -267,7 +238,7 @@ void mempool_MVP_conjtransp_f16s(__fp16 *pH, __fp16 *px, __fp16 *py,
 */
 void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS,
                                const uint32_t n_rx, const uint32_t n_tx,
-                               const uint32_t zf) {
+                               const uint32_t zf, const uint32_t folded) {
   uint32_t i, j, k;
   v2h ab;
   v2h cd0, cd1, cd2, cd3;
@@ -282,7 +253,7 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS,
 
   for (i = 0; i < n_tx; i++) {
 
-    if (n_tx == 1) {
+    if (n_tx % 4 != 0) {
       as0 = 0.0f; // Initialize the real part of sums
       bs0 = 0.0f; // Initialize the imag part of sums
       // Inner Loop
@@ -312,9 +283,11 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS,
         asm volatile("and     %0, %0, %1;" : "+&r"(res0) : "r"(0x0000FFFF));
         asm volatile("fadd.h  %0, %0, %1;" : "+&r"(res0) : "r"(pS[2 * i]));
       }
-      (*(v2h *)&pG[2 * (i * n_tx + j)]) = res0;
+      // Store
+      uint32_t addr = folded ? 2 * (i * N_BANKS + j) : 2 * (i * n_tx + j);
+      (*(v2h *)&pG[addr]) = res0;
 
-    } else if (n_tx >= 4) {
+    } else {
       // UNROLL_4
       for (j = 0; j < n_tx; j += 4) {
         as0 = 0.0f;
@@ -381,11 +354,12 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS,
             asm volatile("fadd.h  %0, %0, %1;" : "+&r"(res3) : "r"(pS[2 * i]));
           }
         }
+        uint32_t const offset = folded ? N_BANKS : n_tx;
         // Store
-        (*(v2h *)&pG[2 * (i * n_tx + j)]) = res0;
-        (*(v2h *)&pG[2 * (i * n_tx + j + 1U)]) = res1;
-        (*(v2h *)&pG[2 * (i * n_tx + j + 2U)]) = res2;
-        (*(v2h *)&pG[2 * (i * n_tx + j + 3U)]) = res3;
+        (*(v2h *)&pG[2 * (i * offset + j)]) = res0;
+        (*(v2h *)&pG[2 * (i * offset + j + 1U)]) = res1;
+        (*(v2h *)&pG[2 * (i * offset + j + 2U)]) = res2;
+        (*(v2h *)&pG[2 * (i * offset + j + 3U)]) = res3;
       }
     }
   }
@@ -393,7 +367,7 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS,
 #else
 
   for (i = 0; i < n_tx; i++) {
-    if (n_tx >= 4) {
+    if (n_tx % 4 == 0) {
       // UNROLL_4
       for (j = 0; j < n_tx; j += 4) {
         v2h res0 = (v2h)0.0f;
@@ -439,10 +413,12 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS,
             asm volatile("fadd.h  %0, %0, %1;" : "+&r"(res3) : "r"(pS[2 * i]));
           }
         }
-        (*(v2h *)&pG[2 * (i * n_tx + j)]) = res0;
-        (*(v2h *)&pG[2 * (i * n_tx + j + 1U)]) = res1;
-        (*(v2h *)&pG[2 * (i * n_tx + j + 2U)]) = res2;
-        (*(v2h *)&pG[2 * (i * n_tx + j + 3U)]) = res3;
+        uint32_t const offset = folded ? N_BANKS : n_tx;
+        // Store
+        (*(v2h *)&pG[2 * (i * offset + j)]) = res0;
+        (*(v2h *)&pG[2 * (i * offset + j + 1U)]) = res1;
+        (*(v2h *)&pG[2 * (i * offset + j + 2U)]) = res2;
+        (*(v2h *)&pG[2 * (i * offset + j + 3U)]) = res3;
       }
     }
   }
@@ -478,7 +454,7 @@ void mempool_MVP_conjtransp_f16vecs(__fp16 *pH, __fp16 *px, __fp16 *py,
 #endif
 
 #ifndef __CDOTP
-  if (n_tx < 4) {
+  if (n_tx % 4 != 0) {
     for (i = 0; i < n_tx; i++) {
       as0 = 0.0f; // Initialize the real part of sums
       bs0 = 0.0f; // Initialize the imag part of sums
@@ -505,7 +481,7 @@ void mempool_MVP_conjtransp_f16vecs(__fp16 *pH, __fp16 *px, __fp16 *py,
       *(v2h *)&py[2U * i] = res0;
     }
 
-  } else if (n_tx >= 4) {
+  } else {
     // UNROLL_4
     for (i = 0; i < n_tx; i += 4) {
       as0 = 0.0f;
@@ -557,7 +533,7 @@ void mempool_MVP_conjtransp_f16vecs(__fp16 *pH, __fp16 *px, __fp16 *py,
     }
   }
 #else
-  if (n_tx >= 4) {
+  if (n_tx % 4 == 0) {
     // UNROLL_4
     for (i = 0; i < n_tx; i += 4) {
       res0 = (v2h)0.0f;
diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f32p.h b/software/kernels/baremetal/mempool_mimo_mmse_f32p.h
index 0dd492d33..7e3e6fe1a 100644
--- a/software/kernels/baremetal/mempool_mimo_mmse_f32p.h
+++ b/software/kernels/baremetal/mempool_mimo_mmse_f32p.h
@@ -18,7 +18,7 @@
   @param[in]     zf     controls if the zero forcing is used
   @return        none
 */
-void mempool_hermitian_f32p(float *pH, float *pG, float *sigma,
+void mempool_hermitian_f32p(float *pH, float *pG, float *pS,
                             const uint32_t n_rx, const uint32_t n_tx,
                             const uint32_t folded, const uint32_t zf,
                             const uint32_t core_id, const uint32_t nPE) {
@@ -88,55 +88,30 @@ void mempool_hermitian_f32p(float *pH, float *pG, float *sigma,
       }
       if (zf == 0) {
         // Compute diagonal element
-        float s = sigma[i];
         if (i == j) {
-          asm volatile("fadd.s  %[as0], %[as0], %[sigma];"
-                       : [as0] "+&r"(as0)
-                       : [sigma] "r"(s)
-                       :);
+          asm volatile("fadd.s  %0, %0, %1;" : "+&r"(as0) : "r"(pS[2U * i]));
           bs0 = 0.0f;
         } else if (i == (j + 1U)) {
-          asm volatile("fadd.s  %[as1], %[as1], %[sigma];"
-                       : [as1] "+&r"(as1)
-                       : [sigma] "r"(s)
-                       :);
+          asm volatile("fadd.s  %0, %0, %1;" : "+&r"(as1) : "r"(pS[2U * i]));
           bs1 = 0.0f;
         } else if (i == (j + 2U)) {
-          asm volatile("fadd.s  %[as2], %[as2], %[sigma];"
-                       : [as2] "+&r"(as2)
-                       : [sigma] "r"(s)
-                       :);
+          asm volatile("fadd.s  %0, %0, %1;" : "+&r"(as2) : "r"(pS[2U * i]));
           bs2 = 0.0f;
         } else if (i == (j + 3U)) {
-          asm volatile("fadd.s  %[as3], %[as3], %[sigma];"
-                       : [as3] "+&r"(as3)
-                       : [sigma] "r"(s)
-                       :);
+          asm volatile("fadd.s  %0, %0, %1;" : "+&r"(as3) : "r"(pS[2U * i]));
           bs3 = 0.0f;
         }
       }
-      if (!folded) {
-        // Store
-        pG[2 * (i * n_tx + j)] = as0;
-        pG[2 * (i * n_tx + j + 1U)] = as1;
-        pG[2 * (i * n_tx + j + 2U)] = as2;
-        pG[2 * (i * n_tx + j + 3U)] = as3;
-        pG[2 * (i * n_tx + j) + 1U] = bs0;
-        pG[2 * (i * n_tx + j + 1U) + 1U] = bs1;
-        pG[2 * (i * n_tx + j + 2U) + 1U] = bs2;
-        pG[2 * (i * n_tx + j + 3U) + 1U] = bs3;
-      } else {
-        // Store
-        uint32_t addr = i * ((n_tx / 2) * N_BANKS) + (j / 4) * (2 * N_BANKS);
-        pG[addr] = as0;
-        pG[addr + 1U] = bs0;
-        pG[addr + 2U] = as1;
-        pG[addr + 3U] = bs1;
-        pG[addr + N_BANKS] = as2;
-        pG[addr + N_BANKS + 1U] = bs2;
-        pG[addr + N_BANKS + 2U] = as3;
-        pG[addr + N_BANKS + 3U] = bs3;
-      }
+      uint32_t const offset = folded ? N_BANKS : n_tx;
+      // Store
+      pG[2 * (i * offset + j)] = as0;
+      pG[2 * (i * offset + j + 1U)] = as1;
+      pG[2 * (i * offset + j + 2U)] = as2;
+      pG[2 * (i * offset + j + 3U)] = as3;
+      pG[2 * (i * offset + j) + 1U] = bs0;
+      pG[2 * (i * offset + j + 1U) + 1U] = bs1;
+      pG[2 * (i * offset + j + 2U) + 1U] = bs2;
+      pG[2 * (i * offset + j + 3U) + 1U] = bs3;
     }
   }
   mempool_log_partial_barrier(2, mempool_get_core_id(), nPE);
diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f32s.h b/software/kernels/baremetal/mempool_mimo_mmse_f32s.h
index 311052b10..baad28e0d 100644
--- a/software/kernels/baremetal/mempool_mimo_mmse_f32s.h
+++ b/software/kernels/baremetal/mempool_mimo_mmse_f32s.h
@@ -14,13 +14,13 @@
   @param[in]     pS     points to the noise vector
   @param[in]     nrx    number of received samples
   @param[in]     ntx    number of transmitted samples
-  @param[in]     folded controls if output is folded
   @param[in]     zf     controls if the zero forcing is used
+  @param[in]     folded controls if output is folded
   @return        none
 */
 void mempool_hermitian_f32s(float *pH, float *pG, float *pS,
                             const uint32_t n_rx, const uint32_t n_tx,
-                            const uint32_t folded, const uint32_t zf) {
+                            const uint32_t zf, const uint32_t folded) {
 
   uint32_t i, j, k;
   float a;
@@ -86,55 +86,31 @@ void mempool_hermitian_f32s(float *pH, float *pG, float *pS,
             :);
       }
       if (zf == 0) {
-        // Compute diagonal elements
+        // Compute diagonal element
         if (i == j) {
-          asm volatile("fadd.s  %[as0], %[as0], %[pS];"
-                       : [as0] "+&r"(as0)
-                       : [pS] "r"(pS[i])
-                       :);
+          asm volatile("fadd.s  %0, %0, %1;" : "+&r"(as0) : "r"(pS[2U * i]));
           bs0 = 0.0f;
         } else if (i == (j + 1U)) {
-          asm volatile("fadd.s  %[as1], %[as1], %[pS];"
-                       : [as1] "+&r"(as1)
-                       : [pS] "r"(pS[i])
-                       :);
+          asm volatile("fadd.s  %0, %0, %1;" : "+&r"(as1) : "r"(pS[2U * i]));
           bs1 = 0.0f;
         } else if (i == (j + 2U)) {
-          asm volatile("fadd.s  %[as2], %[as2], %[pS];"
-                       : [as2] "+&r"(as2)
-                       : [pS] "r"(pS[i])
-                       :);
+          asm volatile("fadd.s  %0, %0, %1;" : "+&r"(as2) : "r"(pS[2U * i]));
           bs2 = 0.0f;
         } else if (i == (j + 3U)) {
-          asm volatile("fadd.s  %[as3], %[as3], %[pS];"
-                       : [as3] "+&r"(as3)
-                       : [pS] "r"(pS[i])
-                       :);
+          asm volatile("fadd.s  %0, %0, %1;" : "+&r"(as3) : "r"(pS[2U * i]));
           bs3 = 0.0f;
         }
       }
-      if (!folded) {
-        // Store
-        pG[2 * (i * n_tx + j)] = as0;
-        pG[2 * (i * n_tx + j + 1U)] = as1;
-        pG[2 * (i * n_tx + j + 2U)] = as2;
-        pG[2 * (i * n_tx + j + 3U)] = as3;
-        pG[2 * (i * n_tx + j) + 1U] = bs0;
-        pG[2 * (i * n_tx + j + 1U) + 1U] = bs1;
-        pG[2 * (i * n_tx + j + 2U) + 1U] = bs2;
-        pG[2 * (i * n_tx + j + 3U) + 1U] = bs3;
-      } else {
-        // Store
-        uint32_t addr = i * ((n_tx / 2) * N_BANKS) + (j / 4) * (2 * N_BANKS);
-        pG[addr] = as0;
-        pG[addr + 1U] = bs0;
-        pG[addr + 2U] = as1;
-        pG[addr + 3U] = bs1;
-        pG[addr + N_BANKS] = as2;
-        pG[addr + N_BANKS + 1U] = bs2;
-        pG[addr + N_BANKS + 2U] = as3;
-        pG[addr + N_BANKS + 3U] = bs3;
-      }
+      uint32_t const offset = folded ? N_BANKS : n_tx;
+      // Store
+      pG[2U * (i * offset + j)] = as0;
+      pG[2U * (i * offset + j + 1U)] = as1;
+      pG[2U * (i * offset + j + 2U)] = as2;
+      pG[2U * (i * offset + j + 3U)] = as3;
+      pG[2U * (i * offset + j) + 1U] = bs0;
+      pG[2U * (i * offset + j + 1U) + 1U] = bs1;
+      pG[2U * (i * offset + j + 2U) + 1U] = bs2;
+      pG[2U * (i * offset + j + 3U) + 1U] = bs3;
     }
   }
   return;
@@ -215,30 +191,16 @@ void mempool_MVP_conjtransp_f32s(float *pH, float *px, float *py,
             [a3] "r"(a3), [b0] "r"(b0), [b1] "r"(b1), [b2] "r"(b2), [b3] "r"(b3)
           :);
     }
-    if (!folded) {
-      // Store
-      py[2U * i] = as0;
-      py[2U * (i + 1U)] = as1;
-      py[2U * (i + 2U)] = as2;
-      py[2U * (i + 3U)] = as3;
-      py[2U * i + 1U] = bs0;
-      py[2U * (i + 1U) + 1U] = bs1;
-      py[2U * (i + 2U) + 1U] = bs2;
-      py[2U * (i + 3U) + 1U] = bs3;
-      i += 4;
-    } else {
-      // Store
-      uint32_t addr = (i / 4) * 2 * N_BANKS;
-      py[addr] = as0;
-      py[addr + 1U] = bs0;
-      py[addr + 2U] = as1;
-      py[addr + 3U] = bs1;
-      py[addr + N_BANKS] = as2;
-      py[addr + N_BANKS + 1U] = bs2;
-      py[addr + N_BANKS + 2U] = as3;
-      py[addr + N_BANKS + 3U] = bs3;
-      i += 4;
-    }
+    // Store
+    py[2U * i] = as0;
+    py[2U * (i + 1U)] = as1;
+    py[2U * (i + 2U)] = as2;
+    py[2U * (i + 3U)] = as3;
+    py[2U * i + 1U] = bs0;
+    py[2U * (i + 1U) + 1U] = bs1;
+    py[2U * (i + 2U) + 1U] = bs2;
+    py[2U * (i + 3U) + 1U] = bs3;
+    i += 4;
   } while (i < n_tx);
   return;
 }
diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f8s.h b/software/kernels/baremetal/mempool_mimo_mmse_f8s.h
index 74d772c64..476d5f3bb 100644
--- a/software/kernels/baremetal/mempool_mimo_mmse_f8s.h
+++ b/software/kernels/baremetal/mempool_mimo_mmse_f8s.h
@@ -20,7 +20,7 @@
 */
 void mempool_hermitian_f8s(__fp8 *pH, __fp8 *pS, __fp16 *pG,
                            const uint32_t n_rx, const uint32_t n_tx,
-                           const uint32_t zf) {
+                           const uint32_t zf, const uint32_t folded) {
 
   uint32_t i, j, k;
   __fp8 a;
@@ -116,16 +116,16 @@ void mempool_hermitian_f8s(__fp8 *pH, __fp8 *pS, __fp16 *pG,
           FP16_bs3 = (__fp16)0U;
         }
       }
-
+      uint32_t const offset = folded ? N_BANKS : n_tx;
       // Store
-      pG[2 * (i * n_tx + j)] = FP16_as0;
-      pG[2 * (i * n_tx + j + 1U)] = FP16_as1;
-      pG[2 * (i * n_tx + j + 2U)] = FP16_as2;
-      pG[2 * (i * n_tx + j + 3U)] = FP16_as3;
-      pG[2 * (i * n_tx + j) + 1U] = FP16_bs0;
-      pG[2 * (i * n_tx + j + 1U) + 1U] = FP16_bs1;
-      pG[2 * (i * n_tx + j + 2U) + 1U] = FP16_bs2;
-      pG[2 * (i * n_tx + j + 3U) + 1U] = FP16_bs3;
+      pG[2 * (i * offset + j)] = FP16_as0;
+      pG[2 * (i * offset + j + 1U)] = FP16_as1;
+      pG[2 * (i * offset + j + 2U)] = FP16_as2;
+      pG[2 * (i * offset + j + 3U)] = FP16_as3;
+      pG[2 * (i * offset + j) + 1U] = FP16_bs0;
+      pG[2 * (i * offset + j + 1U) + 1U] = FP16_bs1;
+      pG[2 * (i * offset + j + 2U) + 1U] = FP16_bs2;
+      pG[2 * (i * offset + j + 3U) + 1U] = FP16_bs3;
     }
   }
   return;
@@ -229,7 +229,7 @@ void mempool_MVP_conjtransp_f8s(__fp8 *pH, __fp8 *px, __fp16 *py,
 
 void mempool_hermitian_f8vecs(__fp8 *pH, __fp8 *pS, __fp16 *pG,
                               const uint32_t n_rx, const uint32_t n_tx,
-                              const uint32_t zf) {
+                              const uint32_t zf, const uint32_t folded) {
 
   uint32_t i, j, k;
   uint32_t fe0, fe1, fe2, fe3;
@@ -303,11 +303,12 @@ void mempool_hermitian_f8vecs(__fp8 *pH, __fp8 *pS, __fp16 *pG,
           asm volatile("and     %0, %0, %1;" : "+&r"(fe3) : "r"(0x0000FFFF));
         }
       }
+      uint32_t const offset = folded ? N_BANKS : n_tx;
       // Store
-      (*(uint32_t *)&pG[2 * (i * n_tx + j)]) = fe0;
-      (*(uint32_t *)&pG[2 * (i * n_tx + j + 1U)]) = fe1;
-      (*(uint32_t *)&pG[2 * (i * n_tx + j + 2U)]) = fe2;
-      (*(uint32_t *)&pG[2 * (i * n_tx + j + 3U)]) = fe3;
+      (*(uint32_t *)&pG[2 * (i * offset + j)]) = fe0;
+      (*(uint32_t *)&pG[2 * (i * offset + j + 1U)]) = fe1;
+      (*(uint32_t *)&pG[2 * (i * offset + j + 2U)]) = fe2;
+      (*(uint32_t *)&pG[2 * (i * offset + j + 3U)]) = fe3;
     }
   }
   return;