From 6b6adca2e23adfa9bb450642a561ab1437a8f22d Mon Sep 17 00:00:00 2001
From: Magnus Lundmark <magnuslundmark@gmail.com>
Date: Sat, 14 Oct 2023 12:43:02 +0200
Subject: [PATCH] minor renames and fixes

Signed-off-by: Magnus Lundmark <magnuslundmark@gmail.com>
---
 include/volk/volk_avx2_fma_intrinsics.h |   8 +-
 include/volk/volk_avx_intrinsics.h      |   9 +-
 kernels/volk/volk_32f_reciprocal_32f.h  | 112 +++++-------------------
 3 files changed, 31 insertions(+), 98 deletions(-)

diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
index a5ee947c..ba099604 100644
--- a/include/volk/volk_avx2_fma_intrinsics.h
+++ b/include/volk/volk_avx2_fma_intrinsics.h
@@ -18,12 +18,14 @@
 
 /*
  * First order Newton-Raphson approximation of 1 / x
+ * x_1 = x_0 * (2 - x_0 * x)
  */
-static inline __m256 _mm256_rcp1_avx2_fma_ps(const __m256 x)
+static inline __m256 _mm256_reciprocal_1_avx2_fma_ps(const __m256 x)
 {
     const __m256 TWO = _mm256_set1_ps(0x1.0p1f); // 2.0f
-    const __m256 x_inv = _mm256_rcp_ps(x);
-    return _mm256_mul_ps(x_inv, _mm256_fnmadd_ps(x_inv, x, TWO));
+    const __m256 x0 = _mm256_rcp_ps(x);
+    const __m256 x1 = _mm256_mul_ps(x0, _mm256_fnmadd_ps(x0, x, TWO));
+    return x1;
 }
 
 /*
diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
index 48a8ee14..6be73ad7 100644
--- a/include/volk/volk_avx_intrinsics.h
+++ b/include/volk/volk_avx_intrinsics.h
@@ -19,13 +19,14 @@
 
 /*
  * First order Newton-Raphson approximation of 1 / x
+ * x_1 = x_0 * (2 - x_0 * x)
  */
-static inline __m256 _mm256_rcp1_avx_ps(const __m256 x)
+static inline __m256 _mm256_reciprocal_1_avx_ps(const __m256 x)
 {
     const __m256 TWO = _mm256_set1_ps(0x1.0p1f); // 2.0f
-    const __m256 x_inv = _mm256_rcp_ps(x);
-    const __m256 y = _mm256_sub_ps(TWO, _mm256_mul_ps(x_inv, x));
-    return _mm256_mul_ps(x_inv, y);
+    const __m256 x0 = _mm256_rcp_ps(x);
+    const __m256 x1 = _mm256_mul_ps(x0, _mm256_sub_ps(TWO, _mm256_mul_ps(x0, x)));
+    return x1;
 }
 
 /*
diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h
index ae744bd2..8b916b5d 100644
--- a/kernels/volk/volk_32f_reciprocal_32f.h
+++ b/kernels/volk/volk_32f_reciprocal_32f.h
@@ -56,37 +56,31 @@
 static inline void
 volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points)
 {
-    unsigned int i = 0;
-    for (; i < num_points; i++) {
+    for (unsigned int i = 0; i < num_points; i++) {
         out[i] = 1.f / in[i];
     }
 }
 #endif /* LV_HAVE_GENERIC */
 
-
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
 #include <volk/volk_avx2_fma_intrinsics.h>
 static inline void
 volk_32f_reciprocal_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
 {
-    unsigned int number = 0;
     const unsigned int eighth_points = num_points / 8;
-
-    for (; number < eighth_points; number++) {
+    for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_load_ps(in);
         in += 8;
 
-        __m256 x_inv = _mm256_rcp1_avx2_fma_ps(x);
+        __m256 r = _mm256_reciprocal_1_avx2_fma_ps(x);
 
-        _mm256_store_ps(out, x_inv);
+        _mm256_store_ps(out, r);
         out += 8;
     }
 
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
-        *out++ = 1.f / (*in++);
-    }
+    const unsigned int done = eighth_points * 8;
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
 }
 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
 
@@ -96,49 +90,19 @@ volk_32f_reciprocal_32f_a_avx2_fma(float* out, const float* in, unsigned int num
 static inline void
 volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points)
 {
-    unsigned int number = 0;
-    const unsigned int eighth_points = num_points / 8;
-
-    for (; number < eighth_points; number++) {
-        __m256 x = _mm256_load_ps(in);
-        in += 8;
-
-        __m256 x_inv = _mm256_rcp1_avx_ps(x);
-
-        _mm256_store_ps(out, x_inv);
-        out += 8;
-    }
-
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
-        *out++ = 1.f / (*in++);
-    }
-}
-#endif /* LV_HAVE_AVX */
-
-#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-static inline void
-volk_32f_reciprocal_32f_a_avx_div(float* out, const float* in, unsigned int num_points)
-{
-    unsigned int number = 0;
     const unsigned int eighth_points = num_points / 8;
-    const __m256 ONE = _mm256_set1_ps(1.0f);
-
-    for (; number < eighth_points; number++) {
+    for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_load_ps(in);
         in += 8;
 
-        __m256 x_inv = _mm256_div_ps(ONE, x);
+        __m256 r = _mm256_reciprocal_1_avx_ps(x);
 
-        _mm256_store_ps(out, x_inv);
+        _mm256_store_ps(out, r);
         out += 8;
     }
 
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
-        *out++ = 1.f / (*in++);
-    }
+    const unsigned int done = eighth_points * 8;
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
 }
 #endif /* LV_HAVE_AVX */
 
@@ -153,23 +117,19 @@ volk_32f_reciprocal_32f_a_avx_div(float* out, const float* in, unsigned int num_
 static inline void
 volk_32f_reciprocal_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
 {
-    unsigned int number = 0;
     const unsigned int eighth_points = num_points / 8;
-
-    for (; number < eighth_points; number++) {
+    for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_loadu_ps(in);
         in += 8;
 
-        __m256 x_inv = _mm256_rcp1_avx2_fma_ps(x);
+        __m256 r = _mm256_reciprocal_1_avx2_fma_ps(x);
 
-        _mm256_storeu_ps(out, x_inv);
+        _mm256_storeu_ps(out, r);
         out += 8;
     }
 
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
-        *out++ = 1.f / (*in++);
-    }
+    const unsigned int done = eighth_points * 8;
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
 }
 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
 
@@ -179,49 +139,19 @@ volk_32f_reciprocal_32f_u_avx2_fma(float* out, const float* in, unsigned int num
 static inline void
 volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points)
 {
-    unsigned int number = 0;
     const unsigned int eighth_points = num_points / 8;
-
-    for (; number < eighth_points; number++) {
+    for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_loadu_ps(in);
         in += 8;
 
-        __m256 x_inv = _mm256_rcp1_avx_ps(x);
+        __m256 r = _mm256_reciprocal_1_avx_ps(x);
 
-        _mm256_storeu_ps(out, x_inv);
+        _mm256_storeu_ps(out, r);
         out += 8;
     }
 
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
-        *out++ = 1.f / (*in++);
-    }
-}
-#endif /* LV_HAVE_AVX */
-
-#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-static inline void
-volk_32f_reciprocal_32f_u_avx_div(float* out, const float* in, unsigned int num_points)
-{
-    unsigned int number = 0;
-    const unsigned int eighth_points = num_points / 8;
-    const __m256 ONE = _mm256_set1_ps(1.0f);
-
-    for (; number < eighth_points; number++) {
-        __m256 x = _mm256_loadu_ps(in);
-        in += 8;
-
-        __m256 x_inv = _mm256_div_ps(ONE, x);
-
-        _mm256_storeu_ps(out, x_inv);
-        out += 8;
-    }
-
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
-        *out++ = 1.f / (*in++);
-    }
+    const unsigned int done = eighth_points * 8;
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
 }
 #endif /* LV_HAVE_AVX */