gnuradio · Ka-zam · Oct 13, 2023 · Oct 13, 2023 · Oct 13, 2023 · Oct 14, 2023
diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
@@ -16,6 +16,16 @@
 #define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
 #include <immintrin.h>
 
+/*
+ * First order Newton-Raphson approximation of 1 / x
+ */
+static inline __m256 _mm256_rcp1_avx2_fma_ps(const __m256 x)
+{
+    const __m256 TWO = _mm256_set1_ps(0x1.0p1f); // 2.0f
+    const __m256 x_inv = _mm256_rcp_ps(x);
+    return _mm256_mul_ps(x_inv, _mm256_fnmadd_ps(x_inv, x, TWO));
+}
+
 /*
  * Approximate arctan(x) via polynomial expansion
  * on the interval [-1, 1]

diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
@@ -17,6 +17,17 @@
 #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
 #include <immintrin.h>
 
+/*
+ * First order Newton-Raphson approximation of 1 / x
+ */
+static inline __m256 _mm256_rcp1_avx_ps(const __m256 x)
+{
+    const __m256 TWO = _mm256_set1_ps(0x1.0p1f); // 2.0f
+    const __m256 x_inv = _mm256_rcp_ps(x);
+    const __m256 y = _mm256_sub_ps(TWO, _mm256_mul_ps(x_inv, x));
+    return _mm256_mul_ps(x_inv, y);
+}
+
 /*
  * Approximate arctan(x) via polynomial expansion
  * on the interval [-1, 1]

diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h
@@ -0,0 +1,228 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2023 Magnus Lundmark <[email protected]>
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_reciprocal_32f
+ *
+ * \b Overview
+ *
+ * Computes the reciprocal of the input vector and stores the results
+ * in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_reciprocal_32f(float* out, const float* in, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li in: A pointer to the input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: A pointer to the output vector of floats.
+ *
+ * \b Example
+ * \code
+    int N = 10;
+    unsigned int alignment = volk_get_alignment();
+    float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+    float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+
+    for(unsigned int ii = 1; ii < N; ++ii){
+        in[ii] = (float)(ii*ii);
+    }
+
+    volk_32f_reciprocal_32f(out, in, N);
+
+    for(unsigned int ii = 0; ii < N; ++ii){
+        printf("out(%i) = %f\n", ii, out[ii]);
+    }
+
+    volk_free(in);
+    volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_reciprocal_32f_a_H
+#define INCLUDED_volk_32f_reciprocal_32f_a_H
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points)
+{
+    unsigned int i = 0;
+    for (; i < num_points; i++) {
+        out[i] = 1.f / in[i];
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+#include <volk/volk_avx2_fma_intrinsics.h>
+static inline void
+volk_32f_reciprocal_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int eighth_points = num_points / 8;
+
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_load_ps(in);
+        in += 8;
+
+        __m256 x_inv = _mm256_rcp1_avx2_fma_ps(x);
+
+        _mm256_store_ps(out, x_inv);
+        out += 8;
+    }
+
+    number = eighth_points * 8;
+    for (; number < num_points; number++) {
+        *out++ = 1.f / (*in++);
+    }
+}
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+static inline void
+volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int eighth_points = num_points / 8;
+
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_load_ps(in);
+        in += 8;
+
+        __m256 x_inv = _mm256_rcp1_avx_ps(x);
+
+        _mm256_store_ps(out, x_inv);
+        out += 8;
+    }
+
+    number = eighth_points * 8;
+    for (; number < num_points; number++) {
+        *out++ = 1.f / (*in++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+static inline void
+volk_32f_reciprocal_32f_a_avx_div(float* out, const float* in, unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int eighth_points = num_points / 8;
+    const __m256 ONE = _mm256_set1_ps(1.0f);
+
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_load_ps(in);
+        in += 8;
+
+        __m256 x_inv = _mm256_div_ps(ONE, x);
+
+        _mm256_store_ps(out, x_inv);
+        out += 8;
+    }
+
+    number = eighth_points * 8;
+    for (; number < num_points; number++) {
+        *out++ = 1.f / (*in++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_reciprocal_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_reciprocal_32f_u_H
+#define INCLUDED_volk_32f_reciprocal_32f_u_H
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+#include <volk/volk_avx2_fma_intrinsics.h>
+static inline void
+volk_32f_reciprocal_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int eighth_points = num_points / 8;
+
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_loadu_ps(in);
+        in += 8;
+
+        __m256 x_inv = _mm256_rcp1_avx2_fma_ps(x);
+
+        _mm256_storeu_ps(out, x_inv);
+        out += 8;
+    }
+
+    number = eighth_points * 8;
+    for (; number < num_points; number++) {
+        *out++ = 1.f / (*in++);
+    }
+}
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+static inline void
+volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int eighth_points = num_points / 8;
+
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_loadu_ps(in);
+        in += 8;
+
+        __m256 x_inv = _mm256_rcp1_avx_ps(x);
+
+        _mm256_storeu_ps(out, x_inv);
+        out += 8;
+    }
+
+    number = eighth_points * 8;
+    for (; number < num_points; number++) {
+        *out++ = 1.f / (*in++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+static inline void
+volk_32f_reciprocal_32f_u_avx_div(float* out, const float* in, unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int eighth_points = num_points / 8;
+    const __m256 ONE = _mm256_set1_ps(1.0f);
+
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_loadu_ps(in);
+        in += 8;
+
+        __m256 x_inv = _mm256_div_ps(ONE, x);
+
+        _mm256_storeu_ps(out, x_inv);
+        out += 8;
+    }
+
+    number = eighth_points * 8;
+    for (; number < num_points; number++) {
+        *out++ = 1.f / (*in++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
@@ -135,6 +135,7 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
     QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
     QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc))
     QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc))
+    QA(VOLK_INIT_TEST(volk_32f_reciprocal_32f, test_params))
     QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params.make_tol(1e-3)))
     QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params))
     QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))