From 6b6adca2e23adfa9bb450642a561ab1437a8f22d Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Sat, 14 Oct 2023 12:43:02 +0200 Subject: [PATCH] minor renames and fixes Signed-off-by: Magnus Lundmark --- include/volk/volk_avx2_fma_intrinsics.h | 8 +- include/volk/volk_avx_intrinsics.h | 9 +- kernels/volk/volk_32f_reciprocal_32f.h | 112 +++++------------------- 3 files changed, 31 insertions(+), 98 deletions(-) diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h index a5ee947c..ba099604 100644 --- a/include/volk/volk_avx2_fma_intrinsics.h +++ b/include/volk/volk_avx2_fma_intrinsics.h @@ -18,12 +18,14 @@ /* * First order Newton-Raphson approximation of 1 / x + * x_1 = x_0 * (2 - x_0 * x) */ -static inline __m256 _mm256_rcp1_avx2_fma_ps(const __m256 x) +static inline __m256 _mm256_reciprocal_1_avx2_fma_ps(const __m256 x) { const __m256 TWO = _mm256_set1_ps(0x1.0p1f); // 2.0f - const __m256 x_inv = _mm256_rcp_ps(x); - return _mm256_mul_ps(x_inv, _mm256_fnmadd_ps(x_inv, x, TWO)); + const __m256 x0 = _mm256_rcp_ps(x); + const __m256 x1 = _mm256_mul_ps(x0, _mm256_fnmadd_ps(x0, x, TWO)); + return x1; } /* diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h index 48a8ee14..6be73ad7 100644 --- a/include/volk/volk_avx_intrinsics.h +++ b/include/volk/volk_avx_intrinsics.h @@ -19,13 +19,14 @@ /* * First order Newton-Raphson approximation of 1 / x + * x_1 = x_0 * (2 - x_0 * x) */ -static inline __m256 _mm256_rcp1_avx_ps(const __m256 x) +static inline __m256 _mm256_reciprocal_1_avx_ps(const __m256 x) { const __m256 TWO = _mm256_set1_ps(0x1.0p1f); // 2.0f - const __m256 x_inv = _mm256_rcp_ps(x); - const __m256 y = _mm256_sub_ps(TWO, _mm256_mul_ps(x_inv, x)); - return _mm256_mul_ps(x_inv, y); + const __m256 x0 = _mm256_rcp_ps(x); + const __m256 x1 = _mm256_mul_ps(x0, _mm256_sub_ps(TWO, _mm256_mul_ps(x0, x))); + return x1; } /* diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h index ae744bd2..8b916b5d 100644 --- a/kernels/volk/volk_32f_reciprocal_32f.h +++ b/kernels/volk/volk_32f_reciprocal_32f.h @@ -56,37 +56,31 @@ static inline void volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points) { - unsigned int i = 0; - for (; i < num_points; i++) { + for (unsigned int i = 0; i < num_points; i++) { out[i] = 1.f / in[i]; } } #endif /* LV_HAVE_GENERIC */ - #if LV_HAVE_AVX2 && LV_HAVE_FMA #include #include static inline void volk_32f_reciprocal_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points) { - unsigned int number = 0; const unsigned int eighth_points = num_points / 8; - - for (; number < eighth_points; number++) { + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_load_ps(in); in += 8; - __m256 x_inv = _mm256_rcp1_avx2_fma_ps(x); + __m256 r = _mm256_reciprocal_1_avx2_fma_ps(x); - _mm256_store_ps(out, x_inv); + _mm256_store_ps(out, r); out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { - *out++ = 1.f / (*in++); - } + const unsigned int done = eighth_points * 8; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ @@ -96,49 +90,19 @@ volk_32f_reciprocal_32f_a_avx2_fma(float* out, const float* in, unsigned int num static inline void volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighth_points = num_points / 8; - - for (; number < eighth_points; number++) { - __m256 x = _mm256_load_ps(in); - in += 8; - - __m256 x_inv = _mm256_rcp1_avx_ps(x); - - _mm256_store_ps(out, x_inv); - out += 8; - } - - number = eighth_points * 8; - for (; number < num_points; number++) { - *out++ = 1.f / (*in++); - } -} -#endif /* LV_HAVE_AVX */ - -#ifdef LV_HAVE_AVX -#include -static inline void -volk_32f_reciprocal_32f_a_avx_div(float* out, const float* in, unsigned int num_points) -{ - unsigned int number = 0; const unsigned int eighth_points = num_points / 8; - const __m256 ONE = _mm256_set1_ps(1.0f); - - for (; number < eighth_points; number++) { + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_load_ps(in); in += 8; - __m256 x_inv = _mm256_div_ps(ONE, x); + __m256 r = _mm256_reciprocal_1_avx_ps(x); - _mm256_store_ps(out, x_inv); + _mm256_store_ps(out, r); out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { - *out++ = 1.f / (*in++); - } + const unsigned int done = eighth_points * 8; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_AVX */ @@ -153,23 +117,19 @@ volk_32f_reciprocal_32f_a_avx_div(float* out, const float* in, unsigned int num_ static inline void volk_32f_reciprocal_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points) { - unsigned int number = 0; const unsigned int eighth_points = num_points / 8; - - for (; number < eighth_points; number++) { + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_loadu_ps(in); in += 8; - __m256 x_inv = _mm256_rcp1_avx2_fma_ps(x); + __m256 r = _mm256_reciprocal_1_avx2_fma_ps(x); - _mm256_storeu_ps(out, x_inv); + _mm256_storeu_ps(out, r); out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { - *out++ = 1.f / (*in++); - } + const unsigned int done = eighth_points * 8; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ @@ -179,49 +139,19 @@ volk_32f_reciprocal_32f_u_avx2_fma(float* out, const float* in, unsigned int num static inline void volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points) { - unsigned int number = 0; const unsigned int eighth_points = num_points / 8; - - for (; number < eighth_points; number++) { + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_loadu_ps(in); in += 8; - __m256 x_inv = _mm256_rcp1_avx_ps(x); + __m256 r = _mm256_reciprocal_1_avx_ps(x); - _mm256_storeu_ps(out, x_inv); + _mm256_storeu_ps(out, r); out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { - *out++ = 1.f / (*in++); - } -} -#endif /* LV_HAVE_AVX */ - -#ifdef LV_HAVE_AVX -#include -static inline void -volk_32f_reciprocal_32f_u_avx_div(float* out, const float* in, unsigned int num_points) -{ - unsigned int number = 0; - const unsigned int eighth_points = num_points / 8; - const __m256 ONE = _mm256_set1_ps(1.0f); - - for (; number < eighth_points; number++) { - __m256 x = _mm256_loadu_ps(in); - in += 8; - - __m256 x_inv = _mm256_div_ps(ONE, x); - - _mm256_storeu_ps(out, x_inv); - out += 8; - } - - number = eighth_points * 8; - for (; number < num_points; number++) { - *out++ = 1.f / (*in++); - } + const unsigned int done = eighth_points * 8; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_AVX */