Skip to content

Commit

Permalink
minor renames and fixes
Browse files Browse the repository at this point in the history
Signed-off-by: Magnus Lundmark <[email protected]>
  • Loading branch information
Ka-zam committed Oct 14, 2023
1 parent fcc0576 commit 6b6adca
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 98 deletions.
8 changes: 5 additions & 3 deletions include/volk/volk_avx2_fma_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@

/*
* First-order Newton-Raphson approximation of 1 / x:
* one refinement step applied to the initial estimate x_0 = _mm256_rcp_ps(x),
* x_1 = x_0 * (2 - x_0 * x)
*/
static inline __m256 _mm256_rcp1_avx2_fma_ps(const __m256 x)
static inline __m256 _mm256_reciprocal_1_avx2_fma_ps(const __m256 x)
{
const __m256 TWO = _mm256_set1_ps(0x1.0p1f); // 2.0f
const __m256 x_inv = _mm256_rcp_ps(x);
return _mm256_mul_ps(x_inv, _mm256_fnmadd_ps(x_inv, x, TWO));
const __m256 x0 = _mm256_rcp_ps(x);
const __m256 x1 = _mm256_mul_ps(x0, _mm256_fnmadd_ps(x0, x, TWO));
return x1;
}

/*
Expand Down
9 changes: 5 additions & 4 deletions include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@

/*
* First-order Newton-Raphson approximation of 1 / x:
* one refinement step applied to the initial estimate x_0 = _mm256_rcp_ps(x),
* x_1 = x_0 * (2 - x_0 * x)
*/
static inline __m256 _mm256_rcp1_avx_ps(const __m256 x)
static inline __m256 _mm256_reciprocal_1_avx_ps(const __m256 x)
{
const __m256 TWO = _mm256_set1_ps(0x1.0p1f); // 2.0f
const __m256 x_inv = _mm256_rcp_ps(x);
const __m256 y = _mm256_sub_ps(TWO, _mm256_mul_ps(x_inv, x));
return _mm256_mul_ps(x_inv, y);
const __m256 x0 = _mm256_rcp_ps(x);
const __m256 x1 = _mm256_mul_ps(x0, _mm256_sub_ps(TWO, _mm256_mul_ps(x0, x)));
return x1;
}

/*
Expand Down
112 changes: 21 additions & 91 deletions kernels/volk/volk_32f_reciprocal_32f.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,37 +56,31 @@
static inline void
volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points)
{
unsigned int i = 0;
for (; i < num_points; i++) {
for (unsigned int i = 0; i < num_points; i++) {
out[i] = 1.f / in[i];
}
}
#endif /* LV_HAVE_GENERIC */


#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
static inline void
volk_32f_reciprocal_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
const unsigned int eighth_points = num_points / 8;

for (; number < eighth_points; number++) {
for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
in += 8;

__m256 x_inv = _mm256_rcp1_avx2_fma_ps(x);
__m256 r = _mm256_reciprocal_1_avx2_fma_ps(x);

_mm256_store_ps(out, x_inv);
_mm256_store_ps(out, r);
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = 1.f / (*in++);
}
const unsigned int done = eighth_points * 8;
volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */

Expand All @@ -96,49 +90,19 @@ volk_32f_reciprocal_32f_a_avx2_fma(float* out, const float* in, unsigned int num
static inline void
volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
const unsigned int eighth_points = num_points / 8;

for (; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
in += 8;

__m256 x_inv = _mm256_rcp1_avx_ps(x);

_mm256_store_ps(out, x_inv);
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = 1.f / (*in++);
}
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void
volk_32f_reciprocal_32f_a_avx_div(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
const unsigned int eighth_points = num_points / 8;
const __m256 ONE = _mm256_set1_ps(1.0f);

for (; number < eighth_points; number++) {
for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
in += 8;

__m256 x_inv = _mm256_div_ps(ONE, x);
__m256 r = _mm256_reciprocal_1_avx_ps(x);

_mm256_store_ps(out, x_inv);
_mm256_store_ps(out, r);
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = 1.f / (*in++);
}
const unsigned int done = eighth_points * 8;
volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_AVX */

Expand All @@ -153,23 +117,19 @@ volk_32f_reciprocal_32f_a_avx_div(float* out, const float* in, unsigned int num_
static inline void
volk_32f_reciprocal_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
const unsigned int eighth_points = num_points / 8;

for (; number < eighth_points; number++) {
for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
in += 8;

__m256 x_inv = _mm256_rcp1_avx2_fma_ps(x);
__m256 r = _mm256_reciprocal_1_avx2_fma_ps(x);

_mm256_storeu_ps(out, x_inv);
_mm256_storeu_ps(out, r);
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = 1.f / (*in++);
}
const unsigned int done = eighth_points * 8;
volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */

Expand All @@ -179,49 +139,19 @@ volk_32f_reciprocal_32f_u_avx2_fma(float* out, const float* in, unsigned int num
static inline void
volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
const unsigned int eighth_points = num_points / 8;

for (; number < eighth_points; number++) {
for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
in += 8;

__m256 x_inv = _mm256_rcp1_avx_ps(x);
__m256 r = _mm256_reciprocal_1_avx_ps(x);

_mm256_storeu_ps(out, x_inv);
_mm256_storeu_ps(out, r);
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = 1.f / (*in++);
}
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void
volk_32f_reciprocal_32f_u_avx_div(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
const unsigned int eighth_points = num_points / 8;
const __m256 ONE = _mm256_set1_ps(1.0f);

for (; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
in += 8;

__m256 x_inv = _mm256_div_ps(ONE, x);

_mm256_storeu_ps(out, x_inv);
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = 1.f / (*in++);
}
const unsigned int done = eighth_points * 8;
volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_AVX */

Expand Down

0 comments on commit 6b6adca

Please sign in to comment.