Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New kernel for reciprocal #643

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions include/volk/volk_avx2_fma_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,18 @@
#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
#include <immintrin.h>

/*
 * Approximation of 1 / x for eight packed single-precision floats.
 *
 * Starts from the hardware estimate x_0 = rcp(x) and applies one
 * Newton-Raphson step:
 *   x_1 = x_0 * (2 - x_0 * x)
 *
 * The step is only applied in lanes where the correction factor
 * (2 - x_0 * x) is a positive number. For x = +-0, x = +-Inf, denormal x
 * and NaN the factor is NaN or -Inf, which would turn the estimate into NaN
 * or flip its sign. Those lanes return the unrefined estimate x_0 instead,
 * so that 1 / +-0 = +-Inf and 1 / +-Inf = +-0.
 */
static inline __m256 _mm256_reciprocal_1_avx2_fma_ps(const __m256 x)
{
    const __m256 TWO = _mm256_set1_ps(0x1.0p1f); // 2.0f
    const __m256 x0 = _mm256_rcp_ps(x);
    // correction = 2 - x0 * x, computed as one fused operation
    const __m256 correction = _mm256_fnmadd_ps(x0, x, TWO);
    const __m256 x1 = _mm256_mul_ps(x0, correction);
    // Ordered compare: false for NaN, so NaN and -Inf corrections are rejected
    const __m256 refinable =
        _mm256_cmp_ps(correction, _mm256_setzero_ps(), _CMP_GT_OQ);
    // Take x1 where the refinement is valid, otherwise keep x0
    return _mm256_blendv_ps(x0, x1, refinable);
}

/*
* Approximate arctan(x) via polynomial expansion
* on the interval [-1, 1]
Expand Down
12 changes: 12 additions & 0 deletions include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,18 @@
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>

/*
 * Approximation of 1 / x for eight packed single-precision floats.
 *
 * Starts from the hardware estimate x_0 = rcp(x) and applies one
 * Newton-Raphson step:
 *   x_1 = x_0 * (2 - x_0 * x)
 *
 * The step is only applied in lanes where the correction factor
 * (2 - x_0 * x) is a positive number. For x = +-0, x = +-Inf, denormal x
 * and NaN the factor is NaN or -Inf, which would turn the estimate into NaN
 * or flip its sign. Those lanes return the unrefined estimate x_0 instead,
 * so that 1 / +-0 = +-Inf and 1 / +-Inf = +-0.
 */
static inline __m256 _mm256_reciprocal_1_avx_ps(const __m256 x)
{
    const __m256 TWO = _mm256_set1_ps(0x1.0p1f); // 2.0f
    const __m256 x0 = _mm256_rcp_ps(x);
    // correction = 2 - x0 * x
    const __m256 correction = _mm256_sub_ps(TWO, _mm256_mul_ps(x0, x));
    const __m256 x1 = _mm256_mul_ps(x0, correction);
    // Ordered compare: false for NaN, so NaN and -Inf corrections are rejected
    const __m256 refinable =
        _mm256_cmp_ps(correction, _mm256_setzero_ps(), _CMP_GT_OQ);
    // Take x1 where the refinement is valid, otherwise keep x0
    return _mm256_blendv_ps(x0, x1, refinable);
}

/*
* Approximate arctan(x) via polynomial expansion
* on the interval [-1, 1]
Expand Down
158 changes: 158 additions & 0 deletions kernels/volk/volk_32f_reciprocal_32f.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
/* -*- c++ -*- */
/*
* Copyright 2023 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
* SPDX-License-Identifier: LGPL-3.0-or-later
*/

/*!
* \page volk_32f_reciprocal_32f
*
* \b Overview
*
* Computes the reciprocal of the input vector and stores the results
* in the output vector.
*
* <b>Dispatcher Prototype</b>
* \code
* void volk_32f_reciprocal_32f(float* out, const float* in, unsigned int num_points)
* \endcode
*
* \b Inputs
* \li in: A pointer to the input vector of floats.
* \li num_points: The number of data points.
*
* \b Outputs
* \li out: A pointer to the output vector of floats.
*
* \b Example
* \code
int N = 10;
unsigned int alignment = volk_get_alignment();
float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
float* out = (float*)volk_malloc(sizeof(float)*N, alignment);

for(unsigned int ii = 0; ii < N; ++ii){
in[ii] = (float)((ii + 1) * (ii + 1));
}

volk_32f_reciprocal_32f(out, in, N);

for(unsigned int ii = 0; ii < N; ++ii){
printf("out(%i) = %f\n", ii, out[ii]);
}

volk_free(in);
volk_free(out);
* \endcode
*/

#ifndef INCLUDED_volk_32f_reciprocal_32f_a_H
#define INCLUDED_volk_32f_reciprocal_32f_a_H

#ifdef LV_HAVE_GENERIC
/*
 * Portable reference kernel: writes 1 / in[k] to out[k] for each of the
 * first num_points elements. Nothing is read or written when num_points
 * is zero.
 */
static inline void
volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points)
{
    const float* src = in;
    float* dst = out;
    for (unsigned int remaining = num_points; remaining != 0; --remaining) {
        *dst = 1.f / *src;
        ++src;
        ++dst;
    }
}
#endif /* LV_HAVE_GENERIC */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
static inline void
volk_32f_reciprocal_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
{
const unsigned int eighth_points = num_points / 8;
for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
in += 8;

__m256 r = _mm256_reciprocal_1_avx2_fma_ps(x);

_mm256_store_ps(out, r);
out += 8;
}

const unsigned int done = eighth_points * 8;
volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
static inline void
volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points)
{
const unsigned int eighth_points = num_points / 8;
for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
in += 8;

__m256 r = _mm256_reciprocal_1_avx_ps(x);

_mm256_store_ps(out, r);
out += 8;
}

const unsigned int done = eighth_points * 8;
volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32f_reciprocal_32f_a_H */

#ifndef INCLUDED_volk_32f_reciprocal_32f_u_H
#define INCLUDED_volk_32f_reciprocal_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
static inline void
volk_32f_reciprocal_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
{
const unsigned int eighth_points = num_points / 8;
for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
in += 8;

__m256 r = _mm256_reciprocal_1_avx2_fma_ps(x);

_mm256_storeu_ps(out, r);
out += 8;
}

const unsigned int done = eighth_points * 8;
volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
static inline void
volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points)
{
const unsigned int eighth_points = num_points / 8;
for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
in += 8;

__m256 r = _mm256_reciprocal_1_avx_ps(x);

_mm256_storeu_ps(out, r);
out += 8;
}

const unsigned int done = eighth_points * 8;
volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */
1 change: 1 addition & 0 deletions lib/kernel_tests.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc))
QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc))
QA(VOLK_INIT_TEST(volk_32f_reciprocal_32f, test_params))
QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params.make_tol(1e-3)))
QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params))
QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))
Expand Down
Loading