Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New kernel for reciprocal #643

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions include/volk/volk_avx2_fma_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@
#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
#include <immintrin.h>

/*
* First order Newton-Raphson approximation of 1 / x
*/
static inline __m256 _mm256_rcp1_avx2_fma_ps(const __m256 x)
Ka-zam marked this conversation as resolved.
Show resolved Hide resolved
{
const __m256 TWO = _mm256_set1_ps(0x1.0p1f); // 2.0f
const __m256 x_inv = _mm256_rcp_ps(x);
return _mm256_mul_ps(x_inv, _mm256_fnmadd_ps(x_inv, x, TWO));
}

/*
* Approximate arctan(x) via polynomial expansion
* on the interval [-1, 1]
Expand Down
11 changes: 11 additions & 0 deletions include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,17 @@
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>

/*
* First order Newton-Raphson approximation of 1 / x
*/
static inline __m256 _mm256_rcp1_avx_ps(const __m256 x)
{
const __m256 TWO = _mm256_set1_ps(0x1.0p1f); // 2.0f
const __m256 x_inv = _mm256_rcp_ps(x);
const __m256 y = _mm256_sub_ps(TWO, _mm256_mul_ps(x_inv, x));
return _mm256_mul_ps(x_inv, y);
}

/*
* Approximate arctan(x) via polynomial expansion
* on the interval [-1, 1]
Expand Down
228 changes: 228 additions & 0 deletions kernels/volk/volk_32f_reciprocal_32f.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
/* -*- c++ -*- */
/*
* Copyright 2023 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
* SPDX-License-Identifier: LGPL-3.0-or-later
*/

/*!
* \page volk_32f_reciprocal_32f
*
* \b Overview
*
* Computes the reciprocal of the input vector and stores the results
* in the output vector.
*
* <b>Dispatcher Prototype</b>
* \code
* void volk_32f_reciprocal_32f(float* out, const float* in, unsigned int num_points)
* \endcode
*
* \b Inputs
* \li in: A pointer to the input vector of floats.
* \li num_points: The number of data points.
*
* \b Outputs
* \li bVector: A pointer to the output vector of floats.
*
* \b Example
* \code
int N = 10;
unsigned int alignment = volk_get_alignment();
float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
float* out = (float*)volk_malloc(sizeof(float)*N, alignment);

for(unsigned int ii = 1; ii < N; ++ii){
in[ii] = (float)(ii*ii);
}

volk_32f_reciprocal_32f(out, in, N);

for(unsigned int ii = 0; ii < N; ++ii){
printf("out(%i) = %f\n", ii, out[ii]);
}

volk_free(in);
volk_free(out);
* \endcode
*/

#ifndef INCLUDED_volk_32f_reciprocal_32f_a_H
#define INCLUDED_volk_32f_reciprocal_32f_a_H

#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points)
{
unsigned int i = 0;
for (; i < num_points; i++) {
out[i] = 1.f / in[i];
}
Ka-zam marked this conversation as resolved.
Show resolved Hide resolved
}
#endif /* LV_HAVE_GENERIC */


#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
static inline void
volk_32f_reciprocal_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
Ka-zam marked this conversation as resolved.
Show resolved Hide resolved
const unsigned int eighth_points = num_points / 8;

for (; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
in += 8;

__m256 x_inv = _mm256_rcp1_avx2_fma_ps(x);

_mm256_store_ps(out, x_inv);
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = 1.f / (*in++);
}
Ka-zam marked this conversation as resolved.
Show resolved Hide resolved
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
static inline void
volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
const unsigned int eighth_points = num_points / 8;

for (; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
in += 8;

__m256 x_inv = _mm256_rcp1_avx_ps(x);

_mm256_store_ps(out, x_inv);
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = 1.f / (*in++);
}
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void
volk_32f_reciprocal_32f_a_avx_div(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
const unsigned int eighth_points = num_points / 8;
const __m256 ONE = _mm256_set1_ps(1.0f);

for (; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
in += 8;

__m256 x_inv = _mm256_div_ps(ONE, x);

_mm256_store_ps(out, x_inv);
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = 1.f / (*in++);
}
}
#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32f_reciprocal_32f_a_H */

#ifndef INCLUDED_volk_32f_reciprocal_32f_u_H
#define INCLUDED_volk_32f_reciprocal_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
static inline void
volk_32f_reciprocal_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
const unsigned int eighth_points = num_points / 8;

for (; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
in += 8;

__m256 x_inv = _mm256_rcp1_avx2_fma_ps(x);

_mm256_storeu_ps(out, x_inv);
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = 1.f / (*in++);
}
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
static inline void
volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
const unsigned int eighth_points = num_points / 8;

for (; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
in += 8;

__m256 x_inv = _mm256_rcp1_avx_ps(x);

_mm256_storeu_ps(out, x_inv);
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = 1.f / (*in++);
}
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void
volk_32f_reciprocal_32f_u_avx_div(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
const unsigned int eighth_points = num_points / 8;
const __m256 ONE = _mm256_set1_ps(1.0f);

for (; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
in += 8;

__m256 x_inv = _mm256_div_ps(ONE, x);
Ka-zam marked this conversation as resolved.
Show resolved Hide resolved

_mm256_storeu_ps(out, x_inv);
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
*out++ = 1.f / (*in++);
}
}
#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */
1 change: 1 addition & 0 deletions lib/kernel_tests.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc))
QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc))
QA(VOLK_INIT_TEST(volk_32f_reciprocal_32f, test_params))
QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params.make_tol(1e-3)))
QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params))
QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))
Expand Down