Skip to content

Commit

Permalink
Remove SIMD specifics; focus on naive implementation first
Browse files: browse the repository at this point in the history
  • Loading branch information
dromer committed Nov 17, 2023
1 parent c2b8a1b commit b57b589
Showing 1 changed file with 0 additions and 45 deletions.
45 changes: 0 additions & 45 deletions hvcc/generators/ir2c/static/HvSignalRFFT.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,27 +84,10 @@ void __hv_rfft_f(SignalRFFT *o, hv_bInf_t bIn, hv_bOutf_t bOut0, hv_bOutf_t bOut
pffft_transform_ordered(o->setup, &bIn, bOut, work, PFFFT_FORWARD);

// uninterleave result into the output buffers
#if HV_SIMD_SSE || HV_SIMD_AVX
for (int i = 0, j = 0; j < n; j += 4, i += 8) {
__m128 a = _mm_load_ps(bOut+i); // LRLR
__m128 b = _mm_load_ps(bOut+4+i); // LRLR
__m128 x = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0,2,0)); // LLLL
__m128 y = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1,3,1)); // RRRR
_mm_store_ps(bOut0+j, x);
_mm_store_ps(bOut1+j, y);
}
#elif HV_SIMD_NEON
for (int i = 0, j = 0; j < n; j += 4, i += 8) {
float32x4x2_t a = vld2q_f32(bOut+i); // load and uninterleave
vst1q_f32(bOut0+j, a.val[0]);
vst1q_f32(bOut1+j, a.val[1]);
}
#else // HV_SIMD_NONE
for (int j = 0; j < n; ++j) {
bOut0[n+j] = bOut[0+2*j];
bOut1[n+j] = bOut[1+2*j];
}
#endif

__hv_store_f(inputs+h_orig, bIn); // store the new input to the inputs buffer
hTable_setHead(&o->inputs, wrap(h_orig+HV_N_SIMD, m));
Expand All @@ -130,40 +113,12 @@ void __hv_rifft_f(SignalRFFT *o, hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut
float *const bIn = (float *)(hv_alloca(sizeof(bOut)));

// interleave the input buffers into the transform buffer
#if HV_SIMD_AVX
for (int i = 0, j = 0; j < n; j += 8, i += 16) {
__m256 x = _mm256_load_ps(bIn00); // LLLLLLLL
__m256 y = _mm256_load_ps(bIn10); // RRRRRRRR
__m256 a = _mm256_unpacklo_ps(x, y); // LRLRLRLR
__m256 b = _mm256_unpackhi_ps(x, y); // LRLRLRLR
_mm256_store_ps(bIn+i, a);
_mm256_store_ps(bIn+8+i, b);
}
#elif HV_SIMD_SSE
for (int i = 0, j = 0; j < n4; j += 4, i += 8) {
__m128 x = _mm_load_ps(bIn00); // LLLL
__m128 y = _mm_load_ps(bIn10); // RRRR
__m128 a = _mm_unpacklo_ps(x, y); // LRLR
__m128 b = _mm_unpackhi_ps(x, y); // LRLR
_mm_store_ps(bIn+i, a);
_mm_store_ps(bIn+4+i, b);
}
#elif HV_SIMD_NEON
// https://community.arm.com/groups/processors/blog/2012/03/13/coding-for-neon--part-5-rearranging-vectors
for (int i = 0, j = 0; j < n4; j += 4, i += 8) {
float32x4_t x = vld1q_f32(bIn00);
float32x4_t y = vld1q_f32(bIn10);
float32x4x2_t z = {x, y};
vst2q_f32(bIn+i, z); // interleave and store
}
#else // HV_SIMD_NONE
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < n; ++j) {
bIn[0+2*j] = bIn00[n+j];
bIn[1+2*j] = bIn10[n+j];
}
}
#endif

pffft_transform_ordered(o->setup, bIn, bOut, work, PFFFT_BACKWARD);

Expand Down

0 comments on commit b57b589

Please sign in to comment.