Skip to content

Commit

Permalink
remove buggy and slow neonv8 kernel
Browse files Browse the repository at this point in the history
As noticed by argilo when fixing the integer generation, that kernel was buggy. It seems compilers are better at building byte-swapping code than people writing SIMD intrinsics, so falling back on the generic kernel doesn't hurt.

Signed-off-by: Marcus Müller <[email protected]>
  • Loading branch information
marcusmueller committed Oct 23, 2023
1 parent e853e9b commit e952f32
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 45 deletions.
36 changes: 1 addition & 35 deletions kernels/volk/volk_64u_byteswap.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,41 +229,7 @@ static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
#endif /* LV_HAVE_SSSE3 */


#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/*
 * Reverses the byte order of every 64-bit element of intsToSwap, in place.
 *
 * intsToSwap: buffer of num_points 64-bit values; overwritten with the result.
 * num_points: number of 64-bit elements in the buffer.
 *
 * The main loop handles four elements (32 bytes) per iteration with NEON;
 * the remaining 0..3 elements are swapped with scalar code.
 */
static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points)
{
    uint8_t* bytePtr = (uint8_t*)intsToSwap;
    const unsigned int n4points = num_points / 4;
    unsigned int number = 0;

    for (number = 0; number < n4points; ++number) {
        __VOLK_PREFETCH(bytePtr + 32);

        /* Plain contiguous loads: a de-interleaving load (vld2q_u8) would
         * split even/odd bytes into separate registers, so each 64-bit
         * element would no longer occupy one doubleword lane. */
        uint8x16_t first = vld1q_u8(bytePtr);
        uint8x16_t second = vld1q_u8(bytePtr + 16);

        /* vrev64q_u8 reverses the bytes inside each 64-bit lane. */
        vst1q_u8(bytePtr, vrev64q_u8(first));
        vst1q_u8(bytePtr + 16, vrev64q_u8(second));

        bytePtr += 32;
    }

    /* Scalar tail for the elements that do not fill a whole 4-element group. */
    for (number = n4points * 4; number < num_points; ++number) {
        uint64_t value = intsToSwap[number];

        value = ((value & 0x00000000000000ffULL) << 56) |
                ((value & 0x000000000000ff00ULL) << 40) |
                ((value & 0x0000000000ff0000ULL) << 24) |
                ((value & 0x00000000ff000000ULL) << 8) |
                ((value & 0x000000ff00000000ULL) >> 8) |
                ((value & 0x0000ff0000000000ULL) >> 24) |
                ((value & 0x00ff000000000000ULL) >> 40) |
                ((value & 0xff00000000000000ULL) >> 56);

        intsToSwap[number] = value;
    }
}
#else
#ifndef LV_HAVE_NEONV8
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

Expand Down
11 changes: 1 addition & 10 deletions kernels/volk/volk_64u_byteswappuppet_64u.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,7 @@ static inline void volk_64u_byteswappuppet_64u_generic(uint64_t* output,
}
#endif

#ifdef LV_HAVE_NEONV8
/*
 * Puppet wrapper around volk_64u_byteswap_neonv8.
 *
 * The wrapped kernel works in place, so intsToSwap itself is byte-swapped
 * first (the caller's input buffer is modified) and the swapped data is then
 * copied into output.
 *
 * output:     destination buffer, receives num_points 64-bit values.
 * intsToSwap: input buffer; overwritten with its byte-swapped contents.
 * num_points: number of 64-bit elements to process.
 */
static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t* output,
                                                      uint64_t* intsToSwap,
                                                      unsigned int num_points)
{
    const size_t num_bytes = num_points * sizeof(uint64_t);

    volk_64u_byteswap_neonv8(intsToSwap, num_points);
    memcpy(output, intsToSwap, num_bytes);
}
#else
#ifndef LV_HAVE_NEONV8
#ifdef LV_HAVE_NEON
static inline void volk_64u_byteswappuppet_64u_neon(uint64_t* output,
uint64_t* intsToSwap,
Expand Down

0 comments on commit e952f32

Please sign in to comment.