Skip to content

Commit

Permalink
remove buggy and slow neonv8 kernel
Browse files Browse the repository at this point in the history
As noticed by argilo when fixing the integer generation, that kernel was buggy. It seems compilers are better at generating byte-swapping code than people writing SIMD intrinsics, so falling back on the generic kernel doesn't hurt.

Signed-off-by: Marcus Müller <[email protected]>
  • Loading branch information
marcusmueller committed Oct 23, 2023
1 parent e853e9b commit 34f4575
Show file tree
Hide file tree
Showing 2 changed files with 0 additions and 47 deletions.
36 changes: 0 additions & 36 deletions kernels/volk/volk_64u_byteswap.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,41 +229,6 @@ static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
#endif /* LV_HAVE_SSSE3 */


#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/*
 * Reverses the byte order of each 64-bit word in intsToSwap, in place.
 *
 * intsToSwap: buffer of num_points 64-bit words; overwritten with the result.
 * num_points: number of 64-bit words to process.
 *
 * The bulk of the buffer is handled four words (32 bytes) per iteration with
 * NEON; the remaining 0-3 words are handled by a scalar loop.
 */
static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points)
{
    uint64_t* inputPtr = intsToSwap;
    const unsigned int n4points = num_points / 4;
    uint8x16_t lo;
    uint8x16_t hi;

    unsigned int number = 0;
    for (number = 0; number < n4points; ++number) {
        __VOLK_PREFETCH(inputPtr + 4);
        /*
         * Use plain contiguous loads. A de-interleaving load (vld2q_u8)
         * would split even and odd bytes into separate registers, so a
         * per-register permutation could no longer reverse whole 64-bit words.
         */
        lo = vld1q_u8((const uint8_t*)inputPtr);
        hi = vld1q_u8((const uint8_t*)(inputPtr + 2));

        /* vrev64q_u8 reverses the bytes inside each 64-bit lane. */
        vst1q_u8((uint8_t*)inputPtr, vrev64q_u8(lo));
        vst1q_u8((uint8_t*)(inputPtr + 2), vrev64q_u8(hi));

        inputPtr += 4;
    }

    /* Scalar tail: swap the words that did not fill a full group of four. */
    for (number = n4points * 4; number < num_points; ++number) {
        const uint64_t value = *inputPtr;

        *inputPtr++ = ((value >> 56) & 0x00000000000000ffULL) |
                      ((value >> 40) & 0x000000000000ff00ULL) |
                      ((value >> 24) & 0x0000000000ff0000ULL) |
                      ((value >> 8) & 0x00000000ff000000ULL) |
                      ((value << 8) & 0x000000ff00000000ULL) |
                      ((value << 24) & 0x0000ff0000000000ULL) |
                      ((value << 40) & 0x00ff000000000000ULL) |
                      ((value << 56) & 0xff00000000000000ULL);
    }
}
#else
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

Expand Down Expand Up @@ -318,7 +283,6 @@ static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num
}
}
#endif /* LV_HAVE_NEON */
#endif

#endif /* INCLUDED_volk_64u_byteswap_u_H */
#ifndef INCLUDED_volk_64u_byteswap_a_H
Expand Down
11 changes: 0 additions & 11 deletions kernels/volk/volk_64u_byteswappuppet_64u.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,6 @@ static inline void volk_64u_byteswappuppet_64u_generic(uint64_t* output,
}
#endif

#ifdef LV_HAVE_NEONV8
/*
 * Puppet wrapper around the in-place NEONv8 byteswap kernel: swaps
 * intsToSwap in place, then copies the swapped words into output.
 *
 * output:     destination buffer receiving num_points 64-bit words.
 * intsToSwap: source buffer; it is modified in place by the kernel.
 * num_points: number of 64-bit words to process.
 */
static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t* output,
                                                      uint64_t* intsToSwap,
                                                      unsigned int num_points)
{
    const size_t byteCount = num_points * sizeof(uint64_t);

    volk_64u_byteswap_neonv8(intsToSwap, num_points);
    memcpy(output, intsToSwap, byteCount);
}
#else
#ifdef LV_HAVE_NEON
static inline void volk_64u_byteswappuppet_64u_neon(uint64_t* output,
uint64_t* intsToSwap,
Expand All @@ -46,7 +36,6 @@ static inline void volk_64u_byteswappuppet_64u_neon(uint64_t* output,
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
}
#endif
#endif

#ifdef LV_HAVE_SSE2
static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output,
Expand Down

0 comments on commit 34f4575

Please sign in to comment.