From 34f4575749a9a4e11627efbbd6f202afba3a1594 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcus=20M=C3=BCller?= Date: Mon, 23 Oct 2023 19:09:59 +0200 Subject: [PATCH] remove buggy and slow neonv8 kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As noticed by argilo when fixing the integer generation, the neonv8 kernel was buggy. It seems compilers are better at building byte-swapping code than people writing SIMD intrinsics, so falling back on generic doesn't hurt. Signed-off-by: Marcus Müller --- kernels/volk/volk_64u_byteswap.h | 36 ---------------------- kernels/volk/volk_64u_byteswappuppet_64u.h | 11 ------- 2 files changed, 47 deletions(-) diff --git a/kernels/volk/volk_64u_byteswap.h b/kernels/volk/volk_64u_byteswap.h index 22bccab2d..8a1ce8141 100644 --- a/kernels/volk/volk_64u_byteswap.h +++ b/kernels/volk/volk_64u_byteswap.h @@ -229,41 +229,6 @@ static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, #endif /* LV_HAVE_SSSE3 */ -#ifdef LV_HAVE_NEONV8 -#include <arm_neon.h> - -static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points) -{ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - const unsigned int n4points = num_points / 4; - uint8x16x2_t input; - uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; - - unsigned int number = 0; - for (number = 0; number < n4points; ++number) { - __VOLK_PREFETCH(inputPtr + 8); - input = vld2q_u8((uint8_t*)inputPtr); - input.val[0] = vqtbl1q_u8(input.val[0], idx); - input.val[1] = vqtbl1q_u8(input.val[1], idx); - vst2q_u8((uint8_t*)inputPtr, input); - - inputPtr += 8; - } - - for (number = n4points * 4; number < num_points; ++number) { - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | - ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | - ((output2 << 8) &
0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#else #ifdef LV_HAVE_NEON #include <arm_neon.h> @@ -318,7 +283,6 @@ static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num } } #endif /* LV_HAVE_NEON */ -#endif #endif /* INCLUDED_volk_64u_byteswap_u_H */ #ifndef INCLUDED_volk_64u_byteswap_a_H diff --git a/kernels/volk/volk_64u_byteswappuppet_64u.h b/kernels/volk/volk_64u_byteswappuppet_64u.h index b1004bb04..8699593a1 100644 --- a/kernels/volk/volk_64u_byteswappuppet_64u.h +++ b/kernels/volk/volk_64u_byteswappuppet_64u.h @@ -26,16 +26,6 @@ static inline void volk_64u_byteswappuppet_64u_generic(uint64_t* output, } #endif -#ifdef LV_HAVE_NEONV8 -static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t* output, - uint64_t* intsToSwap, - unsigned int num_points) -{ - - volk_64u_byteswap_neonv8((uint64_t*)intsToSwap, num_points); - memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); -} -#else #ifdef LV_HAVE_NEON static inline void volk_64u_byteswappuppet_64u_neon(uint64_t* output, uint64_t* intsToSwap, @@ -46,7 +36,6 @@ static inline void volk_64u_byteswappuppet_64u_neon(uint64_t* output, memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); } #endif -#endif #ifdef LV_HAVE_SSE2 static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output,