diff --git a/sse2neon.h b/sse2neon.h
index c2228667..577cfc0c 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -114,7 +114,6 @@
 #warning "Optimization may cause potential errors in sse2neon. see #648"
 #endif
 
-
 /* C language does not allow initializing a variable with a function call. */
 #ifdef __cplusplus
 #define _sse2neon_const static const
@@ -122,6 +121,7 @@
 #define _sse2neon_const const
 #endif
 
+#include <fenv.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@@ -193,10 +193,7 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64)
 #define _sse2neon_return(ret) return ret
 #endif
 
-#define _sse2neon_init(...) \
-    {                       \
-        __VA_ARGS__         \
-    }
+#define _sse2neon_init(...) {__VA_ARGS__}
 
 /* Compiler barrier */
 #if defined(_MSC_VER) && !defined(__clang__)
@@ -1806,7 +1803,7 @@ FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
 #if defined(_MSC_VER) && !defined(__clang__)
     _WriteStatusReg(ARM64_FPCR, value);
 #else
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(value));  /* write */
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
 #endif
 }
 
@@ -1840,25 +1837,17 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
 {
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    r.value = _sse2neon_get_fpcr();
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    if (r.field.bit22) {
-        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
-    } else {
-        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
+    switch (fegetround()) {
+    case FE_TONEAREST:
+        return _MM_ROUND_NEAREST;
+    case FE_DOWNWARD:
+        return _MM_ROUND_DOWN;
+    case FE_UPWARD:
+        return _MM_ROUND_UP;
+    case FE_TOWARDZERO:
+        return _MM_ROUND_TOWARD_ZERO;
+    default: // FIXME
+        return _MM_ROUND_TOWARD_ZERO;
     }
 }
 
@@ -2426,7 +2415,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
 #if defined(__aarch64__) || defined(_M_ARM64)
     _sse2neon_set_fpcr(r.value);
 #else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));  /* write */
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
 #endif
 }
 
@@ -2454,44 +2443,23 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
 {
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__) || defined(_M_ARM64)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    r.value = _sse2neon_get_fpcr();
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
     switch (rounding) {
-    case _MM_ROUND_TOWARD_ZERO:
-        r.field.bit22 = 1;
-        r.field.bit23 = 1;
+    case _MM_ROUND_NEAREST:
+        rounding = FE_TONEAREST;
         break;
     case _MM_ROUND_DOWN:
-        r.field.bit22 = 0;
-        r.field.bit23 = 1;
+        rounding = FE_DOWNWARD;
         break;
     case _MM_ROUND_UP:
-        r.field.bit22 = 1;
-        r.field.bit23 = 0;
+        rounding = FE_UPWARD;
+        break;
+    case _MM_ROUND_TOWARD_ZERO:
+        rounding = FE_TOWARDZERO;
         break;
-    default: //_MM_ROUND_NEAREST
-        r.field.bit22 = 0;
-        r.field.bit23 = 0;
+    default: // FIXME
+        rounding = FE_TOWARDZERO;
     }
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-    _sse2neon_set_fpcr(r.value);
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
-#endif
+    fesetround(rounding);
 }
 
 // Copy single-precision (32-bit) floating-point element a to the lower element
@@ -4990,11 +4958,11 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
                                   signed char b1,
                                   signed char b0)
 {
-    int8_t ALIGN_STRUCT(16)
-    data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
-                (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
-                (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
-                (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    int8_t ALIGN_STRUCT(16) data[16] = {
+        (int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+        (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+        (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+        (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
     return (__m128i) vld1q_s8(data);
 }
 
@@ -5125,11 +5093,11 @@ FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
                                    signed char b14,
                                    signed char b15)
 {
-    int8_t ALIGN_STRUCT(16)
-    data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
-                (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
-                (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
-                (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    int8_t ALIGN_STRUCT(16) data[16] = {
+        (int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+        (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+        (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+        (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
     return (__m128i) vld1q_s8(data);
 }
 
@@ -6282,7 +6250,7 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
         uint8x8_t tmp_low;                                            \
         uint8x8_t tmp_high;                                           \
         if ((imm) >= 8) {                                             \
-            const int idx = (imm) -8;                                 \
+            const int idx = (imm) - 8;                                \
             tmp_low = vreinterpret_u8_m64(_a);                        \
             tmp_high = vdup_n_u8(0);                                  \
             ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
@@ -6803,14 +6771,14 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
     _sse2neon_define2(                                                  \
         __m128i, a, b,                                                  \
         const uint16_t _mask[8] =                                       \
-            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,    \
-                           ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0);   \
+            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 1)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 2)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 3)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 4)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 5)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 6)) ? (uint16_t) - 1 : 0x0,   \
+                           ((imm) & (1 << 7)) ? (uint16_t) - 1 : 0x0);  \
         uint16x8_t _mask_vec = vld1q_u16(_mask);                        \
         uint16x8_t __a = vreinterpretq_u16_m128i(_a);                   \
         uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
@@ -6835,11 +6803,11 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
 FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
 {
-    const uint32_t ALIGN_STRUCT(16)
-    data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
-               ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
-               ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
-               ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+    const uint32_t
+        ALIGN_STRUCT(16) data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+                                    ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
     uint32x4_t mask = vld1q_u32(data);
     float32x4_t a = vreinterpretq_f32_m128(_a);
     float32x4_t b = vreinterpretq_f32_m128(_b);
@@ -9340,8 +9308,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
 #endif
 }
 
-FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(
-    unsigned int flag)
+FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 {
     // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
    // regardless of the value of the FZ bit.
@@ -9365,7 +9332,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(
 #if defined(__aarch64__) || defined(_M_ARM64)
     _sse2neon_set_fpcr(r.value);
 #else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));  /* write */
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
 #endif
 }
 
diff --git a/tests/impl.cpp b/tests/impl.cpp
index f29a4ae0..96dde158 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -4793,7 +4793,8 @@ result_t test_mm_cvttpd_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
     return validateInt32(ret, d0, d1, 0, 0);
 }
 
-OPTNONE result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl, uint32_t iter)
+OPTNONE result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl,
+                                     uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
 
@@ -5877,7 +5878,7 @@ result_t test_mm_shuffle_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
     int32_t _d[4];
 
 #define TEST_IMPL(IDX)                   \
-    _d[0] = _a[((IDX) &0x3)];            \
+    _d[0] = _a[((IDX) & 0x3)];           \
     _d[1] = _a[((IDX >> 2) & 0x3)];      \
     _d[2] = _a[((IDX >> 4) & 0x3)];      \
     _d[3] = _a[((IDX >> 6) & 0x3)];      \
@@ -8957,6 +8958,7 @@ OPTNONE result_t test_mm_round_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
     __m128d ret;
     __m128d a = load_m128d(_a);
 
+    switch (iter & 0x7) {
     case 0:
         d[0] = bankersRounding(_a[0]);
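
Reviewer note, not part of the patch: the hand-rolled FPCR/FPSCR bit manipulation is replaced above by the portable C99 `<fenv.h>` interface. Below is a minimal standalone sketch of that mapping for anyone verifying the behavior locally. It is hypothetical demo code, not taken from this repository; it assumes only the standard `fegetround`/`fesetround` API and the fact that `rint` honors the current rounding mode, which is the same contract the patched `_MM_GET_ROUNDING_MODE`/`_MM_SET_ROUNDING_MODE` now rely on.

```c
#include <fenv.h>
#include <math.h>
#include <stdio.h>

/* Required by the C standard when touching the FP environment; GCC and
 * Clang compile this demo correctly even where the pragma is unsupported. */
#pragma STDC FENV_ACCESS ON

int main(void)
{
    /* The four modes the patch maps onto
     * _MM_ROUND_{NEAREST,DOWN,UP,TOWARD_ZERO}. */
    const int modes[] = {FE_TONEAREST, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO};
    const char *names[] = {"nearest", "down", "up", "toward-zero"};

    for (int i = 0; i < 4; i++) {
        if (fesetround(modes[i]) != 0)
            return 1; /* mode unsupported on this target */
        /* rint() rounds according to the current mode, just like the
         * FPCR-controlled NEON rounding the emulated intrinsics use. */
        printf("%-12s rint(2.5) = %4.1f  rint(-2.5) = %4.1f\n", names[i],
               rint(2.5), rint(-2.5));
    }
    return fesetround(FE_TONEAREST); /* restore the default */
}
```

One design consequence worth noting: routing through `fesetround` lets libc own the FPSCR (AArch32) or FPCR (AArch64) update, so the two per-architecture register paths and their inline assembly no longer need to be maintained separately in these functions.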