diff --git a/sse2neon.h b/sse2neon.h index 48da95fa..827c6565 100644 --- a/sse2neon.h +++ b/sse2neon.h @@ -382,6 +382,11 @@ typedef float32x4_t __m128d; #endif typedef int64x2_t __m128i; /* 128-bit vector containing integers */ +// Some intrinsics operate on unaligned data types. +typedef int16_t __attribute__((aligned(1))) unaligned_int16_t; +typedef int32_t __attribute__((aligned(1))) unaligned_int32_t; +typedef int64_t __attribute__((aligned(1))) unaligned_int64_t; + // __int64 is defined in the Intrinsics Guide which maps to different datatype // in different data model #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) @@ -1935,7 +1940,7 @@ FORCE_INLINE __m128i _mm_loadu_si16(const void *p) FORCE_INLINE __m128i _mm_loadu_si64(const void *p) { return vreinterpretq_m128i_s64( - vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); + vcombine_s64(vld1_s64((const unaligned_int64_t *) p), vdup_n_s64(0))); } // Allocate size bytes of memory, aligned to the alignment specified in align, @@ -4360,7 +4365,7 @@ FORCE_INLINE __m128d _mm_loadu_pd(const double *p) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) { - return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); + return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p)); } // Load unaligned 32-bit integer from memory into the first element of dst. @@ -4368,7 +4373,7 @@ FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) FORCE_INLINE __m128i _mm_loadu_si32(const void *p) { return vreinterpretq_m128i_s32( - vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); + vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0)); } // Multiply packed signed 16-bit integers in a and b, producing intermediate diff --git a/tests/impl.cpp b/tests/impl.cpp index 0d39abb2..0e1478f9 100644 --- a/tests/impl.cpp +++ b/tests/impl.cpp @@ -28,6 +28,11 @@ /* run the 1st parameter */ #define IIF_1(t, ...) t +// Some intrinsics operate on unaligned data types. +typedef int16_t __attribute__((aligned(1))) unaligned_int16_t; +typedef int32_t __attribute__((aligned(1))) unaligned_int32_t; +typedef int64_t __attribute__((aligned(1))) unaligned_int64_t; + // This program a set of unit tests to ensure that each SSE call provide the // output we expect. If this fires an assert, then something didn't match up. // @@ -49,6 +54,9 @@ class SSE2NEONTestImpl : public SSE2NEONTest int32_t *mTestIntPointer2; float mTestFloats[MAX_TEST_VALUE]; int32_t mTestInts[MAX_TEST_VALUE]; + int8_t mTestUnalignedInts[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; virtual ~SSE2NEONTestImpl(void) { @@ -2141,7 +2149,8 @@ result_t test_mm_loadu_si16(const SSE2NEONTestImpl &impl, uint32_t iter) #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) return TEST_UNIMPL; #else - const int16_t *addr = (const int16_t *) impl.mTestIntPointer1; + const unaligned_int16_t *addr = + (const unaligned_int16_t *)(impl.mTestUnalignedInts + 1); __m128i ret = _mm_loadu_si16((const void *) addr); @@ -2157,7 +2166,8 @@ result_t test_mm_loadu_si64(const SSE2NEONTestImpl &impl, uint32_t iter) #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 9) return TEST_UNIMPL; #else - const int64_t *addr = (const int64_t *) impl.mTestIntPointer1; + const unaligned_int64_t *addr = + (const unaligned_int64_t *)(impl.mTestUnalignedInts + 1); __m128i ret = _mm_loadu_si64((const void *) addr); @@ -5024,7 +5034,8 @@ result_t test_mm_loadu_pd(const SSE2NEONTestImpl &impl, uint32_t iter) result_t test_mm_loadu_si128(const SSE2NEONTestImpl &impl, uint32_t iter) { - const int32_t *_a = (const int32_t *) impl.mTestIntPointer1; + const unaligned_int32_t *_a = + (const unaligned_int32_t *)(impl.mTestUnalignedInts + 1); __m128i c = _mm_loadu_si128((const __m128i *) _a); return VALIDATE_INT32_M128(c, _a); } @@ -5037,7 +5048,8 @@ result_t test_mm_loadu_si32(const SSE2NEONTestImpl &impl, uint32_t iter) #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) return TEST_UNIMPL; #else - const int32_t *addr = (const int32_t *) impl.mTestIntPointer1; + const unaligned_int32_t *addr = + (const unaligned_int32_t *)(impl.mTestUnalignedInts + 1); __m128i ret = _mm_loadu_si32((const void *) addr);