Skip to content

Commit

Permalink
Use unaligned data types for unaligned intrinsics.
Browse files Browse the repository at this point in the history
  • Loading branch information
Logikable committed May 14, 2024
1 parent ab7a347 commit a68ef78
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 7 deletions.
11 changes: 8 additions & 3 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,11 @@ typedef float32x4_t __m128d;
#endif
typedef int64x2_t __m128i; /* 128-bit vector containing integers */

// Some intrinsics operate on unaligned data types.
typedef int16_t __attribute__((aligned(1))) unaligned_int16_t;
typedef int32_t __attribute__((aligned(1))) unaligned_int32_t;
typedef int64_t __attribute__((aligned(1))) unaligned_int64_t;

// __int64 is defined in the Intrinsics Guide which maps to different datatype
// in different data model
#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
Expand Down Expand Up @@ -1935,7 +1940,7 @@ FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
{
return vreinterpretq_m128i_s64(
vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
vcombine_s64(vld1_s64((const unaligned_int64_t *) p), vdup_n_s64(0)));
}

// Allocate size bytes of memory, aligned to the alignment specified in align,
Expand Down Expand Up @@ -4360,15 +4365,15 @@ FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
{
return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p));
}

// Load unaligned 32-bit integer from memory into the first element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
{
return vreinterpretq_m128i_s32(
vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0));
}

// Multiply packed signed 16-bit integers in a and b, producing intermediate
Expand Down
20 changes: 16 additions & 4 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@
/* run the 1st parameter */
#define IIF_1(t, ...) t

// Some intrinsics operate on unaligned data types.
typedef int16_t __attribute__((aligned(1))) unaligned_int16_t;
typedef int32_t __attribute__((aligned(1))) unaligned_int32_t;
typedef int64_t __attribute__((aligned(1))) unaligned_int64_t;

// This program a set of unit tests to ensure that each SSE call provide the
// output we expect. If this fires an assert, then something didn't match up.
//
Expand All @@ -49,6 +54,9 @@ class SSE2NEONTestImpl : public SSE2NEONTest
int32_t *mTestIntPointer2;
float mTestFloats[MAX_TEST_VALUE];
int32_t mTestInts[MAX_TEST_VALUE];
int8_t mTestUnalignedInts[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29, 30, 31};

virtual ~SSE2NEONTestImpl(void)
{
Expand Down Expand Up @@ -2141,7 +2149,8 @@ result_t test_mm_loadu_si16(const SSE2NEONTestImpl &impl, uint32_t iter)
#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
return TEST_UNIMPL;
#else
const int16_t *addr = (const int16_t *) impl.mTestIntPointer1;
const unaligned_int16_t *addr =
(const unaligned_int16_t *)(impl.mTestUnalignedInts + 1);

__m128i ret = _mm_loadu_si16((const void *) addr);

Expand All @@ -2157,7 +2166,8 @@ result_t test_mm_loadu_si64(const SSE2NEONTestImpl &impl, uint32_t iter)
#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 9)
return TEST_UNIMPL;
#else
const int64_t *addr = (const int64_t *) impl.mTestIntPointer1;
const unaligned_int64_t *addr =
(const unaligned_int64_t *)(impl.mTestUnalignedInts + 1);

__m128i ret = _mm_loadu_si64((const void *) addr);

Expand Down Expand Up @@ -5024,7 +5034,8 @@ result_t test_mm_loadu_pd(const SSE2NEONTestImpl &impl, uint32_t iter)

result_t test_mm_loadu_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
{
const int32_t *_a = (const int32_t *) impl.mTestIntPointer1;
const unaligned_int32_t *_a =
(const unaligned_int32_t *)(impl.mTestUnalignedInts + 1);
__m128i c = _mm_loadu_si128((const __m128i *) _a);
return VALIDATE_INT32_M128(c, _a);
}
Expand All @@ -5037,7 +5048,8 @@ result_t test_mm_loadu_si32(const SSE2NEONTestImpl &impl, uint32_t iter)
#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
return TEST_UNIMPL;
#else
const int32_t *addr = (const int32_t *) impl.mTestIntPointer1;
const unaligned_int32_t *addr =
(const unaligned_int32_t *)(impl.mTestUnalignedInts + 1);

__m128i ret = _mm_loadu_si32((const void *) addr);

Expand Down

0 comments on commit a68ef78

Please sign in to comment.