diff --git a/sse2neon.h b/sse2neon.h
index 48da95fa..827c6565 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -382,6 +382,11 @@ typedef float32x4_t __m128d;
 #endif
 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
 
+// Alignment-1, alias-safe integer types for loads from arbitrary addresses.
+typedef int16_t __attribute__((aligned(1), may_alias)) unaligned_int16_t;
+typedef int32_t __attribute__((aligned(1), may_alias)) unaligned_int32_t;
+typedef int64_t __attribute__((aligned(1), may_alias)) unaligned_int64_t;
+
 // __int64 is defined in the Intrinsics Guide which maps to different datatype
 // in different data model
 #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
@@ -1935,7 +1940,7 @@ FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
 FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
 {
     return vreinterpretq_m128i_s64(
-        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+        vcombine_s64(vld1_s64((const unaligned_int64_t *) p), vdup_n_s64(0)));
 }
 
 // Allocate size bytes of memory, aligned to the alignment specified in align,
@@ -4360,7 +4365,7 @@ FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
 {
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+    return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p));
 }
 
 // Load unaligned 32-bit integer from memory into the first element of dst.
@@ -4368,7 +4373,7 @@ FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
 {
     return vreinterpretq_m128i_s32(
-        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+        vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0));
 }
 
 // Multiply packed signed 16-bit integers in a and b, producing intermediate
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 0d39abb2..0e1478f9 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -28,6 +28,11 @@
 /* run the 1st parameter */
 #define IIF_1(t, ...) t
 
+// Alignment-1, alias-safe integer types for reading the unaligned test data.
+typedef int16_t __attribute__((aligned(1), may_alias)) unaligned_int16_t;
+typedef int32_t __attribute__((aligned(1), may_alias)) unaligned_int32_t;
+typedef int64_t __attribute__((aligned(1), may_alias)) unaligned_int64_t;
+
 // This program a set of unit tests to ensure that each SSE call provide the
 // output we expect.  If this fires an assert, then something didn't match up.
 //
@@ -49,6 +54,9 @@ class SSE2NEONTestImpl : public SSE2NEONTest
     int32_t *mTestIntPointer2;
     float mTestFloats[MAX_TEST_VALUE];
     int32_t mTestInts[MAX_TEST_VALUE];
+    int8_t mTestUnalignedInts[32] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10,
+                                     11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                                     22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
 
     virtual ~SSE2NEONTestImpl(void)
     {
@@ -2141,7 +2149,8 @@ result_t test_mm_loadu_si16(const SSE2NEONTestImpl &impl, uint32_t iter)
 #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
     return TEST_UNIMPL;
 #else
-    const int16_t *addr = (const int16_t *) impl.mTestIntPointer1;
+    const unaligned_int16_t *addr =
+        (const unaligned_int16_t *)(impl.mTestUnalignedInts + 1);
 
     __m128i ret = _mm_loadu_si16((const void *) addr);
 
@@ -2157,7 +2166,8 @@ result_t test_mm_loadu_si64(const SSE2NEONTestImpl &impl, uint32_t iter)
 #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 9)
     return TEST_UNIMPL;
 #else
-    const int64_t *addr = (const int64_t *) impl.mTestIntPointer1;
+    const unaligned_int64_t *addr =
+        (const unaligned_int64_t *)(impl.mTestUnalignedInts + 1);
 
     __m128i ret = _mm_loadu_si64((const void *) addr);
 
@@ -5024,7 +5034,8 @@ result_t test_mm_loadu_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 
 result_t test_mm_loadu_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
-    const int32_t *_a = (const int32_t *) impl.mTestIntPointer1;
+    const unaligned_int32_t *_a =
+        (const unaligned_int32_t *)(impl.mTestUnalignedInts + 1);
     __m128i c = _mm_loadu_si128((const __m128i *) _a);
     return VALIDATE_INT32_M128(c, _a);
 }
@@ -5037,7 +5048,8 @@ result_t test_mm_loadu_si32(const SSE2NEONTestImpl &impl, uint32_t iter)
 #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
     return TEST_UNIMPL;
 #else
-    const int32_t *addr = (const int32_t *) impl.mTestIntPointer1;
+    const unaligned_int32_t *addr =
+        (const unaligned_int32_t *)(impl.mTestUnalignedInts + 1);
 
     __m128i ret = _mm_loadu_si32((const void *) addr);