Skip to content

Commit

Permalink
Make memcpy_nontemporal_sve actually compile
Browse files Browse the repository at this point in the history
  • Loading branch information
bmerry committed Nov 6, 2024
1 parent e8521fc commit 2124947
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 7 deletions.
2 changes: 1 addition & 1 deletion include/spead2/common_features.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
#endif

/* Similarly for AARCH64 features */
#if defined(__AARCH64LE__) || defined(__AARCH64BE__)
#if defined(__aarch64__)
# define SPEAD2_USE_SVE_STREAM @SPEAD2_USE_SVE_STREAM@
#else
# define SPEAD2_USE_SVE_STREAM 0
Expand Down
14 changes: 9 additions & 5 deletions src/common_memcpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
#endif

#if SPEAD2_USE_SVE_STREAM
# include <atomic>
# include <sys/auxv.h>
# include <arm_sve.h>
#endif
Expand All @@ -68,7 +69,7 @@ namespace spead2

#if SPEAD2_USE_SVE_STREAM
[[gnu::target("+sve")]]
static void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
{
/* The AArch64 memory model says
*
Expand All @@ -78,14 +79,17 @@ static void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __res
* accesses can be observed in any order by the other observers within the
* shareability domain of the memory addresses being accessed."
*
* This is probably not an issue in practice, unless the source address
* is obtained with memory_order_consume and the compiler actually tracks
* dependencies (which apparently none do).
*
* It's not entirely clear to me whether that's an issue, but it sounds
* like SVE non-temporal reads can be served from a load buffer that's not
* coherent with other cores' caches. To be on the safe side, I'm adding a
* barrier here. The magic number makes this a read to read/write barrier.
* barrier here.
*/
__dmb(13);
std::atomic_thread_fence(std::memory_order_acquire);

*/
/* TODO: this is probably sub-optimal, since it doesn't do any unrolling
* or alignment. Efficient unrolling probably requires doing separate body
* and tail (where the body is a multiple of the vector length) to avoid
Expand Down Expand Up @@ -133,7 +137,7 @@ void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t) noexcep

/* aarch64 options */
#if SPEAD2_USE_SVE_STREAM
unsigned long hwcaps = getauxval(AT_HWCAPS);
unsigned long hwcaps = getauxval(AT_HWCAP);
if (hwcaps & HWCAP_SVE)
return memcpy_nontemporal_sve;
#endif
Expand Down
3 changes: 2 additions & 1 deletion src/unittest_memcpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <cstdint>
#include <ostream>
#include <spead2/common_memcpy.h>
#include <spead2/common_features.h>
#if SPEAD2_USE_SVE_STREAM
# include <sys/auxv.h>
#endif
Expand Down Expand Up @@ -81,7 +82,7 @@ static const memcpy_function memcpy_functions[] =
{ "avx512", spead2::memcpy_nontemporal_avx512, bool(__builtin_cpu_supports("avx512f")) },
#endif
#if SPEAD2_USE_SVE_STREAM
{ "sve", spead2::memcpy_nontemporal_avx512, getauxval(AT_HWCAP) & HWCAP_SVE },
{ "sve", spead2::memcpy_nontemporal_sve, (getauxval(AT_HWCAP) & HWCAP_SVE) != 0 },
#endif
};

Expand Down

0 comments on commit 2124947

Please sign in to comment.