From 1f696898436dfa04aad65810a8d9ef56cc65aeba Mon Sep 17 00:00:00 2001
From: Bruce Merry <bmerry@sarao.ac.za>
Date: Thu, 25 Jul 2024 15:29:09 +0200
Subject: [PATCH] Add a simple SVE-based non-temporal copy for AArch64

This is probably not optimal, but will provide something to test with.
---
 src/common_memcpy.cpp   | 55 ++++++++++++++++++++++++++++++++++++++++-
 src/unittest_memcpy.cpp | 11 ++++++++-
 2 files changed, 64 insertions(+), 2 deletions(-)
diff --git a/src/common_memcpy.cpp b/src/common_memcpy.cpp
index b7c1207d7..67d263bb6 100644
--- a/src/common_memcpy.cpp
+++ b/src/common_memcpy.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2016, 2020, 2023 National Research Foundation (SARAO)
+/* Copyright 2016, 2020, 2023-2024 National Research Foundation (SARAO)
  *
  * This program is free software: you can redistribute it and/or modify it under
  * the terms of the GNU Lesser General Public License as published by the Free
@@ -58,11 +58,56 @@
 # include "common_memcpy_x86.h"
 #endif
 
+#if SPEAD2_USE_SVE_STREAM
+# include <sys/auxv.h>
+# include <arm_sve.h>
+#endif
+
 namespace spead2
 {
 
+#if SPEAD2_USE_SVE_STREAM
+/* Copy n bytes from src to dest with SVE non-temporal loads and stores and
+ * return dest. Not static: the unit tests declare and call it directly. */
+[[gnu::target("+sve")]]
+void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
+{
+    /* The AArch64 memory model says
+     *
+     * "If an address dependency exists between two Read Memory and an SVE
+     * non-temporal vector load instruction generated the second read, then in
+     * the absence of any other barrier mechanism to achieve order, the memory
+     * accesses can be observed in any order by the other observers within the
+     * shareability domain of the memory addresses being accessed."
+     *
+     * It's not entirely clear to me whether that's an issue, but it sounds
+     * like SVE non-temporal reads can be served from a load buffer that's not
+     * coherent with other cores' caches. To be on the safe side, I'm adding a
+     * barrier here: "dmb ld" orders prior reads before later reads and writes.
+     */
+    __asm__ volatile("dmb ld" ::: "memory");  // same as __dmb(13), but needs no <arm_acle.h>
+
+    /* TODO: this is probably sub-optimal, since it doesn't do any unrolling
+     * or alignment. Efficient unrolling probably requires doing separate body
+     * and tail (where the body is a multiple of the vector length) to avoid
+     * doing svwhilelt for every iteration.
+     */
+    auto *destc = static_cast<std::uint8_t *>(dest);
+    const auto *srcc = static_cast<const std::uint8_t *>(src);
+    std::size_t i = 0;
+    svbool_t pg = svwhilelt_b8(i, n);
+    do  // when n == 0 the single pass runs with an all-false predicate
+    {
+        svstnt1_u8(pg, &destc[i], svldnt1_u8(pg, &srcc[i]));
+        i += svcntb();
+    } while (svptest_first(svptrue_b8(), pg = svwhilelt_b8(i, n)));
+    return dest;
+}
+#endif // SPEAD2_USE_SVE_STREAM
+
 void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t) noexcept
 {
+    /* x86 options */
 #if SPEAD2_USE_AVX512_STREAM || SPEAD2_USE_AVX_STREAM || SPEAD2_USE_SSE2_STREAM
     __builtin_cpu_init();
 #endif
@@ -85,6 +130,14 @@ void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t) noexcep
     if (__builtin_cpu_supports("sse2"))
         return memcpy_nontemporal_sse2;
 #endif
+
+    /* aarch64 options: AT_HWCAP is the auxiliary-vector key for the HWCAP_* bits */
+#if SPEAD2_USE_SVE_STREAM
+    const unsigned long hwcaps = getauxval(AT_HWCAP);
+    if (hwcaps & HWCAP_SVE)
+        return memcpy_nontemporal_sve;
+#endif
+
     /* Depending on the C library, std::memcpy might or might not be marked
      * as noexcept. If not, we need this explicit cast.
      */
diff --git a/src/unittest_memcpy.cpp b/src/unittest_memcpy.cpp
index 57c61adcc..376dace7f 100644
--- a/src/unittest_memcpy.cpp
+++ b/src/unittest_memcpy.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2016, 2021, 2023 National Research Foundation (SARAO)
+/* Copyright 2016, 2021, 2023-2024 National Research Foundation (SARAO)
  *
  * This program is free software: you can redistribute it and/or modify it under
  * the terms of the GNU Lesser General Public License as published by the Free
@@ -26,6 +26,9 @@
 #include <cstdint>
 #include <ostream>
 #include <spead2/common_memcpy.h>
+#if SPEAD2_USE_SVE_STREAM
+# include <sys/auxv.h>
+#endif
 
 /* Declare the implementations of the instruction-specific implementations, so
  * that we can test all of them (that the current CPU supports) rather than
@@ -42,6 +45,9 @@ void *memcpy_nontemporal_avx(void * __restrict__ dest, const void * __restrict__
 #if SPEAD2_USE_AVX512_STREAM
 void *memcpy_nontemporal_avx512(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
 #endif
+#if SPEAD2_USE_SVE_STREAM  // implementation is added to src/common_memcpy.cpp by this patch
+void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
+#endif
 } // namespace spead2
 
 namespace spead2::unittest
@@ -74,6 +80,9 @@ static const memcpy_function memcpy_functions[] =
 #if SPEAD2_USE_AVX512_STREAM
     { "avx512", spead2::memcpy_nontemporal_avx512, bool(__builtin_cpu_supports("avx512f")) },
 #endif
+#if SPEAD2_USE_SVE_STREAM  // exercised only when the AT_HWCAP bits report SVE at run time
+    { "sve", spead2::memcpy_nontemporal_sve, bool(getauxval(AT_HWCAP) & HWCAP_SVE) },
+#endif
 };
 
 // Checks combinations of src and dest alignment relative to a page