Skip to content

Commit

Permalink
Make memcpy_nontemporal_sve actually compile
Browse files Browse the repository at this point in the history
  • Loading branch information
bmerry committed Nov 6, 2024
1 parent e8521fc commit 2124947
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 7 deletions.
2 changes: 1 addition & 1 deletion include/spead2/common_features.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
#endif

/* Similarly for AARCH64 features */
#if defined(__AARCH64LE__) || defined(__AARCH64BE__)
#if defined(__aarch64__)
# define SPEAD2_USE_SVE_STREAM @SPEAD2_USE_SVE_STREAM@
#else
# define SPEAD2_USE_SVE_STREAM 0
Expand Down
14 changes: 9 additions & 5 deletions src/common_memcpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
#endif

#if SPEAD2_USE_SVE_STREAM
# include <atomic>
# include <sys/auxv.h>
# include <arm_sve.h>
#endif
Expand All @@ -68,7 +69,7 @@ namespace spead2

#if SPEAD2_USE_SVE_STREAM
[[gnu::target("+sve")]]
static void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
{
/* The AArch64 memory model says
*
Expand All @@ -78,14 +79,17 @@ static void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __res
* accesses can be observed in any order by the other observers within the
* shareability domain of the memory addresses being accessed."
*
* This is probably not an issue in practice, unless the source address
* is obtained with memory_order_consume and the compiler actually tracks
* dependencies (which apparently none do).
*
* It's not entirely clear to me whether that's an issue, but it sounds
* like SVE non-temporal reads can be served from a load buffer that's not
* coherent with other cores' caches. To be on the safe side, I'm adding a
* barrier here. The magic number makes this a read to read/write barrier.
* barrier here.
*/
__dmb(13);
std::atomic_thread_fence(std::memory_order_acquire);

*/
/* TODO: this is probably sub-optimal, since it doesn't do any unrolling
* or alignment. Efficient unrolling probably requires doing separate body
* and tail (where the body is a multiple of the vector length) to avoid
Expand Down Expand Up @@ -133,7 +137,7 @@ void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t) noexcep

/* aarch64 options */
#if SPEAD2_USE_SVE_STREAM
unsigned long hwcaps = getauxval(AT_HWCAPS);
unsigned long hwcaps = getauxval(AT_HWCAP);
if (hwcaps & HWCAP_SVE)
return memcpy_nontemporal_sve;
#endif
Expand Down
3 changes: 2 additions & 1 deletion src/unittest_memcpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <cstdint>
#include <ostream>
#include <spead2/common_memcpy.h>
#include <spead2/common_features.h>
#if SPEAD2_USE_SVE_STREAM
# include <sys/auxv.h>
#endif
Expand Down Expand Up @@ -81,7 +82,7 @@ static const memcpy_function memcpy_functions[] =
{ "avx512", spead2::memcpy_nontemporal_avx512, bool(__builtin_cpu_supports("avx512f")) },
#endif
#if SPEAD2_USE_SVE_STREAM
{ "sve", spead2::memcpy_nontemporal_avx512, getauxval(AT_HWCAP) & HWCAP_SVE },
{ "sve", spead2::memcpy_nontemporal_sve, (getauxval(AT_HWCAP) & HWCAP_SVE) != 0 },
#endif
};

Expand Down

0 comments on commit 2124947

Please sign in to comment.