Skip to content

Commit

Permalink
Add a simple SVE-based non-temporal copy for AArch64
Browse files Browse the repository at this point in the history
This is probably not optimal, but will provide something to test with.
  • Loading branch information
bmerry committed Jul 25, 2024
1 parent 569405f commit 1f69689
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 2 deletions.
55 changes: 54 additions & 1 deletion src/common_memcpy.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2016, 2020, 2023 National Research Foundation (SARAO)
/* Copyright 2016, 2020, 2023-2024 National Research Foundation (SARAO)
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
Expand Down Expand Up @@ -58,11 +58,56 @@
# include "common_memcpy_x86.h"
#endif

#if SPEAD2_USE_SVE_STREAM
# include <sys/auxv.h>
# include <arm_acle.h>
# include <arm_sve.h>
#endif

namespace spead2
{

#if SPEAD2_USE_SVE_STREAM
/**
 * Copy @a n bytes from @a src to @a dest using SVE non-temporal loads and
 * stores.
 *
 * The function is compiled with the SVE target attribute, so it must only be
 * called once the caller has established that the CPU supports SVE. The
 * source and destination are restrict-qualified and hence must not overlap.
 *
 * It has external linkage so that the unit tests can call it directly rather
 * than only through the resolver.
 *
 * @param dest  Start of the destination buffer
 * @param src   Start of the source buffer
 * @param n     Number of bytes to copy (may be zero)
 * @return @a dest
 */
[[gnu::target("+sve")]]
void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
{
    /* The AArch64 memory model says
     *
     * "If an address dependency exists between two Read Memory and an SVE
     * non-temporal vector load instruction generated the second read, then in
     * the absence of any other barrier mechanism to achieve order, the memory
     * accesses can be observed in any order by the other observers within the
     * shareability domain of the memory addresses being accessed."
     *
     * It's not entirely clear to me whether that's an issue, but it sounds
     * like SVE non-temporal reads can be served from a load buffer that's not
     * coherent with other cores' caches. To be on the safe side, I'm adding a
     * barrier here. The magic number (13 == 0b1101, the "LD" barrier option)
     * makes this a read to read/write barrier.
     */
    __dmb(13);

    /* TODO: this is probably sub-optimal, since it doesn't do any unrolling
     * or alignment. Efficient unrolling probably requires doing separate body
     * and tail (where the body is a multiple of the vector length) to avoid
     * doing svwhilelt for every iteration.
     */
    std::uint8_t *destc = static_cast<std::uint8_t *>(dest);
    const std::uint8_t *srcc = static_cast<const std::uint8_t *>(src);

    std::size_t i = 0;
    // Predicate selecting the bytes [i, n) that fit in one vector. When n is
    // zero it is all-false, so the first iteration transfers nothing.
    svbool_t pg = svwhilelt_b8(i, n);
    do
    {
        svstnt1_u8(pg, &destc[i], svldnt1_u8(pg, &srcc[i]));
        i += svcntb();  // advance by the vector length in bytes
        // Recompute the predicate; continue while its first lane is active,
        // i.e. while i < n.
    } while (svptest_first(svptrue_b8(), pg = svwhilelt_b8(i, n)));
    return dest;
}
#endif // SPEAD2_USE_SVE_STREAM

void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t) noexcept
{
/* x86 options */
#if SPEAD2_USE_AVX512_STREAM || SPEAD2_USE_AVX_STREAM || SPEAD2_USE_SSE2_STREAM
__builtin_cpu_init();
#endif
Expand All @@ -85,6 +130,14 @@ void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t) noexcep
if (__builtin_cpu_supports("sse2"))
return memcpy_nontemporal_sse2;
#endif

/* aarch64 options */
#if SPEAD2_USE_SVE_STREAM
unsigned long hwcaps = getauxval(AT_HWCAPS);
if (hwcaps & HWCAP_SVE)
return memcpy_nontemporal_sve;
#endif

/* Depending on the C library, std::memcpy might or might not be marked
* as noexcept. If not, we need this explicit cast.
*/
Expand Down
11 changes: 10 additions & 1 deletion src/unittest_memcpy.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2016, 2021, 2023 National Research Foundation (SARAO)
/* Copyright 2016, 2021, 2023-2024 National Research Foundation (SARAO)
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
Expand Down Expand Up @@ -26,6 +26,9 @@
#include <cstdint>
#include <ostream>
#include <spead2/common_memcpy.h>
#if SPEAD2_USE_SVE_STREAM
# include <sys/auxv.h>
#endif

/* Declare the instruction-specific implementations, so
* that we can test all of them (that the current CPU supports) rather than
Expand All @@ -42,6 +45,9 @@ void *memcpy_nontemporal_avx(void * __restrict__ dest, const void * __restrict__
#if SPEAD2_USE_AVX512_STREAM
void *memcpy_nontemporal_avx512(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
#endif
#if SPEAD2_USE_SVE_STREAM
// SVE variant; only declared when SPEAD2_USE_SVE_STREAM is enabled.
void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
#endif
} // namespace spead2

namespace spead2::unittest
Expand Down Expand Up @@ -74,6 +80,9 @@ static const memcpy_function memcpy_functions[] =
#if SPEAD2_USE_AVX512_STREAM
{ "avx512", spead2::memcpy_nontemporal_avx512, bool(__builtin_cpu_supports("avx512f")) },
#endif
#if SPEAD2_USE_SVE_STREAM
    // Exercise the SVE implementation itself, enabled only when the running
    // CPU reports SVE in its hardware capability bits.
    { "sve", spead2::memcpy_nontemporal_sve, bool(getauxval(AT_HWCAP) & HWCAP_SVE) },
#endif
};

// Checks combinations of src and dest alignment relative to a page
Expand Down

0 comments on commit 1f69689

Please sign in to comment.