Skip to content

Commit

Permalink
Add a simple SVE-based non-temporal copy for AArch64
Browse files Browse the repository at this point in the history
This is probably not optimal, but will provide something to test with.
  • Loading branch information
bmerry committed Nov 6, 2024
1 parent 1fc2def commit 4d26fb7
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 7 deletions.
2 changes: 1 addition & 1 deletion include/spead2/common_features.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
#endif

/* Similarly for AARCH64 features */
#if defined(__AARCH64LE__) || defined(__AARCH64BE__)
#if defined(__aarch64__)
# define SPEAD2_USE_SVE_STREAM @SPEAD2_USE_SVE_STREAM@
#else
# define SPEAD2_USE_SVE_STREAM 0
Expand Down
74 changes: 70 additions & 4 deletions src/common_memcpy.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2016, 2020, 2023 National Research Foundation (SARAO)
/* Copyright 2016, 2020, 2023-2024 National Research Foundation (SARAO)
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
Expand Down Expand Up @@ -58,11 +58,64 @@
# include "common_memcpy_x86.h"
#endif

#if SPEAD2_USE_SVE_STREAM
# include <atomic>
# include <sys/auxv.h>
# include <arm_sve.h>
#endif

namespace spead2
{

void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t) noexcept
#if SPEAD2_USE_SVE_STREAM
[[gnu::target("+sve")]]
void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
{
/* The AArch64 memory model says
*
* "If an address dependency exists between two Read Memory and an SVE
* non-temporal vector load instruction generated the second read, then in
* the absence of any other barrier mechanism to achieve order, the memory
* accesses can be observed in any order by the other observers within the
* shareability domain of the memory addresses being accessed."
*
* This is probably not an issue in practice, unless the source address
* is obtained with memory_order_consume and the compiler actually tracks
* dependencies (which apparently none do).
*
* It's not entirely clear to me whether that's an issue, but it sounds
* like SVE non-temporal reads can be served from a load buffer that's not
* coherent with other cores' caches. To be on the safe side, I'm adding a
* barrier here.
*/
std::atomic_thread_fence(std::memory_order_acquire);

/* TODO: this is probably sub-optimal, since it doesn't do any unrolling
* or alignment. Efficient unrolling probably requires doing separate body
* and tail (where the body is a multiple of the vector length) to avoid
* doing svwhilelt for every iteration.
*/
std::uint8_t *destc = (std::uint8_t *) dest;
const std::uint8_t *srcc = (const std::uint8_t *) src;

size_t i = 0;
svbool_t pg = svwhilelt_b8(i, n);
do
{
svstnt1_u8(pg, &destc[i], svldnt1_u8(pg, &srcc[i]));
i += svcntb();
} while (svptest_first(svptrue_b8(), pg = svwhilelt_b8(i, n)));
return dest;
}
#endif // SPEAD2_USE_SVE_STREAM

void *(*resolve_memcpy_nontemporal(
#ifdef __aarch64__
std::uint64_t hwcaps // See System V ABI for AArch64
#endif
))(void *, const void *, std::size_t) noexcept
{
/* x86 options */
#if SPEAD2_USE_AVX512_STREAM || SPEAD2_USE_AVX_STREAM || SPEAD2_USE_SSE2_STREAM
__builtin_cpu_init();
#endif
Expand All @@ -85,6 +138,13 @@ void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t) noexcep
if (__builtin_cpu_supports("sse2"))
return memcpy_nontemporal_sse2;
#endif

/* aarch64 options */
#if SPEAD2_USE_SVE_STREAM
if (hwcaps & HWCAP_SVE)
return memcpy_nontemporal_sve;
#endif

/* Depending on the C library, std::memcpy might or might not be marked
* as noexcept. If not, we need this explicit cast.
*/
Expand All @@ -93,14 +153,20 @@ void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t) noexcep

#if SPEAD2_USE_FMV

/* memcpy_nontemporal is an ifunc whose resolver is
 * spead2::resolve_memcpy_nontemporal. The attribute takes the resolver's
 * mangled name, which encodes its parameter list: on AArch64 the resolver
 * takes a std::uint64_t of hwcaps (mangled as "m", assuming std::uint64_t is
 * unsigned long), while on other architectures it takes no arguments ("v").
 * Using the wrong suffix refers to a symbol that does not exist.
 */
#ifdef __aarch64__
[[gnu::ifunc("_ZN6spead226resolve_memcpy_nontemporalEm")]]
#else
[[gnu::ifunc("_ZN6spead226resolve_memcpy_nontemporalEv")]]
#endif
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;

#else

/* Fallback used when function multi-versioning (SPEAD2_USE_FMV) is not
 * available: copy n bytes from src to dest through the implementation chosen
 * by resolve_memcpy_nontemporal, and return whatever that implementation
 * returns. The resolver is invoked only to initialise a function-local
 * static, so its result is reused by later calls.
 */
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
{
    using memcpy_fn = void *(*)(void * __restrict__, const void * __restrict__, std::size_t) noexcept;

#if defined(__aarch64__) && SPEAD2_USE_SVE_STREAM
    /* On AArch64 the resolver takes the hardware capability bits. The auxv
     * key for those is AT_HWCAP (the same key the unit test queries).
     */
    static const memcpy_fn memcpy_nontemporal_ptr = resolve_memcpy_nontemporal(getauxval(AT_HWCAP));
#elif defined(__aarch64__)
    /* <sys/auxv.h> is only included when SPEAD2_USE_SVE_STREAM is set, and
     * the resolver only inspects hwcaps in that configuration, so there is
     * nothing to query here.
     */
    static const memcpy_fn memcpy_nontemporal_ptr = resolve_memcpy_nontemporal(0);
#else
    static const memcpy_fn memcpy_nontemporal_ptr = resolve_memcpy_nontemporal();
#endif
    return memcpy_nontemporal_ptr(dest, src, n);
}

Expand Down
19 changes: 17 additions & 2 deletions src/unittest_memcpy.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2016, 2021, 2023 National Research Foundation (SARAO)
/* Copyright 2016, 2021, 2023-2024 National Research Foundation (SARAO)
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
Expand Down Expand Up @@ -26,6 +26,10 @@
#include <cstdint>
#include <ostream>
#include <spead2/common_memcpy.h>
#include <spead2/common_features.h>
#if SPEAD2_USE_SVE_STREAM
# include <sys/auxv.h>
#endif

/* Declare the implementations of the instruction-specific implementations, so
* that we can test all of them (that the current CPU supports) rather than
Expand All @@ -42,6 +46,9 @@ void *memcpy_nontemporal_avx(void * __restrict__ dest, const void * __restrict__
#if SPEAD2_USE_AVX512_STREAM
void *memcpy_nontemporal_avx512(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
#endif
#if SPEAD2_USE_SVE_STREAM
void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
#endif
} // namespace spead2

namespace spead2::unittest
Expand All @@ -62,9 +69,14 @@ std::ostream &operator<<(std::ostream &o, const memcpy_function &func)
return o << func.name;
}

/* Forward to spead2::memcpy_nontemporal and hand back its return value.
 *
 * NOTE(review): presumably this exists so that the function table below can
 * store the address of an ordinary function instead of the address of
 * spead2::memcpy_nontemporal itself (which may be an ifunc) - confirm.
 */
static void *wrap_memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
{
    void *const result = spead2::memcpy_nontemporal(dest, src, n);
    return result;
}

static const memcpy_function memcpy_functions[] =
{
{ "default", spead2::memcpy_nontemporal, true },
{ "default", wrap_memcpy_nontemporal, true },
#if SPEAD2_USE_SSE2_STREAM
{ "sse2", spead2::memcpy_nontemporal_sse2, bool(__builtin_cpu_supports("sse2")) },
#endif
Expand All @@ -74,6 +86,9 @@ static const memcpy_function memcpy_functions[] =
#if SPEAD2_USE_AVX512_STREAM
{ "avx512", spead2::memcpy_nontemporal_avx512, bool(__builtin_cpu_supports("avx512f")) },
#endif
#if SPEAD2_USE_SVE_STREAM
{ "sve", spead2::memcpy_nontemporal_sve, (getauxval(AT_HWCAP) & HWCAP_SVE) != 0 },
#endif
};

// Checks combinations of src and dest alignment relative to a page
Expand Down

0 comments on commit 4d26fb7

Please sign in to comment.