diff --git a/src/common_memcpy.cpp b/src/common_memcpy.cpp
index b7c1207d..67d263bb 100644
--- a/src/common_memcpy.cpp
+++ b/src/common_memcpy.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2016, 2020, 2023 National Research Foundation (SARAO)
+/* Copyright 2016, 2020, 2023-2024 National Research Foundation (SARAO)
  *
  * This program is free software: you can redistribute it and/or modify it under
  * the terms of the GNU Lesser General Public License as published by the Free
@@ -58,11 +58,56 @@
 # include "common_memcpy_x86.h"
 #endif
 
+#if SPEAD2_USE_SVE_STREAM
+# include <arm_acle.h>
+# include <arm_sve.h>
+# include <sys/auxv.h>
+#endif
+
 namespace spead2
 {
 
+#if SPEAD2_USE_SVE_STREAM
+[[gnu::target("+sve")]]
+static void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
+{
+    /* The AArch64 memory model says
+     *
+     * "If an address dependency exists between two Read Memory and an SVE
+     * non-temporal vector load instruction generated the second read, then in
+     * the absence of any other barrier mechanism to achieve order, the memory
+     * accesses can be observed in any order by the other observers within the
+     * shareability domain of the memory addresses being accessed."
+     *
+     * It's not entirely clear to me whether that's an issue, but it sounds
+     * like SVE non-temporal reads can be served from a load buffer that's not
+     * coherent with other cores' caches. To be on the safe side, I'm adding a
+     * barrier here. The magic number makes this a read to read/write barrier.
+     */
+    __dmb(13);
+
+    /* TODO: this is probably sub-optimal, since it doesn't do any unrolling
+     * or alignment. Efficient unrolling probably requires doing separate body
+     * and tail (where the body is a multiple of the vector length) to avoid
+     * doing svwhilelt for every iteration.
+     */
+    std::uint8_t *destc = (std::uint8_t *) dest;
+    const std::uint8_t *srcc = (const std::uint8_t *) src;
+
+    size_t i = 0;
+    svbool_t pg = svwhilelt_b8(i, n);
+    do
+    {
+        svstnt1_u8(pg, &destc[i], svldnt1_u8(pg, &srcc[i]));
+        i += svcntb();
+    } while (svptest_first(svptrue_b8(), pg = svwhilelt_b8(i, n)));
+    return dest;
+}
+#endif // SPEAD2_USE_SVE_STREAM
+
 void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t) noexcept
 {
+    /* x86 options */
 #if SPEAD2_USE_AVX512_STREAM || SPEAD2_USE_AVX_STREAM || SPEAD2_USE_SSE2_STREAM
     __builtin_cpu_init();
 #endif
@@ -85,6 +130,14 @@ void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t) noexcep
     if (__builtin_cpu_supports("sse2"))
         return memcpy_nontemporal_sse2;
 #endif
+
+    /* aarch64 options */
+#if SPEAD2_USE_SVE_STREAM
+    unsigned long hwcaps = getauxval(AT_HWCAP);
+    if (hwcaps & HWCAP_SVE)
+        return memcpy_nontemporal_sve;
+#endif
+
     /* Depending on the C library, std::memcpy might or might not be marked
      * as noexcept. If not, we need this explicit cast.
      */
diff --git a/src/unittest_memcpy.cpp b/src/unittest_memcpy.cpp
index 57c61adc..376dace7 100644
--- a/src/unittest_memcpy.cpp
+++ b/src/unittest_memcpy.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2016, 2021, 2023 National Research Foundation (SARAO)
+/* Copyright 2016, 2021, 2023-2024 National Research Foundation (SARAO)
  *
  * This program is free software: you can redistribute it and/or modify it under
  * the terms of the GNU Lesser General Public License as published by the Free
@@ -26,6 +26,9 @@
 #include <cstdint>
 #include <spead2/common_memcpy.h>
 #include <spead2/common_features.h>
+#if SPEAD2_USE_SVE_STREAM
+# include <sys/auxv.h>
+#endif
 
 /* Declare the implementations of the instruction-specific implementations, so
  * that we can test all of them (that the current CPU supports) rather than
@@ -42,6 +45,9 @@ void *memcpy_nontemporal_avx(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
 #endif
 #if SPEAD2_USE_AVX512_STREAM
 void *memcpy_nontemporal_avx512(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
 #endif
+#if SPEAD2_USE_SVE_STREAM
+void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
+#endif
 
 } // namespace spead2
@@ -74,6 +80,9 @@ static const memcpy_function memcpy_functions[] =
 #if SPEAD2_USE_AVX512_STREAM
     { "avx512", spead2::memcpy_nontemporal_avx512, bool(__builtin_cpu_supports("avx512f")) },
 #endif
+#if SPEAD2_USE_SVE_STREAM
+    { "sve", spead2::memcpy_nontemporal_sve, bool(getauxval(AT_HWCAP) & HWCAP_SVE) },
+#endif
 };
 
 // Checks combinations of src and dest alignment relative to a page