From 8d9c33f409a8a19962b733bff3ff94c8ed9dba1b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 3 Mar 2024 21:56:12 -0800 Subject: [PATCH] lib/x86/adler32: refactor and improve implementations - Refactor the x86 implementations of Adler-32 to be organized like the x86 implementations of CRC-32, where there's an x86-specific template that expands into the different implementations. - Add an AVX512VNNI implementation using 256-bit vectors. - Increase the number of vectors processed per iteration of the inner loop of the AVX512VNNI implementations from 2 to 4. - Handle small amounts of data more efficiently. If the length is small, don't bother aligning the pointer at the beginning. Also optimize the handling of any bytes left over after the inner loop. Also avoid doing redundant reductions mod 65521. - Make the AVX-VNNI implementation dot with 1's so that all VNNI implementations use the same strategy. - Put "_x86" in the name of the functions, like what is done for CRC-32. --- CMakeLists.txt | 1 + lib/x86/adler32_impl.h | 421 ++++----------------------- lib/x86/adler32_template.h | 515 +++++++++++++++++++++++++++++++++ scripts/checksum_benchmarks.sh | 8 +- scripts/run_tests.sh | 3 +- 5 files changed, 584 insertions(+), 364 deletions(-) create mode 100644 lib/x86/adler32_template.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 917fefb0..f902c675 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,6 +131,7 @@ if(LIBDEFLATE_ZLIB_SUPPORT) lib/adler32_vec_template.h lib/arm/adler32_impl.h lib/x86/adler32_impl.h + lib/x86/adler32_template.h lib/zlib_constants.h ) if(LIBDEFLATE_COMPRESSION_SUPPORT) diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index c8b5153b..f0c68ace 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -30,371 +30,67 @@ #include "cpu_features.h" -/* - * The following macros horizontally sum the s1 counters and add them to the - * real s1, and likewise for s2. They do this via a series of reductions, each - * of which halves the vector length, until just one counter remains. - * - * The s1 reductions don't depend on the s2 reductions and vice versa, so for - * efficiency they are interleaved. - * - * If used_sad=1, then the bytes were summed using one of the Sum of Absolute - * Differences instructions (psadbw or vpsadbw) before they were added to v_s1. - * In this case every other counter in v_s1 is 0, so we skip one of the s1 - * reductions when going from 128 => 32 bits. 
- */ - -#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, used_sad) \ -{ \ - __m128i /* __v4su */ s1_last = (v_s1), s2_last = (v_s2); \ - \ - /* 128 => 32 bits */ \ - if (!(used_sad)) \ - s1_last = _mm_add_epi32(s1_last, \ - _mm_shuffle_epi32(s1_last, 0x31)); \ - s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x31)); \ - s1_last = _mm_add_epi32(s1_last, _mm_shuffle_epi32(s1_last, 0x02)); \ - s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x02)); \ - \ - *(s1) += (u32)_mm_cvtsi128_si32(s1_last); \ - *(s2) += (u32)_mm_cvtsi128_si32(s2_last); \ -} - -#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, used_sad) \ -{ \ - __m128i /* __v4su */ s1_128bit, s2_128bit; \ - \ - /* 256 => 128 bits */ \ - s1_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s1), 0), \ - _mm256_extracti128_si256((v_s1), 1)); \ - s2_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s2), 0), \ - _mm256_extracti128_si256((v_s2), 1)); \ - \ - ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit, \ - (used_sad)); \ -} - -#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2, used_sad) \ -{ \ - __m256i /* __v8su */ s1_256bit, s2_256bit; \ - \ - /* 512 => 256 bits */ \ - s1_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s1), 0), \ - _mm512_extracti64x4_epi64((v_s1), 1)); \ - s2_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s2), 0), \ - _mm512_extracti64x4_epi64((v_s2), 1)); \ - \ - ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit, \ - (used_sad)); \ -} - -/* - * This is a very silly partial workaround for gcc bug - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892. The bug causes gcc to - * generate extra move instructions in some loops containing vector intrinsics. - * - * An alternate workaround would be to use gcc native vector operations instead - * of vector intrinsics. But that would result in MSVC needing its own code. - */ -#if GCC_PREREQ(1, 0) -# define GCC_UPDATE_VARS(a, b, c, d, e, f) \ - __asm__("" : "+x" (a), "+x" (b), "+x" (c), "+x" (d), "+x" (e), "+x" (f)) -#else -# define GCC_UPDATE_VARS(a, b, c, d, e, f) \ - (void)a, (void)b, (void)c, (void)d, (void)e, (void)f -#endif - -/* - * SSE2 and AVX2 implementations. They are very similar; the AVX2 - * implementation just uses twice the vector width as the SSE2 one. - */ +/* SSE2 and AVX2 implementations. Used on older CPUs. */ #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) -# define adler32_sse2 adler32_sse2 -# define FUNCNAME adler32_sse2 -# define FUNCNAME_CHUNK adler32_sse2_chunk -# define IMPL_ALIGNMENT 16 -# define IMPL_SEGMENT_LEN 32 -/* - * The 16-bit precision byte counters must not be allowed to undergo *signed* - * overflow, otherwise the signed multiplications at the end (_mm_madd_epi16) - * would behave incorrectly. 
- */ -# define IMPL_MAX_CHUNK_LEN (32 * (0x7FFF / 0xFF)) -# define ATTRIBUTES _target_attribute("sse2") -static forceinline ATTRIBUTES void -adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) -{ - static const u16 _aligned_attribute(16) mults[4][8] = { - { 32, 31, 30, 29, 28, 27, 26, 25 }, - { 24, 23, 22, 21, 20, 19, 18, 17 }, - { 16, 15, 14, 13, 12, 11, 10, 9 }, - { 8, 7, 6, 5, 4, 3, 2, 1 }, - }; - const __m128i /* __v8hu */ mults_a = _mm_load_si128((const __m128i *)mults[0]); - const __m128i /* __v8hu */ mults_b = _mm_load_si128((const __m128i *)mults[1]); - const __m128i /* __v8hu */ mults_c = _mm_load_si128((const __m128i *)mults[2]); - const __m128i /* __v8hu */ mults_d = _mm_load_si128((const __m128i *)mults[3]); - const __m128i zeroes = _mm_setzero_si128(); - - /* s1 counters: 32-bit, sum of bytes */ - __m128i /* __v4su */ v_s1 = zeroes; - - /* s2 counters: 32-bit, sum of s1 values */ - __m128i /* __v4su */ v_s2 = zeroes; - - /* - * Thirty-two 16-bit counters for byte sums. Each accumulates the bytes - * that eventually need to be multiplied by a number 32...1 for addition - * into s2. - */ - __m128i /* __v8hu */ v_byte_sums_a = zeroes; - __m128i /* __v8hu */ v_byte_sums_b = zeroes; - __m128i /* __v8hu */ v_byte_sums_c = zeroes; - __m128i /* __v8hu */ v_byte_sums_d = zeroes; - - do { - /* Load the next 32 bytes. */ - const __m128i bytes1 = *p++; - const __m128i bytes2 = *p++; - - /* - * Accumulate the previous s1 counters into the s2 counters. - * Logically, this really should be v_s2 += v_s1 * 32, but we - * can do the multiplication (or left shift) later. - */ - v_s2 = _mm_add_epi32(v_s2, v_s1); - - /* - * s1 update: use "Packed Sum of Absolute Differences" to add - * the bytes horizontally with 8 bytes per sum. Then add the - * sums to the s1 counters. - */ - v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zeroes)); - v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zeroes)); - - /* - * Also accumulate the bytes into 32 separate counters that have - * 16-bit precision. - */ - v_byte_sums_a = _mm_add_epi16( - v_byte_sums_a, _mm_unpacklo_epi8(bytes1, zeroes)); - v_byte_sums_b = _mm_add_epi16( - v_byte_sums_b, _mm_unpackhi_epi8(bytes1, zeroes)); - v_byte_sums_c = _mm_add_epi16( - v_byte_sums_c, _mm_unpacklo_epi8(bytes2, zeroes)); - v_byte_sums_d = _mm_add_epi16( - v_byte_sums_d, _mm_unpackhi_epi8(bytes2, zeroes)); - - GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b, - v_byte_sums_c, v_byte_sums_d); - } while (p != end); - - /* Finish calculating the s2 counters. */ - v_s2 = _mm_slli_epi32(v_s2, 5); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_a, mults_a)); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_b, mults_b)); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_c, mults_c)); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_d, mults_d)); - - /* Add the counters to the real s1 and s2. */ - ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, 1); -} -# include "../adler32_vec_template.h" - -# define adler32_avx2 adler32_avx2 -# define FUNCNAME adler32_avx2 -# define FUNCNAME_CHUNK adler32_avx2_chunk -# define IMPL_ALIGNMENT 32 -# define IMPL_SEGMENT_LEN 64 -# define IMPL_MAX_CHUNK_LEN (64 * (0x7FFF / 0xFF)) -# define ATTRIBUTES _target_attribute("avx2") -static forceinline ATTRIBUTES void -adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) -{ - /* - * Note, the multipliers have to be in this order because - * _mm256_unpack{lo,hi}_epi8 work on each 128-bit lane separately. 
- */ - static const u16 _aligned_attribute(32) mults[4][16] = { - { 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 }, - { 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 }, - { 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 }, - { 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 }, - }; - const __m256i /* __v16hu */ mults_a = _mm256_load_si256((const __m256i *)mults[0]); - const __m256i /* __v16hu */ mults_b = _mm256_load_si256((const __m256i *)mults[1]); - const __m256i /* __v16hu */ mults_c = _mm256_load_si256((const __m256i *)mults[2]); - const __m256i /* __v16hu */ mults_d = _mm256_load_si256((const __m256i *)mults[3]); - const __m256i zeroes = _mm256_setzero_si256(); - __m256i /* __v8su */ v_s1 = zeroes; - __m256i /* __v8su */ v_s2 = zeroes; - __m256i /* __v16hu */ v_byte_sums_a = zeroes; - __m256i /* __v16hu */ v_byte_sums_b = zeroes; - __m256i /* __v16hu */ v_byte_sums_c = zeroes; - __m256i /* __v16hu */ v_byte_sums_d = zeroes; - - do { - const __m256i bytes1 = *p++; - const __m256i bytes2 = *p++; - - v_s2 = _mm256_add_epi32(v_s2, v_s1); - v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes1, zeroes)); - v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes2, zeroes)); - v_byte_sums_a = _mm256_add_epi16( - v_byte_sums_a, _mm256_unpacklo_epi8(bytes1, zeroes)); - v_byte_sums_b = _mm256_add_epi16( - v_byte_sums_b, _mm256_unpackhi_epi8(bytes1, zeroes)); - v_byte_sums_c = _mm256_add_epi16( - v_byte_sums_c, _mm256_unpacklo_epi8(bytes2, zeroes)); - v_byte_sums_d = _mm256_add_epi16( - v_byte_sums_d, _mm256_unpackhi_epi8(bytes2, zeroes)); - - GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b, - v_byte_sums_c, v_byte_sums_d); - } while (p != end); - - v_s2 = _mm256_slli_epi32(v_s2, 6); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_a, mults_a)); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_b, mults_b)); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_c, mults_c)); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_d, mults_d)); - ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, 1); -} -# include "../adler32_vec_template.h" +# define adler32_x86_sse2 adler32_x86_sse2 +# define SUFFIX _x86_sse2 +# define ATTRIBUTES _target_attribute("sse2") +# define VL 16 +# define USE_VNNI 0 +# define USE_MASKING 0 +# include "adler32_template.h" + +# define adler32_x86_avx2 adler32_x86_avx2 +# define SUFFIX _x86_avx2 +# define ATTRIBUTES _target_attribute("avx2") +# define VL 32 +# define USE_VNNI 0 +# define USE_MASKING 0 +# include "adler32_template.h" #endif /* - * AVX2/AVX-VNNI implementation. This is similar to the AVX512BW/AVX512VNNI - * implementation, but instead of using AVX-512 it uses AVX2 plus AVX-VNNI. - * AVX-VNNI adds dot product instructions to CPUs without AVX-512. + * AVX-VNNI implementation. This is used on CPUs that have AVX2 and AVX-VNNI + * but don't have AVX-512, for example Intel Alder Lake. 
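+ * Unlike the AVX-512 instantiations below, AVX-VNNI has no mask registers, so
+ * this instantiation uses USE_MASKING=0 and the template handles the final
+ * partial vector of a chunk with a zero-padded memcpy instead of a masked load.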
*/ #if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930) -# define adler32_avx2_vnni adler32_avx2_vnni -# define FUNCNAME adler32_avx2_vnni -# define FUNCNAME_CHUNK adler32_avx2_vnni_chunk -# define IMPL_ALIGNMENT 32 -# define IMPL_SEGMENT_LEN 128 -# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN +# define adler32_x86_avx2_vnni adler32_x86_avx2_vnni +# define SUFFIX _x86_avx2_vnni # define ATTRIBUTES _target_attribute("avx2,avxvnni") -static forceinline ATTRIBUTES void -adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end, - u32 *s1, u32 *s2) -{ - static const u8 _aligned_attribute(32) mults[2][32] = { - { 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, }, - { 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, }, - }; - const __m256i /* __v32qu */ mults_a = _mm256_load_si256((const __m256i *)mults[0]); - const __m256i /* __v32qu */ mults_b = _mm256_load_si256((const __m256i *)mults[1]); - const __m256i zeroes = _mm256_setzero_si256(); - __m256i /* __v8su */ v_s1_a = zeroes; - __m256i /* __v8su */ v_s1_b = zeroes; - __m256i /* __v8su */ v_s1_sums_a = zeroes; - __m256i /* __v8su */ v_s1_sums_b = zeroes; - __m256i /* __v8su */ v_s2_a = zeroes; - __m256i /* __v8su */ v_s2_b = zeroes; - __m256i /* __v8su */ v_s2_c = zeroes; - __m256i /* __v8su */ v_s2_d = zeroes; - - do { - const __m256i bytes_a = *p++; - const __m256i bytes_b = *p++; - const __m256i bytes_c = *p++; - const __m256i bytes_d = *p++; - - v_s2_a = _mm256_dpbusd_avx_epi32(v_s2_a, bytes_a, mults_a); - v_s2_b = _mm256_dpbusd_avx_epi32(v_s2_b, bytes_b, mults_b); - v_s2_c = _mm256_dpbusd_avx_epi32(v_s2_c, bytes_c, mults_a); - v_s2_d = _mm256_dpbusd_avx_epi32(v_s2_d, bytes_d, mults_b); - v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, v_s1_a); - v_s1_sums_b = _mm256_add_epi32(v_s1_sums_b, v_s1_b); - v_s1_a = _mm256_add_epi32(v_s1_a, - _mm256_add_epi32(_mm256_sad_epu8(bytes_a, zeroes), - _mm256_sad_epu8(bytes_b, zeroes))); - v_s1_b = _mm256_add_epi32(v_s1_b, - _mm256_add_epi32(_mm256_sad_epu8(bytes_c, zeroes), - _mm256_sad_epu8(bytes_d, zeroes))); -#if GCC_PREREQ(1, 0) - __asm__("" : "+x" (v_s1_a), "+x" (v_s1_b), "+x" (v_s1_sums_a), - "+x" (v_s1_sums_b), "+x" (v_s2_a), "+x" (v_s2_b), - "+x" (v_s2_c), "+x" (v_s2_d)); +# define VL 32 +# define USE_VNNI 1 +# define USE_MASKING 0 +# include "adler32_template.h" #endif - } while (p != end); - v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, v_s1_sums_b); - v_s1_sums_a = _mm256_slli_epi32(v_s1_sums_a, 7); - v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, _mm256_slli_epi32(v_s1_a, 6)); - v_s1_a = _mm256_add_epi32(v_s1_a, v_s1_b); - v_s2_a = _mm256_add_epi32(v_s2_a, v_s2_b); - v_s2_c = _mm256_add_epi32(v_s2_c, v_s2_d); - v_s2_a = _mm256_add_epi32(v_s2_a, v_s2_c); - v_s2_a = _mm256_add_epi32(v_s2_a, v_s1_sums_a); - ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a, 1); -} -# include "../adler32_vec_template.h" -#endif +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) +/* + * AVX512VNNI implementation using 256-bit vectors. This is very similar to the + * AVX-VNNI implementation but takes advantage of masking and more registers. + * This is used on CPUs that support AVX-512 but where using 512-bit vectors + * causes downclocking. This should also be the optimal implementation for CPUs + * that support AVX10/256 but not AVX10/512. 
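+ * Compared to the AVX-VNNI instantiation, USE_MASKING=1 lets the final partial
+ * vector of each chunk be read with a single masked load, and the EVEX
+ * encoding makes the extra registers ymm16-ymm31 available to the compiler.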
+ */ +# define adler32_x86_avx512_vl256_vnni adler32_x86_avx512_vl256_vnni +# define SUFFIX _x86_avx512_vl256_vnni +# define ATTRIBUTES _target_attribute("avx512bw,avx512vl,avx512vnni") +# define VL 32 +# define USE_VNNI 1 +# define USE_MASKING 1 +# include "adler32_template.h" /* - * AVX512BW/AVX512VNNI implementation. Uses the vpdpbusd (dot product) - * instruction from AVX512VNNI. + * AVX512VNNI implementation using 512-bit vectors. This is used on CPUs that + * have a good AVX-512 implementation including AVX512VNNI. This should also be + * the optimal implementation for CPUs that support AVX10/512. */ -#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) -# define adler32_avx512_vnni adler32_avx512_vnni -# define FUNCNAME adler32_avx512_vnni -# define FUNCNAME_CHUNK adler32_avx512_vnni_chunk -# define IMPL_ALIGNMENT 64 -# define IMPL_SEGMENT_LEN 128 -# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN +# define adler32_x86_avx512_vl512_vnni adler32_x86_avx512_vl512_vnni +# define SUFFIX _x86_avx512_vl512_vnni # define ATTRIBUTES _target_attribute("avx512bw,avx512vnni") -static forceinline ATTRIBUTES void -adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end, - u32 *s1, u32 *s2) -{ - static const u8 _aligned_attribute(64) mults[64] = { - 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - }; - const __m512i /* __v64qu */ mults_a = _mm512_load_si512((const __m512i *)mults); - const __m512i /* __v64qu */ ones = _mm512_set1_epi8(1); - const __m512i zeroes = _mm512_setzero_si512(); - __m512i /* __v16su */ v_s1_a = zeroes; - __m512i /* __v16su */ v_s1_b = zeroes; - __m512i /* __v16su */ v_s1_sums_a = zeroes; - __m512i /* __v16su */ v_s1_sums_b = zeroes; - __m512i /* __v16su */ v_s2_a = zeroes; - __m512i /* __v16su */ v_s2_b = zeroes; - - do { - const __m512i bytes_a = *p++; - const __m512i bytes_b = *p++; - - v_s2_a = _mm512_dpbusd_epi32(v_s2_a, bytes_a, mults_a); - v_s2_b = _mm512_dpbusd_epi32(v_s2_b, bytes_b, mults_a); - v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_a); - v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1_b); - /* - * vpdpbusd with 1's seems to be faster than vpsadbw + vpaddd - * here, provided that the accumulators are independent. 
- */ - v_s1_a = _mm512_dpbusd_epi32(v_s1_a, bytes_a, ones); - v_s1_b = _mm512_dpbusd_epi32(v_s1_b, bytes_b, ones); - GCC_UPDATE_VARS(v_s1_a, v_s1_b, v_s1_sums_a, v_s1_sums_b, - v_s2_a, v_s2_b); - } while (p != end); - - v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_sums_b); - v_s1_sums_a = _mm512_slli_epi32(v_s1_sums_a, 7); - v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, _mm512_slli_epi32(v_s1_a, 6)); - v_s1_a = _mm512_add_epi32(v_s1_a, v_s1_b); - v_s2_a = _mm512_add_epi32(v_s2_a, v_s2_b); - v_s2_a = _mm512_add_epi32(v_s2_a, v_s1_sums_a); - ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a, 0); -} -# include "../adler32_vec_template.h" +# define VL 64 +# define USE_VNNI 1 +# define USE_MASKING 1 +# include "adler32_template.h" #endif static inline adler32_func_t @@ -402,22 +98,27 @@ arch_select_adler32_func(void) { const u32 features MAYBE_UNUSED = get_x86_cpu_features(); -#ifdef adler32_avx512_vnni +#ifdef adler32_x86_avx512_vl512_vnni if ((features & X86_CPU_FEATURE_ZMM) && HAVE_AVX512BW(features) && HAVE_AVX512VNNI(features)) - return adler32_avx512_vnni; + return adler32_x86_avx512_vl512_vnni; +#endif +#ifdef adler32_x86_avx512_vl256_vnni + if (HAVE_AVX512BW(features) && HAVE_AVX512VL(features) && + HAVE_AVX512VNNI(features)) + return adler32_x86_avx512_vl256_vnni; #endif -#ifdef adler32_avx2_vnni +#ifdef adler32_x86_avx2_vnni if (HAVE_AVX2(features) && HAVE_AVXVNNI(features)) - return adler32_avx2_vnni; + return adler32_x86_avx2_vnni; #endif -#ifdef adler32_avx2 +#ifdef adler32_x86_avx2 if (HAVE_AVX2(features)) - return adler32_avx2; + return adler32_x86_avx2; #endif -#ifdef adler32_sse2 +#ifdef adler32_x86_sse2 if (HAVE_SSE2(features)) - return adler32_sse2; + return adler32_x86_sse2; #endif return NULL; } diff --git a/lib/x86/adler32_template.h b/lib/x86/adler32_template.h new file mode 100644 index 00000000..d0c867fd --- /dev/null +++ b/lib/x86/adler32_template.h @@ -0,0 +1,515 @@ +/* + * x86/adler32_template.h - template for vectorized Adler-32 implementations + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This file is a "template" for instantiating Adler-32 functions for x86. + * The "parameters" are: + * + * SUFFIX: + * Name suffix to append to all instantiated functions. + * ATTRIBUTES: + * Target function attributes to use. 
Must satisfy the dependencies of the + * other parameters as follows: + * VL=16 && USE_VNNI=0 && USE_MASKING=0: at least sse2 + * VL=32 && USE_VNNI=0 && USE_MASKING=0: at least avx2 + * VL=32 && USE_VNNI=1 && USE_MASKING=0: at least avx2,avxvnni + * VL=32 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vl,avx512vnni + * VL=64 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vnni + * (Other combinations are not useful and have not been tested.) + * VL: + * Vector length in bytes. Must be 16, 32, and 64. + * USE_VNNI: + * If 1, use the VNNI dot product based algorithm. + * If 0, use the legacy SSE2 and AVX2 compatible algorithm. + * USE_MASKING: + * If 1, use AVX-512 features such as masking. + * If 0, assume that the CPU might not support AVX-512. + */ + +#if VL == 16 +# define vec_t __m128i +# define mask_t u16 +# define LOG2_VL 4 +# define VADD8(a, b) _mm_add_epi8((a), (b)) +# define VADD16(a, b) _mm_add_epi16((a), (b)) +# define VADD32(a, b) _mm_add_epi32((a), (b)) +# if USE_MASKING +# define VDPBUSD(a, b, c) _mm_dpbusd_epi32((a), (b), (c)) +# else +# define VDPBUSD(a, b, c) _mm_dpbusd_avx_epi32((a), (b), (c)) +# endif +# define VLOAD(p) _mm_load_si128((const void *)(p)) +# define VLOADU(p) _mm_loadu_si128((const void *)(p)) +# define VMADD16(a, b) _mm_madd_epi16((a), (b)) +# define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p)) +# define VMULLO32(a, b) _mm_mullo_epi32((a), (b)) +# define VSAD8(a, b) _mm_sad_epu8((a), (b)) +# define VSET1_32(a) _mm_set1_epi32(a) +# define VSET1_8(a) _mm_set1_epi8(a) +# define VSETZERO() _mm_setzero_si128() +# define VSLL32(a, b) _mm_slli_epi32((a), (b)) +# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b)) +# define VUNPACKLO8(a, b) _mm_unpacklo_epi8((a), (b)) +#elif VL == 32 +# define vec_t __m256i +# define mask_t u32 +# define LOG2_VL 5 +# define VADD8(a, b) _mm256_add_epi8((a), (b)) +# define VADD16(a, b) _mm256_add_epi16((a), (b)) +# define VADD32(a, b) _mm256_add_epi32((a), (b)) +# if USE_MASKING +# define VDPBUSD(a, b, c) _mm256_dpbusd_epi32((a), (b), (c)) +# else +# define VDPBUSD(a, b, c) _mm256_dpbusd_avx_epi32((a), (b), (c)) +# endif +# define VLOAD(p) _mm256_load_si256((const void *)(p)) +# define VLOADU(p) _mm256_loadu_si256((const void *)(p)) +# define VMADD16(a, b) _mm256_madd_epi16((a), (b)) +# define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p)) +# define VMULLO32(a, b) _mm256_mullo_epi32((a), (b)) +# define VSAD8(a, b) _mm256_sad_epu8((a), (b)) +# define VSET1_32(a) _mm256_set1_epi32(a) +# define VSET1_8(a) _mm256_set1_epi8(a) +# define VSETZERO() _mm256_setzero_si256() +# define VSLL32(a, b) _mm256_slli_epi32((a), (b)) +# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b)) +# define VUNPACKLO8(a, b) _mm256_unpacklo_epi8((a), (b)) +#elif VL == 64 +# define vec_t __m512i +# define mask_t u64 +# define LOG2_VL 6 +# define VADD8(a, b) _mm512_add_epi8((a), (b)) +# define VADD32(a, b) _mm512_add_epi32((a), (b)) +# define VDPBUSD(a, b, c) _mm512_dpbusd_epi32((a), (b), (c)) +# define VLOAD(p) _mm512_load_si512((const void *)(p)) +# define VLOADU(p) _mm512_loadu_si512((const void *)(p)) +# define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p)) +# define VMULLO32(a, b) _mm512_mullo_epi32((a), (b)) +# define VSET1_32(a) _mm512_set1_epi32(a) +# define VSET1_8(a) _mm512_set1_epi8(a) +# define VSETZERO() _mm512_setzero_si512() +# define VSLL32(a, b) _mm512_slli_epi32((a), (b)) +#else +# error "unsupported vector length" +#endif + +#define VADD32_3X(a, b, c) VADD32(VADD32((a), (b)), (c)) +#define 
VADD32_4X(a, b, c, d) VADD32(VADD32((a), (b)), VADD32((c), (d))) +#define VADD32_5X(a, b, c, d, e) VADD32((a), VADD32_4X((b), (c), (d), (e))) +#define VADD32_7X(a, b, c, d, e, f, g) \ + VADD32(VADD32_3X((a), (b), (c)), VADD32_4X((d), (e), (f), (g))) + +/* Sum the 32-bit elements of v_s1 and add them to s1, and likewise for s2. */ +#undef reduce_to_32bits +static forceinline ATTRIBUTES void +ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p) +{ + __m128i v_s1_128, v_s2_128; +#if VL == 16 + { + v_s1_128 = v_s1; + v_s2_128 = v_s2; + } +#else + { + __m256i v_s1_256, v_s2_256; +# if VL == 32 + v_s1_256 = v_s1; + v_s2_256 = v_s2; +# else + /* Reduce 512 bits to 256 bits. */ + v_s1_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s1, 0), + _mm512_extracti64x4_epi64(v_s1, 1)); + v_s2_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s2, 0), + _mm512_extracti64x4_epi64(v_s2, 1)); +# endif + /* Reduce 256 bits to 128 bits. */ + v_s1_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s1_256, 0), + _mm256_extracti128_si256(v_s1_256, 1)); + v_s2_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s2_256, 0), + _mm256_extracti128_si256(v_s2_256, 1)); + } +#endif + + /* + * Reduce 128 bits to 32 bits. + * + * If the bytes were summed into v_s1 using psadbw + paddd, then ignore + * the odd-indexed elements of v_s1_128 since they are zero. + */ +#if USE_VNNI + v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x31)); +#endif + v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x31)); + v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x02)); + v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x02)); + + *s1_p += (u32)_mm_cvtsi128_si32(v_s1_128); + *s2_p += (u32)_mm_cvtsi128_si32(v_s2_128); +} +#define reduce_to_32bits ADD_SUFFIX(reduce_to_32bits) + +static u32 ATTRIBUTES +ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) +{ +#if USE_VNNI + /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */ + static const u8 _aligned_attribute(VL) raw_mults[VL] = { + #if VL == 64 + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, + #endif + #if VL >= 32 + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + #endif + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; +#else + /* + * This contains the 16-bit values [2*VL, 2*VL - 1, 2*VL - 2, ..., 1]. + * For VL==32 the ordering is weird because it has to match the way that + * vpunpcklbw and vpunpckhbw work on 128-bit lanes separately. + */ + static const u16 _aligned_attribute(VL) raw_mults[4][VL / 2] = { + #if VL == 16 + { 32, 31, 30, 29, 28, 27, 26, 25 }, + { 24, 23, 22, 21, 20, 19, 18, 17 }, + { 16, 15, 14, 13, 12, 11, 10, 9 }, + { 8, 7, 6, 5, 4, 3, 2, 1 }, + #elif VL == 32 + { 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 }, + { 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 }, + { 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 }, + { 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 }, + #else + # error "unsupported parameters" + #endif + }; + const vec_t mults_a = VLOAD(raw_mults[0]); + const vec_t mults_b = VLOAD(raw_mults[1]); + const vec_t mults_c = VLOAD(raw_mults[2]); + const vec_t mults_d = VLOAD(raw_mults[3]); +#endif + const vec_t zeroes = VSETZERO(); + const vec_t MAYBE_UNUSED ones = VSET1_8(1); + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + + /* + * If the length is large and the pointer is misaligned, align it. 
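+	 * Aligning is done with a scalar byte-at-a-time loop, followed by a
+	 * reduction mod DIVISOR so that s1 and s2 enter the vector loops small.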
+ * For smaller lengths, just take the unaligned load penalty. + */ + if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) { + do { + s1 += *p++; + s2 += s1; + len--; + } while ((uintptr_t)p & (VL-1)); + s1 %= DIVISOR; + s2 %= DIVISOR; + } + +#if USE_VNNI + /* + * This is Adler-32 using the vpdpbusd instruction from AVX512-VNNI or + * AVX-VNNI. vpdpbusd multiplies the unsigned bytes of one vector by + * the signed bytes of another vector and adds the sums in groups of 4 + * to the 32-bit elements of a third vector. We use it in two ways: + * multiplying the data bytes by a sequence like 64,63,62,...,1 for + * calculating part of s2, and multiplying the data bytes by an all-ones + * sequence 1,1,1,...,1 for calculating s1 and part of s2. The all-ones + * trick seems to be faster than the alternative of vpsadbw + vpaddd. + */ + while (len) { + /* + * Calculate the length of the next data chunk such that s1 and + * s2 are guaranteed to not exceed UINT32_MAX. + */ + size_t n = MIN(len, MAX_CHUNK_LEN & ~(4*VL - 1)); + vec_t mults = VLOAD(raw_mults); + vec_t v_s1 = zeroes; + vec_t v_s2 = zeroes; + + s2 += s1 * n; + len -= n; + + if (n >= 4 * VL) { + vec_t v_s1_b = zeroes; + vec_t v_s1_c = zeroes; + vec_t v_s1_d = zeroes; + vec_t v_s2_b = zeroes; + vec_t v_s2_c = zeroes; + vec_t v_s2_d = zeroes; + vec_t v_s1_sums = zeroes; + vec_t v_s1_sums_b = zeroes; + vec_t v_s1_sums_c = zeroes; + vec_t v_s1_sums_d = zeroes; + vec_t tmp0, tmp1; + + do { + vec_t data_a = VLOADU(p + 0*VL); + vec_t data_b = VLOADU(p + 1*VL); + vec_t data_c = VLOADU(p + 2*VL); + vec_t data_d = VLOADU(p + 3*VL); + + /* + * Workaround for gcc bug where it generates + * unnecessary move instructions + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892) + */ + #if GCC_PREREQ(1, 0) + __asm__("" : "+v" (data_a), "+v" (data_b), + "+v" (data_c), "+v" (data_d)); + #endif + + v_s2 = VDPBUSD(v_s2, data_a, mults); + v_s2_b = VDPBUSD(v_s2_b, data_b, mults); + v_s2_c = VDPBUSD(v_s2_c, data_c, mults); + v_s2_d = VDPBUSD(v_s2_d, data_d, mults); + + v_s1_sums = VADD32(v_s1_sums, v_s1); + v_s1_sums_b = VADD32(v_s1_sums_b, v_s1_b); + v_s1_sums_c = VADD32(v_s1_sums_c, v_s1_c); + v_s1_sums_d = VADD32(v_s1_sums_d, v_s1_d); + + v_s1 = VDPBUSD(v_s1, data_a, ones); + v_s1_b = VDPBUSD(v_s1_b, data_b, ones); + v_s1_c = VDPBUSD(v_s1_c, data_c, ones); + v_s1_d = VDPBUSD(v_s1_d, data_d, ones); + + /* Same gcc bug workaround. See above */ + #if GCC_PREREQ(1, 0) && !defined(ARCH_X86_32) + __asm__("" : "+v" (v_s2), "+v" (v_s2_b), + "+v" (v_s2_c), "+v" (v_s2_d), + "+v" (v_s1_sums), + "+v" (v_s1_sums_b), + "+v" (v_s1_sums_c), + "+v" (v_s1_sums_d), + "+v" (v_s1), "+v" (v_s1_b), + "+v" (v_s1_c), "+v" (v_s1_d)); + #endif + p += 4*VL; + n -= 4*VL; + } while (n >= 4*VL); + + /* + * Reduce into v_s1 and v_s2 as follows: + * + * v_s2 = v_s2 + v_s2_b + v_s2_c + v_s2_d + + * (4*VL)*(v_s1_sums + v_s1_sums_b + + * v_s1_sums_c + v_s1_sums_d) + + * (3*VL)*v_s1 + (2*VL)*v_s1_b + VL*v_s1_c + * v_s1 = v_s1 + v_s1_b + v_s1_c + v_s1_d + */ + tmp0 = VADD32(v_s1, v_s1_b); + tmp1 = VADD32(v_s1, v_s1_c); + v_s1_sums = VADD32_4X(v_s1_sums, v_s1_sums_b, + v_s1_sums_c, v_s1_sums_d); + v_s1 = VADD32_3X(tmp0, v_s1_c, v_s1_d); + v_s2 = VADD32_7X(VSLL32(v_s1_sums, LOG2_VL + 2), + VSLL32(tmp0, LOG2_VL + 1), + VSLL32(tmp1, LOG2_VL), + v_s2, v_s2_b, v_s2_c, v_s2_d); + } + + /* Process the last 0 <= n < 4*VL bytes of the chunk. 
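+		 * First a pair of whole vectors may be processed, then one
+		 * more whole vector, and finally any remaining 1 <= n <= VL
+		 * bytes, which are loaded with a masked load if USE_MASKING or
+		 * zero-padded via memcpy otherwise.  The weights in 'mults'
+		 * are adjusted with VADD8 so that the last byte of the chunk
+		 * is still multiplied by 1 in the s2 update.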
*/
+		if (n >= 2*VL) {
+			const vec_t data_a = VLOADU(p + 0*VL);
+			const vec_t data_b = VLOADU(p + 1*VL);
+
+			v_s2 = VADD32(v_s2, VSLL32(v_s1, LOG2_VL + 1));
+			v_s1 = VDPBUSD(v_s1, data_a, ones);
+			v_s1 = VDPBUSD(v_s1, data_b, ones);
+			v_s2 = VDPBUSD(v_s2, data_a, VSET1_8(VL));
+			v_s2 = VDPBUSD(v_s2, data_a, mults);
+			v_s2 = VDPBUSD(v_s2, data_b, mults);
+			p += 2*VL;
+			n -= 2*VL;
+		}
+		if (n) {
+			/* Process the last 0 < n < 2*VL bytes of the chunk. */
+			vec_t data;
+
+			v_s2 = VADD32(v_s2, VMULLO32(v_s1, VSET1_32(n)));
+
+			mults = VADD8(mults, VSET1_8((int)n - VL));
+			if (n > VL) {
+				data = VLOADU(p);
+				v_s1 = VDPBUSD(v_s1, data, ones);
+				v_s2 = VDPBUSD(v_s2, data, mults);
+				p += VL;
+				n -= VL;
+				mults = VADD8(mults, VSET1_8(-VL));
+			}
+			/*
+			 * Process the last 0 < n <= VL bytes of the chunk.
+			 * Utilize a masked load if it's available.
+			 */
+		#if USE_MASKING
+			data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p);
+		#else
+			data = zeroes;
+			memcpy(&data, p, n);
+		#endif
+			v_s1 = VDPBUSD(v_s1, data, ones);
+			v_s2 = VDPBUSD(v_s2, data, mults);
+			p += n;
+		}
+
+		reduce_to_32bits(v_s1, v_s2, &s1, &s2);
+		s1 %= DIVISOR;
+		s2 %= DIVISOR;
+	}
+#else /* USE_VNNI */
+	/*
+	 * This is Adler-32 for SSE2 and AVX2.
+	 *
+	 * To horizontally sum bytes, use psadbw + paddd, where one of the
+	 * arguments to psadbw is all-zeroes.
+	 *
+	 * For the s2 contribution from (2*VL - i)*data[i] for each of the 2*VL
+	 * bytes of each iteration of the inner loop, use punpck{l,h}bw + paddw
+	 * to sum, for each i across iterations, byte i into a corresponding
+	 * 16-bit counter in v_byte_sums_*.  After the inner loop, use pmaddwd
+	 * to multiply each counter i by (2*VL - i), then add the products to
+	 * s2.
+	 *
+	 * An alternative implementation would use pmaddubsw and pmaddwd in the
+	 * inner loop to do (2*VL-i)*data[i] directly and add the products in
+	 * groups of 4 to 32-bit counters.  However, on average that approach
+	 * seems to be slower than the current approach which delays the
+	 * multiplications.  Also, pmaddubsw requires SSSE3; the current
+	 * approach keeps the implementation aligned between SSE2 and AVX2.
+	 *
+	 * The inner loop processes 2*VL bytes per iteration.  Increasing this
+	 * to 4*VL doesn't seem to be helpful here.
+	 */
+	while (len) {
+		/*
+		 * Calculate the length of the next data chunk such that s1 and
+		 * s2 are guaranteed to not exceed UINT32_MAX, and every
+		 * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX.
+		 * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are
+		 * used with pmaddwd which does signed multiplication.  In the
+		 * SSE2 case this limits chunks to 4096 bytes instead of 5504.
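+		 * (For VL == 16, 2*VL*(INT16_MAX/UINT8_MAX) = 32*128 = 4096;
+		 * for VL == 32 the same bound is 8192, so MAX_CHUNK_LEN is the
+		 * limiting factor there instead.)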
+ */ + size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX), + MAX_CHUNK_LEN) & ~(2*VL - 1)); + len -= n; + + if (n >= 2*VL) { + vec_t v_s1 = zeroes; + vec_t v_s1_sums = zeroes; + vec_t v_byte_sums_a = zeroes; + vec_t v_byte_sums_b = zeroes; + vec_t v_byte_sums_c = zeroes; + vec_t v_byte_sums_d = zeroes; + vec_t v_s2; + + s2 += s1 * (n & ~(2*VL - 1)); + + do { + vec_t data_a = VLOADU(p + 0*VL); + vec_t data_b = VLOADU(p + 1*VL); + + v_s1_sums = VADD32(v_s1_sums, v_s1); + v_byte_sums_a = VADD16(v_byte_sums_a, + VUNPACKLO8(data_a, zeroes)); + v_byte_sums_b = VADD16(v_byte_sums_b, + VUNPACKHI8(data_a, zeroes)); + v_byte_sums_c = VADD16(v_byte_sums_c, + VUNPACKLO8(data_b, zeroes)); + v_byte_sums_d = VADD16(v_byte_sums_d, + VUNPACKHI8(data_b, zeroes)); + v_s1 = VADD32(v_s1, + VADD32(VSAD8(data_a, zeroes), + VSAD8(data_b, zeroes))); + /* + * Workaround for gcc bug where it generates + * unnecessary move instructions + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892) + */ + #if GCC_PREREQ(1, 0) + __asm__("" : "+x" (v_s1), "+x" (v_s1_sums), + "+x" (v_byte_sums_a), + "+x" (v_byte_sums_b), + "+x" (v_byte_sums_c), + "+x" (v_byte_sums_d)); + #endif + p += 2*VL; + n -= 2*VL; + } while (n >= 2*VL); + + /* + * Calculate v_s2 as (2*VL)*v_s1_sums + + * [2*VL, 2*VL - 1, 2*VL - 2, ..., 1] * v_byte_sums. + * Then update s1 and s2 from v_s1 and v_s2. + */ + v_s2 = VADD32_5X(VSLL32(v_s1_sums, LOG2_VL + 1), + VMADD16(v_byte_sums_a, mults_a), + VMADD16(v_byte_sums_b, mults_b), + VMADD16(v_byte_sums_c, mults_c), + VMADD16(v_byte_sums_d, mults_d)); + reduce_to_32bits(v_s1, v_s2, &s1, &s2); + } + /* + * Process the last 0 <= n < 2*VL bytes of the chunk using + * scalar instructions, then reduce s1 and s2 mod DIVISOR. + */ + adler32_generic_noreduce(&s1, &s2, p, n); + p += n; + s1 %= DIVISOR; + s2 %= DIVISOR; + } +#endif /* !USE_VNNI */ + return (s2 << 16) | s1; +} + +#undef vec_t +#undef mask_t +#undef LOG2_VL +#undef VADD8 +#undef VADD16 +#undef VADD32 +#undef VDPBUSD +#undef VLOAD +#undef VLOADU +#undef VMADD16 +#undef VMASKZ_LOADU +#undef VMULLO32 +#undef VSAD8 +#undef VSET1_8 +#undef VSET1_32 +#undef VSETZERO +#undef VSLL32 +#undef VUNPACKHI8 +#undef VUNPACKLO8 + +#undef SUFFIX +#undef ATTRIBUTES +#undef VL +#undef USE_VNNI +#undef USE_MASKING diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh index 03707752..12fb405f 100755 --- a/scripts/checksum_benchmarks.sh +++ b/scripts/checksum_benchmarks.sh @@ -198,12 +198,16 @@ echo case $ARCH in i386|x86_64) if have_cpu_features avx512bw avx512_vnni; then - do_benchmark "AVX512BW/AVX512VNNI" + do_benchmark "AVX512VNNI/VL512" + disable_cpu_feature zmm + if have_cpu_features avx512vl; then + do_benchmark "AVX512VNNI/VL256" + fi disable_cpu_feature avx512_vnni "-mno-avx512vnni" disable_cpu_feature avx512bw "-mno-avx512bw" fi if have_cpu_features avx2 avx_vnni; then - do_benchmark "AVX2/AVX-VNNI" + do_benchmark "AVX-VNNI" disable_cpu_feature avx_vnni "-mno-avxvnni" fi if have_cpu_features avx2; then diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index 51a7f4b0..4419bc06 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -142,8 +142,7 @@ build_and_run_tests() if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then case "$ARCH" in i386|x86_64) - features+=(zmm avx_vnni avx512_vnni vpclmulqdq - avx512vl avx512bw avx512f + features+=(zmm avx512_vnni avx512vl avx_vnni vpclmulqdq avx2 avx bmi2 pclmulqdq sse2) ;; arm*|aarch*)