Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lib/x86/crc32: target pclmul,sse4.1 instead of pclmul #397

Merged
merged 1 commit into from
Oct 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion lib/x86/cpu_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,12 @@ void libdeflate_init_x86_cpu_features(void)
family += (a >> 20) & 0xff;
if (d & (1 << 26))
features |= X86_CPU_FEATURE_SSE2;
if (c & (1 << 1))
/*
* No known CPUs have pclmulqdq without sse4.1, so in practice code
* targeting pclmulqdq can use sse4.1 instructions. But to be safe,
* explicitly check for both the pclmulqdq and sse4.1 bits.
*/
if ((c & (1 << 1)) && (c & (1 << 19)))
features |= X86_CPU_FEATURE_PCLMULQDQ;
if (c & (1 << 27))
xcr0 = read_xcr(0);
Expand Down
3 changes: 2 additions & 1 deletion lib/x86/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_SSE2_NATIVE 0
#endif

#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__))
#if (defined(__PCLMUL__) && defined(__SSE4_1__)) || \
(defined(_MSC_VER) && defined(__AVX2__))
# define HAVE_PCLMULQDQ(features) 1
#else
# define HAVE_PCLMULQDQ(features) ((features) & X86_CPU_FEATURE_PCLMULQDQ)
Expand Down
24 changes: 8 additions & 16 deletions lib/x86/crc32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,31 +44,26 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
};

#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
/* PCLMULQDQ implementation */
/*
* PCLMULQDQ implementation. This targets PCLMULQDQ+SSE4.1, since in practice
* all CPUs that support PCLMULQDQ also support SSE4.1.
*/
# define crc32_x86_pclmulqdq crc32_x86_pclmulqdq
# define SUFFIX _pclmulqdq
# define ATTRIBUTES _target_attribute("pclmul")
# define ATTRIBUTES _target_attribute("pclmul,sse4.1")
# define VL 16
# define USE_SSE4_1 0
# define USE_AVX512 0
# include "crc32_pclmul_template.h"

/*
* PCLMULQDQ/AVX implementation. Compared to the regular PCLMULQDQ
* implementation, this still uses 128-bit vectors, but it has two potential
* benefits. First, simply compiling against the AVX target can improve
* performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) without
* actually using any AVX intrinsics, probably due to the availability of
* non-destructive VEX-encoded instructions. Second, AVX support implies SSSE3
* and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient
* handling of partial blocks. (We *could* compile a variant with
* PCLMULQDQ+SSE4.1 without AVX, but for simplicity we currently don't bother.)
* PCLMULQDQ/AVX implementation. Same as above, but this is compiled with AVX
* enabled so that the compiler can generate VEX-encoded instructions, which can
* be slightly more efficient. It still uses 128-bit vectors.
*/
# define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx
# define SUFFIX _pclmulqdq_avx
# define ATTRIBUTES _target_attribute("pclmul,avx")
# define VL 16
# define USE_SSE4_1 1
# define USE_AVX512 0
# include "crc32_pclmul_template.h"
#endif
Expand All @@ -90,7 +85,6 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
# define SUFFIX _vpclmulqdq_avx2
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2")
# define VL 32
# define USE_SSE4_1 1
# define USE_AVX512 0
# include "crc32_pclmul_template.h"
#endif
Expand All @@ -108,7 +102,6 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
# define SUFFIX _vpclmulqdq_avx512_vl256
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl" NO_EVEX512)
# define VL 32
# define USE_SSE4_1 1
# define USE_AVX512 1
# include "crc32_pclmul_template.h"

Expand All @@ -121,7 +114,6 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
# define SUFFIX _vpclmulqdq_avx512_vl512
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl" EVEX512)
# define VL 64
# define USE_SSE4_1 1
# define USE_AVX512 1
# include "crc32_pclmul_template.h"
#endif
Expand Down
37 changes: 6 additions & 31 deletions lib/x86/crc32_pclmul_template.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,13 @@
* ATTRIBUTES:
* Target function attributes to use. Must satisfy the dependencies of the
* other parameters as follows:
* VL=16 && USE_SSE4_1=0 && USE_AVX512=0: at least pclmul
* VL=16 && USE_SSE4_1=1 && USE_AVX512=0: at least pclmul,sse4.1
* VL=32 && USE_SSE4_1=1 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
* VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
* VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
* VL=16 && USE_AVX512=0: at least pclmul,sse4.1
* VL=32 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
* VL=32 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
* VL=64 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
* (Other combinations are not useful and have not been tested.)
* VL:
* Vector length in bytes. Must be 16, 32, or 64.
* USE_SSE4_1:
* If 1, take advantage of SSE4.1 instructions such as pblendvb.
* If 0, assume that the CPU might not support SSE4.1.
* USE_AVX512:
* If 1, take advantage of AVX-512 features such as masking and the
* vpternlog instruction. This doesn't enable the use of 512-bit vectors;
Expand Down Expand Up @@ -149,7 +145,6 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults)
#define fold_vec512 ADD_SUFFIX(fold_vec512)
#endif /* VL >= 64 */

#if USE_SSE4_1
/*
* Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to
* the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and
Expand Down Expand Up @@ -181,7 +176,6 @@ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
return fold_vec128(x0, x1, mults_128b);
}
#define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes)
#endif /* USE_SSE4_1 */

static ATTRIBUTES u32
ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
Expand Down Expand Up @@ -273,7 +267,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
size_t align = -(uintptr_t)p & (VL-1);

len -= align;
#if USE_SSE4_1
x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
p += 16;
if (align & 15) {
Expand All @@ -296,11 +289,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
# endif
p -= 16;
#else
crc = crc32_slice1(crc, p, align);
p += align;
v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
#endif
} else {
v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
}
Expand Down Expand Up @@ -395,14 +383,9 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
less_than_16_remaining:
len &= 15;

/*
* If fold_lessthan16bytes() is available, handle any remainder
* of 1 to 15 bytes now, before reducing to 32 bits.
*/
#if USE_SSE4_1
/* Handle any remainder of 1 to 15 bytes. */
if (len)
x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
#endif
#if USE_AVX512
reduce_x0:
#endif
Expand Down Expand Up @@ -467,14 +450,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
barrett_reduction_constants, 0x10);
x0 = _mm_xor_si128(x0, x1);
#if USE_SSE4_1
crc = _mm_extract_epi32(x0, 1);
#else
crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01));
/* Process up to 15 bytes left over at the end. */
crc = crc32_slice1(crc, p, len);
#endif
return crc;
return _mm_extract_epi32(x0, 1);
}

#undef vec_t
Expand All @@ -491,5 +467,4 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
#undef SUFFIX
#undef ATTRIBUTES
#undef VL
#undef USE_SSE4_1
#undef USE_AVX512