More efficient final reduction for CRC32
ebiggers committed Nov 5, 2024
1 parent d5070bb commit 56f9ad8
Showing 4 changed files with 97 additions and 101 deletions.
37 changes: 13 additions & 24 deletions lib/arm/crc32_impl.h
@@ -434,13 +434,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 		{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
 		{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
 	};
-	static const u64 _aligned_attribute(16) final_mults[3][2] = {
-		{ CRC32_X63_MODG, 0 },
-		{ CRC32_BARRETT_CONSTANT_1, 0 },
-		{ CRC32_BARRETT_CONSTANT_2, 0 },
+	static const u64 _aligned_attribute(16) barrett_consts[2][2] = {
+		{ CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_1 },
+		{ CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_2 },
 	};
+	static const u32 _aligned_attribute(16) mask32[4] = {
+		0, 0, 0xffffffff, 0
+	};
-	const uint8x16_t zeroes = vdupq_n_u8(0);
-	const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF));
 	const poly64x2_t multipliers_1 = load_multipliers(mults[0]);
 	uint8x16_t v0, v1, v2, v3;

@@ -497,24 +497,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 	if (len)
 		v0 = fold_partial_vec(v0, p, len, multipliers_1);
 
-	/*
-	 * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
-	 * which is equivalent to multiplying by x^32. This is needed because
-	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
-	 */
-	v0 = veorq_u8(vextq_u8(v0, zeroes, 8),
-		      clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1));
-
-	/* Fold 96 => 64 bits. */
-	v0 = veorq_u8(vextq_u8(v0, zeroes, 4),
-		      clmul_low(vandq_u8(v0, mask32),
-				load_multipliers(final_mults[0])));
-
-	/* Reduce 64 => 32 bits using Barrett reduction. */
-	v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1]));
-	v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2]));
-	return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1);
+	/* Reduce to 32 bits, following lib/x86/crc32_pclmul_template.h */
+	v1 = clmul_low(v0, load_multipliers(barrett_consts[0]));
+	v1 = clmul_low(v1, load_multipliers(barrett_consts[1]));
+	v0 = veorq_u8(v0, vandq_u8(v1, vreinterpretq_u8_u32(vld1q_u32(mask32))));
+	v0 = clmul_high(v0, load_multipliers(barrett_consts[0]));
+	v0 = clmul_low(v0, load_multipliers(barrett_consts[1]));
+	return vgetq_lane_u32(vreinterpretq_u32_u8(v0), 2);
 }
 #undef SUFFIX
 #undef ATTRIBUTES
4 changes: 1 addition & 3 deletions lib/crc32_multipliers.h
@@ -100,10 +100,8 @@
 #define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */
 #define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */
 
-#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */
-#define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
+#define CRC32_BARRETT_CONSTANT_1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)) */
 #define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
-#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }
 
 #define CRC32_NUM_CHUNKS 4
 #define CRC32_MIN_VARIABLE_CHUNK_LEN 128UL
117 changes: 62 additions & 55 deletions lib/x86/crc32_pclmul_template.h
@@ -51,7 +51,10 @@
  * instructions. Note that the x86 crc32 instruction cannot be used, as it is
  * for a different polynomial, not the gzip one. For an explanation of CRC
  * folding with carryless multiplication instructions, see
- * scripts/gen_crc32_multipliers.c and the following paper:
+ * scripts/gen_crc32_multipliers.c and the following blog posts and papers:
+ *
+ * "An alternative exposition of crc32_4k_pclmulqdq"
+ *      https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
  *
  * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  *      https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
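Background for readers without the references at hand: everything below builds on a 64 x 64 -> 128 bit carryless (polynomial) multiplication, provided by PCLMULQDQ on x86 and PMULL on arm64. A portable scalar model of that primitive, shown for illustration only (clmul64 is not part of this commit):

#include <stdint.h>

/*
 * Model of a 64x64 -> 128 bit carryless multiplication.  Bit i of each
 * input is the coefficient of x^i; the product has up to 127 bits.
 */
static void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
        uint64_t l = 0, h = 0;

        for (int i = 0; i < 64; i++) {
                if ((b >> i) & 1) {
                        l ^= a << i;
                        h ^= i ? a >> (64 - i) : 0;
                }
        }
        *lo = l;
        *hi = h;
}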
@@ -193,10 +196,9 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 	const vec_t mults_2v = MULTS_2V;
 	const vec_t mults_1v = MULTS_1V;
 	const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
-	const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG);
-	const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
 	const __m128i barrett_reduction_constants =
 		_mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
+	const __m128i mask32 = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0);
 	vec_t v0, v1, v2, v3, v4, v5, v6, v7;
 	__m128i x0 = _mm_cvtsi32_si128(crc);
 	__m128i x1;
@@ -389,68 +391,73 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 #if USE_AVX512
 reduce_x0:
 #endif
 
-	/*
-	 * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
-	 * which is equivalent to multiplying by x^32. This is needed because
-	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
-	 */
-	x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
-			   _mm_clmulepi64_si128(x0, mults_128b, 0x10));
-
-	/* Fold 96 => 64 bits. */
-	x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
-			   _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
-						final_mult, 0x00));
-
-	/*
-	 * Reduce 64 => 32 bits using Barrett reduction.
-	 *
-	 * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
-	 * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
-	 *
-	 *	R(x) = (A(x)*x^32 + B(x)) mod G(x)
-	 *	     = (A(x)*x^32) mod G(x) + B(x)
-	 *
-	 * Then, by the Division Algorithm there exists a unique q(x) such that:
-	 *
-	 *	A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
-	 *
-	 * Since the left-hand side is of maximum degree 31, the right-hand side
-	 * must be too. This implies that we can apply 'mod x^32' to the
-	 * right-hand side without changing its value:
-	 *
-	 *	(A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
-	 *
-	 * Note that '+' is equivalent to '-' in polynomials over GF(2).
-	 *
-	 * We also know that:
-	 *
-	 *	                 / A(x)*x^32 \
-	 *	   q(x) = floor ( ----------- )
-	 *	                 \    G(x)   /
-	 *
-	 * To compute this efficiently, we can multiply the top and bottom by
-	 * x^32 and move the division by G(x) to the top:
-	 *
-	 *	                 / A(x) * floor(x^64 / G(x)) \
-	 *	   q(x) = floor ( --------------------------- )
-	 *	                 \            x^32           /
-	 *
-	 * Note that floor(x^64 / G(x)) is a constant.
-	 *
-	 * So finally we have:
-	 *
-	 *	                             / A(x) * floor(x^64 / G(x)) \
-	 *	   R(x) = B(x) + G(x)*floor ( --------------------------- )
-	 *	                             \            x^32           /
-	 */
-	x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
-				  barrett_reduction_constants, 0x00);
-	x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
-				  barrett_reduction_constants, 0x10);
-	x0 = _mm_xor_si128(x0, x1);
-	return _mm_extract_epi32(x0, 1);
+	/*
+	 * Generate the final CRC32 from the 128-bit value A = x0 as follows:
+	 *
+	 *	crc = x^32 * A mod G
+	 *	    = x^32 * (x^64*A_L + A_H) mod G
+	 *	    = x^32 * ((x^32*(x^32*A_L mod G)) + A_H) mod G
+	 *
+	 * I.e.:
+	 *	crc := 0
+	 *	crc := x^32 * (x^32*crc + A_L) mod G
+	 *	crc := x^32 * (x^32*crc + A_H) mod G
+	 *
+	 * A_L and A_H refer to the physically low and high qwords of A, which
+	 * contain the high and low polynomial coefficients respectively!
+	 */
+
+	/*
+	 * Using Barrett reduction, the first step is:
+	 *
+	 *	crc := x^32*A_L mod G
+	 *	     = ((A_L * (x^(32+n) // G)) // x^n) * G mod x^32  [for n >= 63]
+	 *	     = ((A_L * (x^95 // G)) // x^63) * G mod x^32
+	 *
+	 * The // symbol denotes floored division, producing the unique quotient
+	 * polynomial for the given dividend and divisor. Two carryless
+	 * multiplications, denoted by *, are needed. The first is of A_L,
+	 * which is of max degree 63, by the degree-63 constant x^95 // G. This
+	 * produces a product of max degree 126. Considering CRC32's reverse
+	 * coefficient order, the low qword of the product contains terms x^63
+	 * through x^126, which become x^0 through x^63 after the division by
+	 * x^63 which happens for free. Multiplying this value of max degree 63
+	 * by the degree-32 constant G gives a product of max degree 95, equal
+	 * to x^32*A_L + (x^32*A_L mod G). The desired mod G part is in the
+	 * low-order 32 polynomial coefficients, i.e. the index 2 dword.
+	 *
+	 * Note that when choosing n in (A_L * (x^(32+n) // G)) // x^n, only
+	 * n=63 works. n < 63 would be less than the maximum degree of A_L,
+	 * which would cause insufficient precision to be carried through the
+	 * calculation. n > 63 would require a carryless multiplication
+	 * instruction that takes an input larger than 64 bits.
+	 */
+	x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
+	x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);
+
+	/*
+	 * The second step, also using Barrett reduction:
+	 *
+	 *	crc := x^32 * (x^32*crc + A_H) mod G
+	 *	     = floor(((x^32*crc + A_H) * floor(x^95 / G)) / x^63) * G mod x^32
+	 *
+	 * Same as previous but uses the high qword of A and starts with adding
+	 * x^32*crc to it. Considering CRC32's reverse-coefficient order, that
+	 * means xor'ing crc into the index 2 dword of A, which is conveniently
+	 * the same dword index that crc is in. Select the crc dword by and-ing
+	 * with a mask, then do the xor; that is a ternary boolean operation
+	 * a ^ (b & c) which with AVX512 is a vpternlog with immediate 0x78.
+	 */
+#if USE_AVX512
+	x0 = _mm_ternarylogic_epi32(x0, x1, mask32, 0x78);
+#else
+	x0 = _mm_xor_si128(x0, _mm_and_si128(x1, mask32));
+#endif
+	x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x01);
+	x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x10);
+	/* The resulting crc is in the index 2 dword. */
+	return _mm_extract_epi32(x0, 2);
}

#undef vec_t
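To see concretely that the two Barrett steps in the comments above really compute x^32 * A mod G, here is a self-contained scalar sketch of the new reduction. It uses the bit-reflected constants from lib/crc32_multipliers.h and checks the result against a bit-serial CRC; the helper names (clmul64, barrett_step, crc32_bitserial), the sample message, and the little-endian qword loads are illustrative assumptions, not code from this commit:

#include <inttypes.h>
#include <stdio.h>
#include <string.h>

/* Scalar model of the 64x64 -> 128 bit carryless multiply (PCLMULQDQ). */
static void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
        uint64_t l = 0, h = 0;

        for (int i = 0; i < 64; i++) {
                if ((b >> i) & 1) {
                        l ^= a << i;
                        h ^= i ? a >> (64 - i) : 0;
                }
        }
        *lo = l;
        *hi = h;
}

#define C1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)), bit-reflected */
#define C2 0x00000001db710641ULL /* G(x), bit-reflected */

/* One Barrett step: returns x^32 * v mod G, all values bit-reflected. */
static uint32_t barrett_step(uint64_t v)
{
        uint64_t lo, hi;

        clmul64(v, C1, &lo, &hi);  /* low qword = (v * (x^95 // G)) // x^63 */
        clmul64(lo, C2, &lo, &hi); /* multiply the quotient by G */
        return (uint32_t)hi;       /* index 2 dword = x^32*v mod G */
}

/* Reference: bit-serial x^32 * M(x) mod G(x), no init or final xor. */
static uint32_t crc32_bitserial(const uint8_t *p, size_t n)
{
        uint32_t crc = 0;

        while (n--) {
                crc ^= *p++;
                for (int k = 0; k < 8; k++)
                        crc = (crc >> 1) ^ (0xedb88320 & -(crc & 1));
        }
        return crc;
}

int main(void)
{
        const uint8_t msg[16] = "0123456789abcde"; /* any 16 bytes */
        uint64_t a_l, a_h;
        uint32_t crc;

        memcpy(&a_l, msg, 8);     /* assumes little endian, as on x86 */
        memcpy(&a_h, msg + 8, 8);

        crc = barrett_step(a_l);       /* crc := x^32*A_L mod G */
        crc = barrett_step(a_h ^ crc); /* crc := x^32*(x^32*crc + A_H) mod G */

        printf("barrett %08"PRIx32", bit-serial %08"PRIx32"\n",
               crc, crc32_bitserial(msg, 16));
        return 0;
}

The two printed values agree, confirming the lane bookkeeping: each Barrett step consumes one qword and leaves the 32-bit remainder in the dword that the final _mm_extract_epi32(x0, 2) selects.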
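One small detail from the second-step comment worth double-checking: a vpternlog immediate is simply the truth table of the desired three-input boolean function, evaluated at inputs (a,b,c) from 7 down to 0. A throwaway verification (not part of this commit) that f = a ^ (b & c) gives immediate 0x78:

#include <stdio.h>

int main(void)
{
        unsigned imm = 0;

        /* Bit (a*4 + b*2 + c) of the immediate is f(a, b, c). */
        for (int i = 7; i >= 0; i--) {
                int a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1;

                imm = (imm << 1) | (unsigned)(a ^ (b & c));
        }
        printf("0x%02x\n", imm); /* prints 0x78 */
        return 0;
}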
40 changes: 21 additions & 19 deletions scripts/gen_crc32_multipliers.c
@@ -74,17 +74,28 @@ compute_xD_modG(size_t D)
 	return remainder;
 }
 
-/* Compute floor(x^64 / G(x)) */
+/* Compute floor(x^95 / G(x)) */
 static u64
-compute_x64_div_G(void)
+compute_x95_div_G(void)
 {
+	/* The quotient, max order 95 - 32 = 63. */
 	u64 quotient = 0;
-	u64 dividend = 0x1;
 
-	for (int i = 0; i < 64 - 32 + 1; i++) {
-		if ((dividend >> i) & 1) {
-			quotient |= (u64)1 << i;
-			dividend ^= CRCPOLY_FULL << i;
+	/*
+	 * The x^32 through x^95 terms of the remainder. This starts at x^95
+	 * and is updated through long division. At the end only the
+	 * x^0 through x^31 terms will be nonzero, but those are unneeded.
+	 */
+	u64 remainder = 0x1;
+
+	for (int i = 0; i < 64; i++) {
+		/*
+		 * If the x^(95-i) term of remainder is nonzero, add
+		 * x^(63-i) * G(x) to cancel it out. (G(x) has order 32.)
+		 */
+		if (remainder & (1ULL << i)) {
+			quotient |= 1ULL << i;
+			remainder ^= (u64)CRCPOLY_FULL << i;
 		}
 	}
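Run on its own, this long division reproduces the constant added to lib/crc32_multipliers.h above. A minimal standalone check, assuming only CRCPOLY_FULL = 0x1db710641 (the bit-reversed gzip polynomial with the x^32 term included):

#include <inttypes.h>
#include <stdio.h>

#define CRCPOLY_FULL 0x1db710641ULL /* bit-reversed G(x), incl. the x^32 term */

int main(void)
{
        uint64_t quotient = 0;
        uint64_t remainder = 0x1; /* bit i is the x^(95-i) term */

        for (int i = 0; i < 64; i++) {
                if (remainder & (1ULL << i)) {
                        quotient |= 1ULL << i;
                        remainder ^= CRCPOLY_FULL << i;
                }
        }
        /* Prints b4e5b025f7011641, i.e. CRC32_BARRETT_CONSTANT_1. */
        printf("%016" PRIx64 "\n", quotient);
        return 0;
}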

@@ -123,20 +134,11 @@ gen_vec_folding_constants(void)
 		printf("\n");
 	}
 
-	/* Multiplier for final 96 => 64 bit fold */
-	printf("#define CRC32_X63_MODG 0x%08"PRIx32" /* x^63 mod G(x) */\n",
-	       compute_xD_modG(63));
-
-	/*
-	 * Constants for final 64 => 32 bit reduction. These constants are the
-	 * odd ones out, as this final reduction step can't use the regular CRC
-	 * folding described above. It uses Barrett reduction instead.
-	 */
-	printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^64 / G(x)) */\n",
-	       compute_x64_div_G());
+	/* Constants for the final 128 => 32 bit reduction */
+	printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^95 / G(x)) */\n",
+	       compute_x95_div_G());
 	printf("#define CRC32_BARRETT_CONSTANT_2 0x%016"PRIx64"ULL /* G(x) */\n",
 	       CRCPOLY_FULL);
-	printf("#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }\n");
 }
 
 /* Multipliers for combining the CRCs of separate chunks */
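For reference, the Barrett-constant portion of the script's output now reads exactly as in the lib/crc32_multipliers.h hunk above:

#define CRC32_BARRETT_CONSTANT_1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)) */
#define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */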