liblzma: CRC32 CLMUL: Refactor the constants and simplify

By using modulus-scaled constants, the final reduction can
be simplified.
This commit is contained in:
Lasse Collin 2024-06-10 15:31:01 +03:00
parent ef652ac391
commit d8fb098617
1 changed files with 9 additions and 43 deletions

View File

@ -211,31 +211,6 @@ crc_simd_body(const uint8_t *buf, const size_t size, __m128i *v0, __m128i *v1,
// x86 CLMUL CRC32 // // x86 CLMUL CRC32 //
///////////////////// /////////////////////
/*
// These functions were used to generate the constants
// at the top of crc32_arch_optimized().
// Runs n right-shift reduction steps on a, folding in p whenever the
// bit shifted out of a is set, and returns the n bits that were shifted
// out. The bit removed in step i ends up in bit i of the result.
//
// p: value XORed into a after a set bit has been shifted out
// a: starting value
// n: number of steps; n - 1 is used as a shift count, so values
//    above 64 are not meaningful for a 64-bit result
static uint64_t
calc_lo(uint64_t p, uint64_t a, int n)
{
	uint64_t collected = 0;
	int step = 0;

	while (step < n) {
		const uint64_t low_bit = a & 1;

		// Make room for the newest bit at position n - 1; the bits
		// gathered earlier slide one position towards bit 0.
		collected >>= 1;
		collected |= low_bit << (n - 1);

		// Advance a by one bit, applying p only if a set bit fell off.
		a >>= 1;
		if (low_bit != 0)
			a ^= p;

		++step;
	}

	return collected;
}
// same as ~crc(&a, sizeof(a), ~0)
// Runs n right-shift reduction steps on a and returns the final value
// of a: each step shifts a right by one bit and XORs in p if the bit
// that was shifted out was set.
//
// p: value XORed into a after a set bit has been shifted out
// a: starting value
// n: number of steps; zero or negative leaves a unchanged
static uint64_t
calc_hi(uint64_t p, uint64_t a, int n)
{
	for (int remaining = n; remaining > 0; --remaining) {
		// All-ones when the outgoing bit is set, zero otherwise.
		const uint64_t mask = (a & 1) ? ~(uint64_t)0 : (uint64_t)0;

		a = (a >> 1) ^ (p & mask);
	}

	return a;
}
*/
#ifdef BUILDING_CRC32_CLMUL #ifdef BUILDING_CRC32_CLMUL
crc_attr_target crc_attr_target
@ -246,31 +221,22 @@ crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
if (size == 0) if (size == 0)
return crc; return crc;
// uint32_t poly = 0xedb88320; // See crc_clmul_consts_gen.c.
const int64_t p = 0x1db710640; // p << 1 const __m128i vfold16 = _mm_set_epi64x(0xccaa009e, 0xae689191);
const int64_t mu = 0x1f7011641; // calc_lo(p, p, 32) << 1 | 1 const __m128i mu_p = _mm_set_epi64x(
const int64_t k5 = 0x163cd6124; // calc_hi(p, p, 32) << 1 (int64_t)0xb4e5b025f7011641, 0x1db710640);
const int64_t k4 = 0x0ccaa009e; // calc_hi(p, p, 64) << 1
const int64_t k3 = 0x1751997d0; // calc_hi(p, p, 128) << 1
const __m128i vfold4 = _mm_set_epi64x(mu, p); __m128i v0, v1;
const __m128i vfold8 = _mm_set_epi64x(0, k5);
const __m128i vfold16 = _mm_set_epi64x(k4, k3);
__m128i v0, v1, v2;
crc_simd_body(buf, size, &v0, &v1, vfold16, crc_simd_body(buf, size, &v0, &v1, vfold16,
_mm_cvtsi32_si128((int32_t)~crc)); _mm_cvtsi32_si128((int32_t)~crc));
v1 = _mm_xor_si128( v1 = _mm_xor_si128(
_mm_clmulepi64_si128(v0, vfold16, 0x10), v1); // xxx0 _mm_clmulepi64_si128(v0, vfold16, 0x10), v1); // xxx0
v2 = _mm_shuffle_epi32(v1, 0xe7); // 0xx0
v0 = _mm_slli_epi64(v1, 32); // [0] v0 = _mm_clmulepi64_si128(v1, mu_p, 0x10); // v1 * mu
v0 = _mm_clmulepi64_si128(v0, vfold8, 0x00); v0 = _mm_clmulepi64_si128(v0, mu_p, 0x00); // v0 * p
v0 = _mm_xor_si128(v0, v2); // [1] [2] v0 = _mm_xor_si128(v0, v1);
v2 = _mm_clmulepi64_si128(v0, vfold4, 0x10);
v2 = _mm_clmulepi64_si128(v2, vfold4, 0x00);
v0 = _mm_xor_si128(v0, v2); // [2]
return ~(uint32_t)_mm_extract_epi32(v0, 2); return ~(uint32_t)_mm_extract_epi32(v0, 2);
} }
#endif // BUILDING_CRC32_CLMUL #endif // BUILDING_CRC32_CLMUL