liblzma: ARM64 CRC32: Align the buffer faster

Instead of doing it byte by byte, use the 1/2/4-byte CRC32 instructions.
This commit is contained in:
Lasse Collin 2024-06-28 14:20:49 +03:00
parent 7e99856f66
commit 0ed8936685
1 changed file with 40 additions and 14 deletions

View File

@@ -8,6 +8,7 @@
// Authors: Chenxi Mao
// Jia Tan
// Hans Jansen
// Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////
@@ -54,25 +55,50 @@ crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
{
crc = ~crc;
// Align the input buffer because this was shown to be
// significantly faster than unaligned accesses.
const size_t align_amount = my_min(size, (0U - (uintptr_t)buf) & 7);
if (size >= 8) {
// Align the input buffer because this was shown to be
// significantly faster than unaligned accesses.
const size_t align = (0 - (uintptr_t)buf) & 7;
for (const uint8_t *limit = buf + align_amount; buf < limit; ++buf)
crc = __crc32b(crc, *buf);
if (align & 1)
crc = __crc32b(crc, *buf++);
size -= align_amount;
if (align & 2) {
crc = __crc32h(crc, aligned_read16le(buf));
buf += 2;
}
// Process 8 bytes at a time. The end point is determined by
// ignoring the least significant three bits of size to ensure
// we do not process past the bounds of the buffer. This guarantees
// that limit is a multiple of 8 and is strictly less than size.
for (const uint8_t *limit = buf + (size & ~(size_t)7);
buf < limit; buf += 8)
crc = __crc32d(crc, aligned_read64le(buf));
if (align & 4) {
crc = __crc32w(crc, aligned_read32le(buf));
buf += 4;
}
size -= align;
// Process 8 bytes at a time. The end point is determined by
// ignoring the least significant three bits of size to
// ensure we do not process past the bounds of the buffer.
// This guarantees that limit is a multiple of 8 and is
// strictly less than size.
for (const uint8_t *limit = buf + (size & ~(size_t)7);
buf < limit; buf += 8)
crc = __crc32d(crc, aligned_read64le(buf));
size &= 7;
}
// Process the remaining bytes that are not 8 byte aligned.
for (const uint8_t *limit = buf + (size & 7); buf < limit; ++buf)
if (size & 4) {
crc = __crc32w(crc, aligned_read32le(buf));
buf += 4;
}
if (size & 2) {
crc = __crc32h(crc, aligned_read16le(buf));
buf += 2;
}
if (size & 1)
crc = __crc32b(crc, *buf);
return ~crc;