liblzma: ARM64 CRC32: Align the buffer faster

Instead of aligning the buffer byte by byte, use the 1/2/4-byte CRC32 instructions. The trailing bytes left after the 8-byte loop are handled the same way.
This commit is contained in:
Lasse Collin 2024-06-28 14:20:49 +03:00
parent 7e99856f66
commit 0ed8936685
1 changed files with 40 additions and 14 deletions

View File

@ -8,6 +8,7 @@
// Authors: Chenxi Mao // Authors: Chenxi Mao
// Jia Tan // Jia Tan
// Hans Jansen // Hans Jansen
// Lasse Collin
// //
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -54,25 +55,50 @@ crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
{ {
crc = ~crc; crc = ~crc;
if (size >= 8) {
// Align the input buffer because this was shown to be // Align the input buffer because this was shown to be
// significantly faster than unaligned accesses. // significantly faster than unaligned accesses.
const size_t align_amount = my_min(size, (0U - (uintptr_t)buf) & 7); const size_t align = (0 - (uintptr_t)buf) & 7;
for (const uint8_t *limit = buf + align_amount; buf < limit; ++buf) if (align & 1)
crc = __crc32b(crc, *buf); crc = __crc32b(crc, *buf++);
size -= align_amount; if (align & 2) {
crc = __crc32h(crc, aligned_read16le(buf));
buf += 2;
}
if (align & 4) {
crc = __crc32w(crc, aligned_read32le(buf));
buf += 4;
}
size -= align;
// Process 8 bytes at a time. The end point is determined by // Process 8 bytes at a time. The end point is determined by
// ignoring the least significant three bits of size to ensure // ignoring the least significant three bits of size to
// we do not process past the bounds of the buffer. This guarantees // ensure we do not process past the bounds of the buffer.
// that the processed length is a multiple of 8 and does not exceed size. // This guarantees that the processed length is a multiple of 8
// and does not exceed size.
for (const uint8_t *limit = buf + (size & ~(size_t)7); for (const uint8_t *limit = buf + (size & ~(size_t)7);
buf < limit; buf += 8) buf < limit; buf += 8)
crc = __crc32d(crc, aligned_read64le(buf)); crc = __crc32d(crc, aligned_read64le(buf));
size &= 7;
}
// Process the remaining bytes that are not 8 byte aligned. // Process the remaining bytes that are not 8 byte aligned.
for (const uint8_t *limit = buf + (size & 7); buf < limit; ++buf) if (size & 4) {
crc = __crc32w(crc, aligned_read32le(buf));
buf += 4;
}
if (size & 2) {
crc = __crc32h(crc, aligned_read16le(buf));
buf += 2;
}
if (size & 1)
crc = __crc32b(crc, *buf); crc = __crc32b(crc, *buf);
return ~crc; return ~crc;