From 0ed893668554fb0758003289f8a6af9bd08b89d1 Mon Sep 17 00:00:00 2001
From: Lasse Collin
Date: Fri, 28 Jun 2024 14:20:49 +0300
Subject: [PATCH] liblzma: ARM64 CRC32: Align the buffer faster

Instead of doing it byte by byte, use the 1/2/4-byte CRC32 instructions.
---
 src/liblzma/check/crc32_arm64.h | 54 ++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/src/liblzma/check/crc32_arm64.h b/src/liblzma/check/crc32_arm64.h
index 5bad0e00..c0609c36 100644
--- a/src/liblzma/check/crc32_arm64.h
+++ b/src/liblzma/check/crc32_arm64.h
@@ -8,6 +8,7 @@
 //  Authors:    Chenxi Mao
 //              Jia Tan
 //              Hans Jansen
+//              Lasse Collin
 //
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -54,25 +55,50 @@ crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
 {
 	crc = ~crc;
 
-	// Align the input buffer because this was shown to be
-	// significantly faster than unaligned accesses.
-	const size_t align_amount = my_min(size, (0U - (uintptr_t)buf) & 7);
+	if (size >= 8) {
+		// Align the input buffer because this was shown to be
+		// significantly faster than unaligned accesses.
+		const size_t align = (0 - (uintptr_t)buf) & 7;
 
-	for (const uint8_t *limit = buf + align_amount; buf < limit; ++buf)
-		crc = __crc32b(crc, *buf);
+		if (align & 1)
+			crc = __crc32b(crc, *buf++);
 
-	size -= align_amount;
+		if (align & 2) {
+			crc = __crc32h(crc, aligned_read16le(buf));
+			buf += 2;
+		}
 
-	// Process 8 bytes at a time. The end point is determined by
-	// ignoring the least significant three bits of size to ensure
-	// we do not process past the bounds of the buffer. This guarantees
-	// that limit is a multiple of 8 and is strictly less than size.
-	for (const uint8_t *limit = buf + (size & ~(size_t)7);
-			buf < limit; buf += 8)
-		crc = __crc32d(crc, aligned_read64le(buf));
+		if (align & 4) {
+			crc = __crc32w(crc, aligned_read32le(buf));
+			buf += 4;
+		}
+
+		size -= align;
+
+		// Process 8 bytes at a time. The end point is determined by
+		// ignoring the least significant three bits of size to
+		// ensure we do not process past the bounds of the buffer.
+		// This guarantees that limit is a multiple of 8 and is
+		// strictly less than size.
+		for (const uint8_t *limit = buf + (size & ~(size_t)7);
+				buf < limit; buf += 8)
+			crc = __crc32d(crc, aligned_read64le(buf));
+
+		size &= 7;
+	}
 
 	// Process the remaining bytes that are not 8 byte aligned.
-	for (const uint8_t *limit = buf + (size & 7); buf < limit; ++buf)
+	if (size & 4) {
+		crc = __crc32w(crc, aligned_read32le(buf));
+		buf += 4;
+	}
+
+	if (size & 2) {
+		crc = __crc32h(crc, aligned_read16le(buf));
+		buf += 2;
+	}
+
+	if (size & 1)
 		crc = __crc32b(crc, *buf);
 
 	return ~crc;
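
Note on the alignment arithmetic (not part of the patch): the new code computes
(0 - (uintptr_t)buf) & 7 as the number of bytes to the next 8-byte boundary and
then consumes bit 0 (1 byte, __crc32b), bit 1 (2 bytes, __crc32h) and bit 2
(4 bytes, __crc32w) of that value. The sketch below is a minimal standalone
check of that arithmetic only; main() and the 16-address loop are illustrative
assumptions, not liblzma code, and it does not need the ARM64 CRC32 extension
because it never calls the intrinsics themselves.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	// Try a few hypothetical buffer addresses.
	for (uintptr_t addr = 0; addr < 16; ++addr) {
		// Bytes needed to reach the next 8-byte boundary,
		// computed the same way as in the patched code.
		const size_t align = (0 - addr) & 7;

		// Consume bit 0 (1 byte), bit 1 (2 bytes) and bit 2
		// (4 bytes), mirroring the 1/2/4-byte CRC32 steps.
		uintptr_t p = addr;
		if (align & 1)
			p += 1;
		if (align & 2)
			p += 2;
		if (align & 4)
			p += 4;

		assert((p & 7) == 0);       // now 8-byte aligned
		assert(p - addr == align);  // consumed exactly align bytes
		printf("addr %2ju: skip %zu byte(s)\n",
				(uintmax_t)addr, align);
	}

	return 0;
}

Any C99 compiler can build this check; the 8-byte main loop in the patch then
starts from an aligned pointer, which is what makes aligned_read64le() safe
there.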