liblzma: ARM64 CRC32: Align the buffer faster

Instead of doing it byte by byte, use the 1/2/4-byte CRC32 instructions.
2025-10-28 12:02:55 +00:00 · 2024-06-28 14:20:49 +03:00 · 2024-06-28 14:20:49 +03:00 · 0ed8936685
commit 0ed8936685
parent 7e99856f66
1 changed files with 40 additions and 14 deletions
--- a/src/liblzma/check/crc32_arm64.h
+++ b/src/liblzma/check/crc32_arm64.h
@ -8,6 +8,7 @@
 //  Authors:    Chenxi Mao
 //              Jia Tan
 //              Hans Jansen
 //              Lasse Collin
 //
 ///////////////////////////////////////////////////////////////////////////////
@ -54,25 +55,50 @@ crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
 {
 	crc = ~crc;
 	if (size >= 8) {
 		// Align the input buffer because this was shown to be
 		// significantly faster than unaligned accesses.
-	const size_t align_amount = my_min(size, (0U - (uintptr_t)buf) & 7);
+		const size_t align = (0 - (uintptr_t)buf) & 7;
-	for (const uint8_t *limit = buf + align_amount; buf < limit; ++buf)
+		if (align & 1)
-		crc = __crc32b(crc, *buf);
+			crc = __crc32b(crc, *buf++);
-	size -= align_amount;
+		if (align & 2) {
 			crc = __crc32h(crc, aligned_read16le(buf));
 			buf += 2;
 		}
 		if (align & 4) {
 			crc = __crc32w(crc, aligned_read32le(buf));
 			buf += 4;
 		}
 		size -= align;
 		// Process 8 bytes at a time. The end point is determined by
-	// ignoring the least significant three bits of size to ensure
+		// ignoring the least significant three bits of size to
-	// we do not process past the bounds of the buffer. This guarantees
+		// ensure we do not process past the bounds of the buffer.
-	// that limit is a multiple of 8 and is strictly less than size.
+		// This guarantees that limit is a multiple of 8 and is
 		// strictly less than size.
 		for (const uint8_t *limit = buf + (size & ~(size_t)7);
 				buf < limit; buf += 8)
 			crc = __crc32d(crc, aligned_read64le(buf));
 		size &= 7;
 	}
 	// Process the remaining bytes that are not 8 byte aligned.
-	for (const uint8_t *limit = buf + (size & 7); buf < limit; ++buf)
+	if (size & 4) {
 		crc = __crc32w(crc, aligned_read32le(buf));
 		buf += 4;
 	}
 	if (size & 2) {
 		crc = __crc32h(crc, aligned_read16le(buf));
 		buf += 2;
 	}
 	if (size & 1)
 		crc = __crc32b(crc, *buf);
 	return ~crc;