liblzma: ARM64 CRC32: Align the buffer faster

Instead of doing it byte by byte, use the 1/2/4-byte CRC32 instructions.
2025-07-22 04:16:38 +00:00 · 2024-06-28 14:20:49 +03:00 · 2024-06-28 14:20:49 +03:00 · 0ed8936685
commit 0ed8936685
parent 7e99856f66
1 changed files with 40 additions and 14 deletions
--- a/src/liblzma/check/crc32_arm64.h
+++ b/src/liblzma/check/crc32_arm64.h
@@ -8,6 +8,7 @@
 //  Authors:    Chenxi Mao
 //              Jia Tan
 //              Hans Jansen
+//              Lasse Collin
 //
 ///////////////////////////////////////////////////////////////////////////////

@@ -54,25 +55,50 @@ crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
 {
 	crc = ~crc;

-	// Align the input buffer because this was shown to be
-	// significantly faster than unaligned accesses.
-	const size_t align_amount = my_min(size, (0U - (uintptr_t)buf) & 7);
+	if (size >= 8) {
+		// Align the input buffer because this was shown to be
+		// significantly faster than unaligned accesses.
+		const size_t align = (0 - (uintptr_t)buf) & 7;

-	for (const uint8_t *limit = buf + align_amount; buf < limit; ++buf)
-		crc = __crc32b(crc, *buf);
+		if (align & 1)
+			crc = __crc32b(crc, *buf++);

-	size -= align_amount;
+		if (align & 2) {
+			crc = __crc32h(crc, aligned_read16le(buf));
+			buf += 2;
+		}

-	// Process 8 bytes at a time. The end point is determined by
-	// ignoring the least significant three bits of size to ensure
-	// we do not process past the bounds of the buffer. This guarantees
-	// that limit is a multiple of 8 and is strictly less than size.
-	for (const uint8_t *limit = buf + (size & ~(size_t)7);
-			buf < limit; buf += 8)
-		crc = __crc32d(crc, aligned_read64le(buf));
+		if (align & 4) {
+			crc = __crc32w(crc, aligned_read32le(buf));
+			buf += 4;
+		}
+
+		size -= align;
+
+		// Process 8 bytes at a time. The end point is determined by
+		// ignoring the least significant three bits of size to
+		// ensure we do not process past the bounds of the buffer.
+		// This guarantees that limit is a multiple of 8 and is
+		// strictly less than size.
+		for (const uint8_t *limit = buf + (size & ~(size_t)7);
+				buf < limit; buf += 8)
+			crc = __crc32d(crc, aligned_read64le(buf));
+
+		size &= 7;
+	}

 	// Process the remaining bytes that are not 8 byte aligned.
-	for (const uint8_t *limit = buf + (size & 7); buf < limit; ++buf)
+	if (size & 4) {
+		crc = __crc32w(crc, aligned_read32le(buf));
+		buf += 4;
+	}
+
+	if (size & 2) {
+		crc = __crc32h(crc, aligned_read16le(buf));
+		buf += 2;
+	}
+
+	if (size & 1)
 		crc = __crc32b(crc, *buf);

 	return ~crc;