mirror of https://git.tukaani.org/xz.git
liblzma: ARM64 CRC32: Align the buffer faster
Instead of doing it byte by byte, use the 1/2/4-byte CRC32 instructions.
This commit is contained in:
parent
7e99856f66
commit
0ed8936685
|
@ -8,6 +8,7 @@
|
|||
// Authors: Chenxi Mao
|
||||
// Jia Tan
|
||||
// Hans Jansen
|
||||
// Lasse Collin
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -54,25 +55,50 @@ crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
|
|||
{
|
||||
crc = ~crc;
|
||||
|
||||
// Align the input buffer because this was shown to be
|
||||
// significantly faster than unaligned accesses.
|
||||
const size_t align_amount = my_min(size, (0U - (uintptr_t)buf) & 7);
|
||||
if (size >= 8) {
|
||||
// Align the input buffer because this was shown to be
|
||||
// significantly faster than unaligned accesses.
|
||||
const size_t align = (0 - (uintptr_t)buf) & 7;
|
||||
|
||||
for (const uint8_t *limit = buf + align_amount; buf < limit; ++buf)
|
||||
crc = __crc32b(crc, *buf);
|
||||
if (align & 1)
|
||||
crc = __crc32b(crc, *buf++);
|
||||
|
||||
size -= align_amount;
|
||||
if (align & 2) {
|
||||
crc = __crc32h(crc, aligned_read16le(buf));
|
||||
buf += 2;
|
||||
}
|
||||
|
||||
// Process 8 bytes at a time. The end point is determined by
|
||||
// ignoring the least significant three bits of size to ensure
|
||||
// we do not process past the bounds of the buffer. This guarantees
|
||||
// that the number of bytes processed is a multiple of 8 and never exceeds size.
|
||||
for (const uint8_t *limit = buf + (size & ~(size_t)7);
|
||||
buf < limit; buf += 8)
|
||||
crc = __crc32d(crc, aligned_read64le(buf));
|
||||
if (align & 4) {
|
||||
crc = __crc32w(crc, aligned_read32le(buf));
|
||||
buf += 4;
|
||||
}
|
||||
|
||||
size -= align;
|
||||
|
||||
// Process 8 bytes at a time. The end point is determined by
|
||||
// ignoring the least significant three bits of size to
|
||||
// ensure we do not process past the bounds of the buffer.
|
||||
// This guarantees that the number of bytes processed is a
|
||||
// multiple of 8 and never exceeds size.
|
||||
for (const uint8_t *limit = buf + (size & ~(size_t)7);
|
||||
buf < limit; buf += 8)
|
||||
crc = __crc32d(crc, aligned_read64le(buf));
|
||||
|
||||
size &= 7;
|
||||
}
|
||||
|
||||
// Process the remaining tail of fewer than 8 bytes.
|
||||
for (const uint8_t *limit = buf + (size & 7); buf < limit; ++buf)
|
||||
if (size & 4) {
|
||||
crc = __crc32w(crc, aligned_read32le(buf));
|
||||
buf += 4;
|
||||
}
|
||||
|
||||
if (size & 2) {
|
||||
crc = __crc32h(crc, aligned_read16le(buf));
|
||||
buf += 2;
|
||||
}
|
||||
|
||||
if (size & 1)
|
||||
crc = __crc32b(crc, *buf);
|
||||
|
||||
return ~crc;
|
||||
|
|
Loading…
Reference in New Issue