From b3a756188004a16de5956c368e3b0efd1a9bccb0 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Fri, 22 Mar 2024 17:46:30 +0200 Subject: [PATCH] liblzma: memcmplen.h: Add a comment why subtraction is used. (cherry picked from commit 0b99783d63f27606936bb79a16c52d0d70c0b56f) --- src/liblzma/common/memcmplen.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/liblzma/common/memcmplen.h b/src/liblzma/common/memcmplen.h index d8c42040..394a4856 100644 --- a/src/liblzma/common/memcmplen.h +++ b/src/liblzma/common/memcmplen.h @@ -67,6 +67,19 @@ lzma_memcmplen(const uint8_t *buf1, const uint8_t *buf2, // This is only for x86-64 and ARM64 for now. This might be fine on // other 64-bit processors too. On big endian one should use xor // instead of subtraction and switch to __builtin_clzll(). + // + // Reasons to use subtraction instead of xor: + // + // - On some x86-64 processors (Intel Sandy Bridge to Tiger Lake), + // sub+jz and sub+jnz can be fused but xor+jz or xor+jnz cannot. + // Thus using subtraction has potential to be a tiny amount faster + // since the code checks if the difference is non-zero. + // + // - Some processors (Intel Pentium 4) used to have more ALU + // resources for add/sub instructions than and/or/xor. + // + // The processor info is based on Agner Fog's microarchitecture.pdf + // version 2023-05-26. https://www.agner.org/optimize/ #define LZMA_MEMCMPLEN_EXTRA 8 while (len < limit) { const uint64_t x = read64ne(buf1 + len) - read64ne(buf2 + len);