From 943b012d09f717f7b44284c4e4976ea41264c731 Mon Sep 17 00:00:00 2001
From: Lasse Collin <lasse.collin@tukaani.org>
Date: Tue, 25 Mar 2025 15:18:31 +0200
Subject: [PATCH] liblzma: Use SSE2 intrinsics instead of memcpy() in
 dict_repeat()

SSE2 is supported on every x86-64 processor. The SSE2 code is used on
32-bit x86 if compiler options permit unconditional use of SSE2.

dict_repeat() copies short random-sized unaligned buffers. At least on
glibc, FreeBSD, and Windows (MSYS2, UCRT, MSVCRT), memcpy() is clearly
faster than byte-by-byte copying in this use case. Compared to the
memcpy() version, the new SSE2 version reduces decompression time by
0-5 % depending on the machine and libc. It should never be slower than
the memcpy() version.

However, on musl 1.2.5 on x86-64, the memcpy() version is the slowest.
Compared to the memcpy() version:

  - The byte-by-byte version takes 6-7 % less time to decompress.

  - The SSE2 version takes 16-18 % less time to decompress.

The numbers are from decompressing a Linux kernel source tarball in
single-threaded mode on older AMD and Intel systems. The tarball
compresses well, and thus dict_repeat() performance matters more than
with some other files.
---
 src/liblzma/lz/lz_decoder.c | 14 ++++--
 src/liblzma/lz/lz_decoder.h | 87 +++++++++++++++++++++++++++++++++----
 2 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/src/liblzma/lz/lz_decoder.c b/src/liblzma/lz/lz_decoder.c
index 697000d7..6ca0bcc4 100644
--- a/src/liblzma/lz/lz_decoder.c
+++ b/src/liblzma/lz/lz_decoder.c
@@ -261,10 +261,12 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
 	// recommended to give aligned buffers to liblzma.
 	//
 	// Reserve 2 * LZ_DICT_REPEAT_MAX bytes of extra space which is
-	// needed for alloc_size.
+	// needed for alloc_size. Reserve also LZ_DICT_EXTRA bytes of extra
+	// space which is *not* counted in alloc_size or coder->dict.size.
 	//
 	// Avoid integer overflow.
-	if (lz_options.dict_size > SIZE_MAX - 15 - 2 * LZ_DICT_REPEAT_MAX)
+	if (lz_options.dict_size > SIZE_MAX - 15 - 2 * LZ_DICT_REPEAT_MAX
+			- LZ_DICT_EXTRA)
 		return LZMA_MEM_ERROR;
 
 	lz_options.dict_size = (lz_options.dict_size + 15) & ~((size_t)(15));
@@ -277,7 +279,13 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
 	// Allocate and initialize the dictionary.
 	if (coder->dict.size != alloc_size) {
 		lzma_free(coder->dict.buf, allocator);
-		coder->dict.buf = lzma_alloc(alloc_size, allocator);
+
+		// The LZ_DICT_EXTRA bytes at the end of the buffer aren't
+		// included in alloc_size. These extra bytes allow
+		// dict_repeat() to read and write more data than requested.
+		// Otherwise this extra space is ignored.
+		coder->dict.buf = lzma_alloc(alloc_size + LZ_DICT_EXTRA,
+				allocator);
 		if (coder->dict.buf == NULL)
 			return LZMA_MEM_ERROR;
 
diff --git a/src/liblzma/lz/lz_decoder.h b/src/liblzma/lz/lz_decoder.h
index ac9334ad..2698e016 100644
--- a/src/liblzma/lz/lz_decoder.h
+++ b/src/liblzma/lz/lz_decoder.h
@@ -15,10 +15,40 @@
 
 #include "common.h"
 
+#ifdef HAVE_IMMINTRIN_H
+#	include <immintrin.h>
+#endif
 
-/// Maximum length of a match rounded up to a nice power of 2 which is
-/// a good size for aligned memcpy(). The allocated dictionary buffer will
-/// be 2 * LZ_DICT_REPEAT_MAX bytes larger than the actual dictionary size:
+
+// dict_repeat() implementation variant:
+// 0 = Byte-by-byte copying only.
+// 1 = Use memcpy() for non-overlapping copies.
+// 2 = Use x86 SSE2 for non-overlapping copies.
+#ifndef LZMA_LZ_DECODER_CONFIG
+#	if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+			&& defined(HAVE_IMMINTRIN_H) \
+			&& (defined(__SSE2__) || defined(_M_X64) \
+				|| (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+#		define LZMA_LZ_DECODER_CONFIG 2
+#	else
+#		define LZMA_LZ_DECODER_CONFIG 1
+#	endif
+#endif
+
+/// Byte-by-byte and memcpy() copy exactly the amount needed. Other methods
+/// can copy up to LZ_DICT_EXTRA bytes more than requested, and this amount
+/// of extra space is needed at the end of the allocated dictionary buffer.
+///
+/// NOTE: If this is increased, update LZ_DICT_REPEAT_MAX too.
+#if LZMA_LZ_DECODER_CONFIG >= 2
+#	define LZ_DICT_EXTRA 32
+#else
+#	define LZ_DICT_EXTRA 0
+#endif
+
+/// Maximum number of bytes that dict_repeat() may copy. The allocated
+/// dictionary buffer will be 2 * LZ_DICT_REPEAT_MAX + LZ_DICT_EXTRA bytes
+/// larger than the actual dictionary size:
 ///
 /// (1) Every time the decoder reaches the end of the dictionary buffer,
 ///     the last LZ_DICT_REPEAT_MAX bytes will be copied to the beginning.
@@ -27,12 +57,21 @@
 ///
 /// (2) The other LZ_DICT_REPEAT_MAX bytes is kept as a buffer between
 ///     the oldest byte still in the dictionary and the current write
-///     position. This way dict_repeat(dict, dict->size - 1, &len)
+///     position. This way dict_repeat() with the maximum valid distance
 ///     won't need memmove() as the copying cannot overlap.
 ///
+/// (3) LZ_DICT_EXTRA bytes are required at the end of the dictionary buffer
+///     so that extra copying done by dict_repeat() won't write or read past
+///     the end of the allocated buffer. This amount is *not* counted as part
+///     of lzma_dict.size.
+///
 /// Note that memcpy() still cannot be used if distance < len.
 ///
-/// LZMA's longest match length is 273 so pick a multiple of 16 above that.
+/// LZMA's longest match length is 273 bytes. The LZMA decoder looks at
+/// the lowest four bits of the dictionary position, thus 273 must be
+/// rounded up to the next multiple of 16 (288). In addition, optimized
+/// dict_repeat() copies 32 bytes at a time, thus this must also be
+/// a multiple of 32.
 #define LZ_DICT_REPEAT_MAX 288
 
 /// Initial position in lzma_dict.buf when the dictionary is empty.
@@ -173,9 +212,17 @@ dict_repeat(lzma_dict *restrict dict,
 	if (distance >= dict->pos)
 		back += dict->size - LZ_DICT_REPEAT_MAX;
 
-	// Repeat a block of data from the history. Because memcpy() is faster
-	// than copying byte by byte in a loop, the copying process gets split
-	// into two cases.
+#if LZMA_LZ_DECODER_CONFIG == 0
+	// Minimal byte-by-byte method. This might be the least bad choice
+	// if memcpy() isn't fast and there's no replacement for it below.
+	while (left-- > 0) {
+		dict->buf[dict->pos++] = dict->buf[back++];
+	}
+
+#else
+	// Because memcpy() or a similar method can be faster than copying
+	// byte by byte in a loop, the copying process is split into
+	// two cases.
 	if (distance < left) {
 		// Source and target areas overlap, thus we can't use
 		// memcpy() nor even memmove() safely.
@@ -183,9 +230,33 @@ dict_repeat(lzma_dict *restrict dict,
 			dict->buf[dict->pos++] = dict->buf[back++];
 		} while (--left > 0);
 	} else {
+#	if LZMA_LZ_DECODER_CONFIG == 1
 		memcpy(dict->buf + dict->pos, dict->buf + back, left);
 		dict->pos += left;
+
+#	elif LZMA_LZ_DECODER_CONFIG == 2
+		// This can copy up to 32 bytes more than required.
+		// (If left == 0, we still copy 32 bytes.)
+		size_t pos = dict->pos;
+		dict->pos += left;
+		do {
+			const __m128i x0 = _mm_loadu_si128(
+					(__m128i *)(dict->buf + back));
+			const __m128i x1 = _mm_loadu_si128(
+					(__m128i *)(dict->buf + back + 16));
+			back += 32;
+			_mm_storeu_si128(
+					(__m128i *)(dict->buf + pos), x0);
+			_mm_storeu_si128(
+					(__m128i *)(dict->buf + pos + 16), x1);
+			pos += 32;
+		} while (pos < dict->pos);
+
+#	else
+#		error "Invalid LZMA_LZ_DECODER_CONFIG value"
+#	endif
 	}
+#endif
 
 	// Update how full the dictionary is.
 	if (!dict->has_wrapped)
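For context, the following standalone sketch (not part of the patch) illustrates
the same overcopy idea outside of liblzma: copy in unaligned 32-byte SSE2 chunks
and allow the copy to run up to 32 bytes past the requested length, which is why
the buffers need slack space at the end (the role LZ_DICT_EXTRA plays above).
The function name copy32_overwrite and the buffer sizes are illustrative only.

/* Illustrative sketch only. Build with SSE2 enabled, e.g.
 * "cc -O2 -msse2 sketch.c" (SSE2 is always available on x86-64). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <immintrin.h>

/* Copy "left" bytes from src to dst in 32-byte SSE2 chunks. The copy may
 * read and write up to 32 bytes more than requested, so both buffers need
 * at least 32 bytes of slack after the copied region (the same purpose
 * LZ_DICT_EXTRA serves for the dictionary buffer). The source and
 * destination regions must not overlap. */
static void
copy32_overwrite(uint8_t *dst, const uint8_t *src, size_t left)
{
	size_t i = 0;
	do {
		const __m128i x0 = _mm_loadu_si128(
				(const __m128i *)(src + i));
		const __m128i x1 = _mm_loadu_si128(
				(const __m128i *)(src + i + 16));
		_mm_storeu_si128((__m128i *)(dst + i), x0);
		_mm_storeu_si128((__m128i *)(dst + i + 16), x1);
		i += 32;
	} while (i < left);
}

int
main(void)
{
	/* 273 bytes of payload (LZMA's longest match length) plus
	 * 32 bytes of slack that the loop is allowed to clobber. */
	uint8_t src[273 + 32];
	uint8_t dst[273 + 32];

	for (size_t i = 0; i < sizeof(src); ++i)
		src[i] = (uint8_t)i;

	memset(dst, 0, sizeof(dst));
	copy32_overwrite(dst, src, 273);

	/* Only the first 273 bytes are guaranteed to match; the slack
	 * bytes may hold anything. */
	printf("payload matches: %d\n", memcmp(dst, src, 273) == 0);
	return 0;
}

The trade-off shown here is the same one the patch makes: instead of computing
an exact copy length, round the work up to whole 32-byte chunks and pay for it
with a small, fixed amount of extra allocation.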