1
0
mirror of https://git.tukaani.org/xz.git synced 2025-04-04 06:40:58 +00:00

liblzma: Use SSE2 intrinsics instead of memcpy() in dict_repeat()

SSE2 is supported on every x86-64 processor. The SSE2 code is used on
32-bit x86 if compiler options permit unconditional use of SSE2.

dict_repeat() copies short random-sized unaligned buffers. At least
on glibc, FreeBSD, and Windows (MSYS2, UCRT, MSVCRT), memcpy() is
clearly faster than byte-by-byte copying in this use case. Compared
to the memcpy() version, the new SSE2 version reduces decompression
time by 0-5 % depending on the machine and libc. It should never be
slower than the memcpy() version.

However, on musl 1.2.5 on x86-64, the memcpy() version is the slowest.
Compared to the memcpy() version:

  - The byte-by-byte version takes 6-7 % less time to decompress.
  - The SSE2 version takes 16-18 % less time to decompress.

The numbers are from decompressing a Linux kernel source tarball in
single-threaded mode on older AMD and Intel systems. The tarball
compresses well, and thus dict_repeat() performance matters more
than with some other files.
This commit is contained in:
Lasse Collin 2025-03-25 15:18:31 +02:00
parent bc14e4c94e
commit 943b012d09
No known key found for this signature in database
GPG Key ID: 38EE757D69184620
2 changed files with 90 additions and 11 deletions

View File

@ -261,10 +261,12 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
// recommended to give aligned buffers to liblzma.
//
// Reserve 2 * LZ_DICT_REPEAT_MAX bytes of extra space which is
// needed for alloc_size.
// needed for alloc_size. Reserve also LZ_DICT_EXTRA bytes of extra
// space which is *not* counted in alloc_size or coder->dict.size.
//
// Avoid integer overflow.
if (lz_options.dict_size > SIZE_MAX - 15 - 2 * LZ_DICT_REPEAT_MAX)
if (lz_options.dict_size > SIZE_MAX - 15 - 2 * LZ_DICT_REPEAT_MAX
- LZ_DICT_EXTRA)
return LZMA_MEM_ERROR;
lz_options.dict_size = (lz_options.dict_size + 15) & ~((size_t)(15));
@ -277,7 +279,13 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
// Allocate and initialize the dictionary.
if (coder->dict.size != alloc_size) {
lzma_free(coder->dict.buf, allocator);
coder->dict.buf = lzma_alloc(alloc_size, allocator);
// The LZ_DICT_EXTRA bytes at the end of the buffer aren't
// included in alloc_size. These extra bytes allow
// dict_repeat() to read and write more data than requested.
// Otherwise this extra space is ignored.
coder->dict.buf = lzma_alloc(alloc_size + LZ_DICT_EXTRA,
allocator);
if (coder->dict.buf == NULL)
return LZMA_MEM_ERROR;

View File

@ -15,10 +15,40 @@
#include "common.h"
#ifdef HAVE_IMMINTRIN_H
# include <immintrin.h>
#endif
/// Maximum length of a match rounded up to a nice power of 2 which is
/// a good size for aligned memcpy(). The allocated dictionary buffer will
/// be 2 * LZ_DICT_REPEAT_MAX bytes larger than the actual dictionary size:
// dict_repeat() implementation variant:
// 0 = Byte-by-byte copying only.
// 1 = Use memcpy() for non-overlapping copies.
// 2 = Use x86 SSE2 for non-overlapping copies.
#ifndef LZMA_LZ_DECODER_CONFIG
# if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
&& defined(HAVE_IMMINTRIN_H) \
&& (defined(__SSE2__) || defined(_M_X64) \
|| (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
# define LZMA_LZ_DECODER_CONFIG 2
# else
# define LZMA_LZ_DECODER_CONFIG 1
# endif
#endif
/// Byte-by-byte and memcpy() copy exactly the amount needed. Other methods
/// can copy up to LZ_DICT_EXTRA bytes more than requested, and this amount
/// of extra space is needed at the end of the allocated dictionary buffer.
///
/// NOTE: If this is increased, update LZ_DICT_REPEAT_MAX too.
#if LZMA_LZ_DECODER_CONFIG >= 2
# define LZ_DICT_EXTRA 32
#else
# define LZ_DICT_EXTRA 0
#endif
/// Maximum number of bytes that dict_repeat() may copy. The allocated
/// dictionary buffer will be 2 * LZ_DICT_REPEAT_MAX + LZ_DICT_EXTRA bytes
/// larger than the actual dictionary size:
///
/// (1) Every time the decoder reaches the end of the dictionary buffer,
/// the last LZ_DICT_REPEAT_MAX bytes will be copied to the beginning.
@ -27,12 +57,21 @@
///
/// (2) The other LZ_DICT_REPEAT_MAX bytes is kept as a buffer between
/// the oldest byte still in the dictionary and the current write
/// position. This way dict_repeat(dict, dict->size - 1, &len)
/// position. This way dict_repeat() with the maximum valid distance
/// won't need memmove() as the copying cannot overlap.
///
/// (3) LZ_DICT_EXTRA bytes are required at the end of the dictionary buffer
/// so that extra copying done by dict_repeat() won't write or read past
/// the end of the allocated buffer. This amount is *not* counted as part
/// of lzma_dict.size.
///
/// Note that memcpy() still cannot be used if distance < len.
///
/// LZMA's longest match length is 273 so pick a multiple of 16 above that.
/// LZMA's longest match length is 273 bytes. The LZMA decoder looks at
/// the lowest four bits of the dictionary position, thus 273 must be
/// rounded up to the next multiple of 16 (288). In addition, optimized
/// dict_repeat() copies 32 bytes at a time, thus this must also be
/// a multiple of 32.
#define LZ_DICT_REPEAT_MAX 288
/// Initial position in lzma_dict.buf when the dictionary is empty.
@ -173,9 +212,17 @@ dict_repeat(lzma_dict *restrict dict,
if (distance >= dict->pos)
back += dict->size - LZ_DICT_REPEAT_MAX;
// Repeat a block of data from the history. Because memcpy() is faster
// than copying byte by byte in a loop, the copying process gets split
// into two cases.
#if LZMA_LZ_DECODER_CONFIG == 0
// Minimal byte-by-byte method. This might be the least bad choice
// if memcpy() isn't fast and there's no replacement for it below.
while (left-- > 0) {
dict->buf[dict->pos++] = dict->buf[back++];
}
#else
// Because memcpy() or a similar method can be faster than copying
// byte by byte in a loop, the copying process is split into
// two cases.
if (distance < left) {
// Source and target areas overlap, thus we can't use
// memcpy() nor even memmove() safely.
@ -183,9 +230,33 @@ dict_repeat(lzma_dict *restrict dict,
dict->buf[dict->pos++] = dict->buf[back++];
} while (--left > 0);
} else {
# if LZMA_LZ_DECODER_CONFIG == 1
memcpy(dict->buf + dict->pos, dict->buf + back, left);
dict->pos += left;
# elif LZMA_LZ_DECODER_CONFIG == 2
// This can copy up to 32 bytes more than required.
// (If left == 0, we still copy 32 bytes.)
size_t pos = dict->pos;
dict->pos += left;
do {
const __m128i x0 = _mm_loadu_si128(
(__m128i *)(dict->buf + back));
const __m128i x1 = _mm_loadu_si128(
(__m128i *)(dict->buf + back + 16));
back += 32;
_mm_storeu_si128(
(__m128i *)(dict->buf + pos), x0);
_mm_storeu_si128(
(__m128i *)(dict->buf + pos + 16), x1);
pos += 32;
} while (pos < dict->pos);
# else
# error "Invalid LZMA_LZ_DECODER_CONFIG value"
# endif
}
#endif
// Update how full the dictionary is.
if (!dict->has_wrapped)