mirror of
https://git.tukaani.org/xz.git
synced 2025-04-10 01:31:13 +00:00
liblzma: Use SSE2 intrinsics instead of memcpy() in dict_repeat()
SSE2 is supported on every x86-64 processor. The SSE2 code is used on 32-bit x86 if compiler options permit unconditional use of SSE2. dict_repeat() copies short random-sized unaligned buffers. At least on glibc, FreeBSD, and Windows (MSYS2, UCRT, MSVCRT), memcpy() is clearly faster than byte-by-byte copying in this use case. Compared to the memcpy() version, the new SSE2 version reduces decompression time by 0-5 % depending on the machine and libc. It should never be slower than the memcpy() version. However, on musl 1.2.5 on x86-64, the memcpy() version is the slowest. Compared to the memcpy() version: - The byte-by-version takes 6-7 % less time to decompress. - The SSE2 version takes 16-18 % less time to decompress. The numbers are from decompressing a Linux kernel source tarball in single-threaded mode on older AMD and Intel systems. The tarball compresses well, and thus dict_repeat() performance matters more than with some other files.
This commit is contained in:
parent
bc14e4c94e
commit
943b012d09
@ -261,10 +261,12 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
|
|||||||
// recommended to give aligned buffers to liblzma.
|
// recommended to give aligned buffers to liblzma.
|
||||||
//
|
//
|
||||||
// Reserve 2 * LZ_DICT_REPEAT_MAX bytes of extra space which is
|
// Reserve 2 * LZ_DICT_REPEAT_MAX bytes of extra space which is
|
||||||
// needed for alloc_size.
|
// needed for alloc_size. Reserve also LZ_DICT_EXTRA bytes of extra
|
||||||
|
// space which is *not* counted in alloc_size or coder->dict.size.
|
||||||
//
|
//
|
||||||
// Avoid integer overflow.
|
// Avoid integer overflow.
|
||||||
if (lz_options.dict_size > SIZE_MAX - 15 - 2 * LZ_DICT_REPEAT_MAX)
|
if (lz_options.dict_size > SIZE_MAX - 15 - 2 * LZ_DICT_REPEAT_MAX
|
||||||
|
- LZ_DICT_EXTRA)
|
||||||
return LZMA_MEM_ERROR;
|
return LZMA_MEM_ERROR;
|
||||||
|
|
||||||
lz_options.dict_size = (lz_options.dict_size + 15) & ~((size_t)(15));
|
lz_options.dict_size = (lz_options.dict_size + 15) & ~((size_t)(15));
|
||||||
@ -277,7 +279,13 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
|
|||||||
// Allocate and initialize the dictionary.
|
// Allocate and initialize the dictionary.
|
||||||
if (coder->dict.size != alloc_size) {
|
if (coder->dict.size != alloc_size) {
|
||||||
lzma_free(coder->dict.buf, allocator);
|
lzma_free(coder->dict.buf, allocator);
|
||||||
coder->dict.buf = lzma_alloc(alloc_size, allocator);
|
|
||||||
|
// The LZ_DICT_EXTRA bytes at the end of the buffer aren't
|
||||||
|
// included in alloc_size. These extra bytes allow
|
||||||
|
// dict_repeat() to read and write more data than requested.
|
||||||
|
// Otherwise this extra space is ignored.
|
||||||
|
coder->dict.buf = lzma_alloc(alloc_size + LZ_DICT_EXTRA,
|
||||||
|
allocator);
|
||||||
if (coder->dict.buf == NULL)
|
if (coder->dict.buf == NULL)
|
||||||
return LZMA_MEM_ERROR;
|
return LZMA_MEM_ERROR;
|
||||||
|
|
||||||
|
@ -15,10 +15,40 @@
|
|||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
#ifdef HAVE_IMMINTRIN_H
|
||||||
|
# include <immintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
/// Maximum length of a match rounded up to a nice power of 2 which is
|
|
||||||
/// a good size for aligned memcpy(). The allocated dictionary buffer will
|
// dict_repeat() implementation variant:
|
||||||
/// be 2 * LZ_DICT_REPEAT_MAX bytes larger than the actual dictionary size:
|
// 0 = Byte-by-byte copying only.
|
||||||
|
// 1 = Use memcpy() for non-overlapping copies.
|
||||||
|
// 2 = Use x86 SSE2 for non-overlapping copies.
|
||||||
|
#ifndef LZMA_LZ_DECODER_CONFIG
|
||||||
|
# if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
|
||||||
|
&& defined(HAVE_IMMINTRIN_H) \
|
||||||
|
&& (defined(__SSE2__) || defined(_M_X64) \
|
||||||
|
|| (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
|
||||||
|
# define LZMA_LZ_DECODER_CONFIG 2
|
||||||
|
# else
|
||||||
|
# define LZMA_LZ_DECODER_CONFIG 1
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/// Byte-by-byte and memcpy() copy exactly the amount needed. Other methods
|
||||||
|
/// can copy up to LZ_DICT_EXTRA bytes more than requested, and this amount
|
||||||
|
/// of extra space is needed at the end of the allocated dictionary buffer.
|
||||||
|
///
|
||||||
|
/// NOTE: If this is increased, update LZMA_DICT_REPEAT_MAX too.
|
||||||
|
#if LZMA_LZ_DECODER_CONFIG >= 2
|
||||||
|
# define LZ_DICT_EXTRA 32
|
||||||
|
#else
|
||||||
|
# define LZ_DICT_EXTRA 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/// Maximum number of bytes that dict_repeat() may copy. The allocated
|
||||||
|
/// dictionary buffer will be 2 * LZ_DICT_REPEAT_MAX + LZMA_DICT_EXTRA bytes
|
||||||
|
/// larger than the actual dictionary size:
|
||||||
///
|
///
|
||||||
/// (1) Every time the decoder reaches the end of the dictionary buffer,
|
/// (1) Every time the decoder reaches the end of the dictionary buffer,
|
||||||
/// the last LZ_DICT_REPEAT_MAX bytes will be copied to the beginning.
|
/// the last LZ_DICT_REPEAT_MAX bytes will be copied to the beginning.
|
||||||
@ -27,12 +57,21 @@
|
|||||||
///
|
///
|
||||||
/// (2) The other LZ_DICT_REPEAT_MAX bytes is kept as a buffer between
|
/// (2) The other LZ_DICT_REPEAT_MAX bytes is kept as a buffer between
|
||||||
/// the oldest byte still in the dictionary and the current write
|
/// the oldest byte still in the dictionary and the current write
|
||||||
/// position. This way dict_repeat(dict, dict->size - 1, &len)
|
/// position. This way dict_repeat() with the maximum valid distance
|
||||||
/// won't need memmove() as the copying cannot overlap.
|
/// won't need memmove() as the copying cannot overlap.
|
||||||
///
|
///
|
||||||
|
/// (3) LZ_DICT_EXTRA bytes are required at the end of the dictionary buffer
|
||||||
|
/// so that extra copying done by dict_repeat() won't write or read past
|
||||||
|
/// the end of the allocated buffer. This amount is *not* counted as part
|
||||||
|
/// of lzma_dict.size.
|
||||||
|
///
|
||||||
/// Note that memcpy() still cannot be used if distance < len.
|
/// Note that memcpy() still cannot be used if distance < len.
|
||||||
///
|
///
|
||||||
/// LZMA's longest match length is 273 so pick a multiple of 16 above that.
|
/// LZMA's longest match length is 273 bytes. The LZMA decoder looks at
|
||||||
|
/// the lowest four bits of the dictionary position, thus 273 must be
|
||||||
|
/// rounded up to the next multiple of 16 (288). In addition, optimized
|
||||||
|
/// dict_repeat() copies 32 bytes at a time, thus this must also be
|
||||||
|
/// a multiple of 32.
|
||||||
#define LZ_DICT_REPEAT_MAX 288
|
#define LZ_DICT_REPEAT_MAX 288
|
||||||
|
|
||||||
/// Initial position in lzma_dict.buf when the dictionary is empty.
|
/// Initial position in lzma_dict.buf when the dictionary is empty.
|
||||||
@ -173,9 +212,17 @@ dict_repeat(lzma_dict *restrict dict,
|
|||||||
if (distance >= dict->pos)
|
if (distance >= dict->pos)
|
||||||
back += dict->size - LZ_DICT_REPEAT_MAX;
|
back += dict->size - LZ_DICT_REPEAT_MAX;
|
||||||
|
|
||||||
// Repeat a block of data from the history. Because memcpy() is faster
|
#if LZMA_LZ_DECODER_CONFIG == 0
|
||||||
// than copying byte by byte in a loop, the copying process gets split
|
// Minimal byte-by-byte method. This might be the least bad choice
|
||||||
// into two cases.
|
// if memcpy() isn't fast and there's no replacement for it below.
|
||||||
|
while (left-- > 0) {
|
||||||
|
dict->buf[dict->pos++] = dict->buf[back++];
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
// Because memcpy() or a similar method can be faster than copying
|
||||||
|
// byte by byte in a loop, the copying process is split into
|
||||||
|
// two cases.
|
||||||
if (distance < left) {
|
if (distance < left) {
|
||||||
// Source and target areas overlap, thus we can't use
|
// Source and target areas overlap, thus we can't use
|
||||||
// memcpy() nor even memmove() safely.
|
// memcpy() nor even memmove() safely.
|
||||||
@ -183,9 +230,33 @@ dict_repeat(lzma_dict *restrict dict,
|
|||||||
dict->buf[dict->pos++] = dict->buf[back++];
|
dict->buf[dict->pos++] = dict->buf[back++];
|
||||||
} while (--left > 0);
|
} while (--left > 0);
|
||||||
} else {
|
} else {
|
||||||
|
# if LZMA_LZ_DECODER_CONFIG == 1
|
||||||
memcpy(dict->buf + dict->pos, dict->buf + back, left);
|
memcpy(dict->buf + dict->pos, dict->buf + back, left);
|
||||||
dict->pos += left;
|
dict->pos += left;
|
||||||
|
|
||||||
|
# elif LZMA_LZ_DECODER_CONFIG == 2
|
||||||
|
// This can copy up to 32 bytes more than required.
|
||||||
|
// (If left == 0, we still copy 32 bytes.)
|
||||||
|
size_t pos = dict->pos;
|
||||||
|
dict->pos += left;
|
||||||
|
do {
|
||||||
|
const __m128i x0 = _mm_loadu_si128(
|
||||||
|
(__m128i *)(dict->buf + back));
|
||||||
|
const __m128i x1 = _mm_loadu_si128(
|
||||||
|
(__m128i *)(dict->buf + back + 16));
|
||||||
|
back += 32;
|
||||||
|
_mm_storeu_si128(
|
||||||
|
(__m128i *)(dict->buf + pos), x0);
|
||||||
|
_mm_storeu_si128(
|
||||||
|
(__m128i *)(dict->buf + pos + 16), x1);
|
||||||
|
pos += 32;
|
||||||
|
} while (pos < dict->pos);
|
||||||
|
|
||||||
|
# else
|
||||||
|
# error "Invalid LZMA_LZ_DECODER_CONFIG value"
|
||||||
|
# endif
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Update how full the dictionary is.
|
// Update how full the dictionary is.
|
||||||
if (!dict->has_wrapped)
|
if (!dict->has_wrapped)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user