From 943b012d09f717f7b44284c4e4976ea41264c731 Mon Sep 17 00:00:00 2001
From: Lasse Collin <lasse.collin@tukaani.org>
Date: Tue, 25 Mar 2025 15:18:31 +0200
Subject: [PATCH] liblzma: Use SSE2 intrinsics instead of memcpy() in
 dict_repeat()

SSE2 is supported on every x86-64 processor. The SSE2 code is used on
32-bit x86 if compiler options permit unconditional use of SSE2.

dict_repeat() copies short random-sized unaligned buffers. At least on
glibc, FreeBSD, and Windows (MSYS2, UCRT, MSVCRT), memcpy() is clearly
faster than byte-by-byte copying in this use case. Compared to the
memcpy() version, the new SSE2 version reduces decompression time by
0-5 % depending on the machine and libc. It should never be slower than
the memcpy() version.

However, on musl 1.2.5 on x86-64, the memcpy() version is the slowest.
Compared to the memcpy() version:

  - The byte-by-byte version takes 6-7 % less time to decompress.

  - The SSE2 version takes 16-18 % less time to decompress.

The numbers are from decompressing a Linux kernel source tarball in
single-threaded mode on older AMD and Intel systems. The tarball
compresses well, and thus dict_repeat() performance matters more than
with some other files.
---
 src/liblzma/lz/lz_decoder.c | 14 ++++--
 src/liblzma/lz/lz_decoder.h | 87 +++++++++++++++++++++++++++++++++----
 2 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/src/liblzma/lz/lz_decoder.c b/src/liblzma/lz/lz_decoder.c
index 697000d7..6ca0bcc4 100644
--- a/src/liblzma/lz/lz_decoder.c
+++ b/src/liblzma/lz/lz_decoder.c
@@ -261,10 +261,12 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
 	// recommended to give aligned buffers to liblzma.
 	//
 	// Reserve 2 * LZ_DICT_REPEAT_MAX bytes of extra space which is
-	// needed for alloc_size.
+	// needed for alloc_size. Reserve also LZ_DICT_EXTRA bytes of extra
+	// space which is *not* counted in alloc_size or coder->dict.size.
 	//
 	// Avoid integer overflow.
-	if (lz_options.dict_size > SIZE_MAX - 15 - 2 * LZ_DICT_REPEAT_MAX)
+	if (lz_options.dict_size > SIZE_MAX - 15 - 2 * LZ_DICT_REPEAT_MAX
+			- LZ_DICT_EXTRA)
 		return LZMA_MEM_ERROR;
 
 	lz_options.dict_size = (lz_options.dict_size + 15) & ~((size_t)(15));
@@ -277,7 +279,13 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
 	// Allocate and initialize the dictionary.
 	if (coder->dict.size != alloc_size) {
 		lzma_free(coder->dict.buf, allocator);
-		coder->dict.buf = lzma_alloc(alloc_size, allocator);
+
+		// The LZ_DICT_EXTRA bytes at the end of the buffer aren't
+		// included in alloc_size. These extra bytes allow
+		// dict_repeat() to read and write more data than requested.
+		// Otherwise this extra space is ignored.
+		coder->dict.buf = lzma_alloc(alloc_size + LZ_DICT_EXTRA,
+				allocator);
 		if (coder->dict.buf == NULL)
 			return LZMA_MEM_ERROR;
 
diff --git a/src/liblzma/lz/lz_decoder.h b/src/liblzma/lz/lz_decoder.h
index ac9334ad..2698e016 100644
--- a/src/liblzma/lz/lz_decoder.h
+++ b/src/liblzma/lz/lz_decoder.h
@@ -15,10 +15,40 @@
 
 #include "common.h"
 
+#ifdef HAVE_IMMINTRIN_H
+#	include <immintrin.h>
+#endif
 
-/// Maximum length of a match rounded up to a nice power of 2 which is
-/// a good size for aligned memcpy(). The allocated dictionary buffer will
-/// be 2 * LZ_DICT_REPEAT_MAX bytes larger than the actual dictionary size:
+
+// dict_repeat() implementation variant:
+// 0 = Byte-by-byte copying only.
+// 1 = Use memcpy() for non-overlapping copies.
+// 2 = Use x86 SSE2 for non-overlapping copies.
+#ifndef LZMA_LZ_DECODER_CONFIG
+#	if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+			&& defined(HAVE_IMMINTRIN_H) \
+			&& (defined(__SSE2__) || defined(_M_X64) \
+				|| (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+#		define LZMA_LZ_DECODER_CONFIG 2
+#	else
+#		define LZMA_LZ_DECODER_CONFIG 1
+#	endif
+#endif
+
+/// Byte-by-byte and memcpy() copy exactly the amount needed. Other methods
+/// can copy up to LZ_DICT_EXTRA bytes more than requested, and this amount
+/// of extra space is needed at the end of the allocated dictionary buffer.
+///
+/// NOTE: If this is increased, update LZ_DICT_REPEAT_MAX too.
+#if LZMA_LZ_DECODER_CONFIG >= 2
+#	define LZ_DICT_EXTRA 32
+#else
+#	define LZ_DICT_EXTRA 0
+#endif
+
+/// Maximum number of bytes that dict_repeat() may copy. The allocated
+/// dictionary buffer will be 2 * LZ_DICT_REPEAT_MAX + LZ_DICT_EXTRA bytes
+/// larger than the actual dictionary size:
 ///
 /// (1) Every time the decoder reaches the end of the dictionary buffer,
 ///     the last LZ_DICT_REPEAT_MAX bytes will be copied to the beginning.
@@ -27,12 +57,21 @@
 ///
 /// (2) The other LZ_DICT_REPEAT_MAX bytes is kept as a buffer between
 ///     the oldest byte still in the dictionary and the current write
-///     position. This way dict_repeat(dict, dict->size - 1, &len)
+///     position. This way dict_repeat() with the maximum valid distance
 ///     won't need memmove() as the copying cannot overlap.
 ///
+/// (3) LZ_DICT_EXTRA bytes are required at the end of the dictionary buffer
+///     so that extra copying done by dict_repeat() won't write or read past
+///     the end of the allocated buffer. This amount is *not* counted as part
+///     of lzma_dict.size.
+///
 /// Note that memcpy() still cannot be used if distance < len.
 ///
-/// LZMA's longest match length is 273 so pick a multiple of 16 above that.
+/// LZMA's longest match length is 273 bytes. The LZMA decoder looks at
+/// the lowest four bits of the dictionary position, thus 273 must be
+/// rounded up to the next multiple of 16 (288). In addition, optimized
+/// dict_repeat() copies 32 bytes at a time, thus this must also be
+/// a multiple of 32.
 #define LZ_DICT_REPEAT_MAX 288
 
 /// Initial position in lzma_dict.buf when the dictionary is empty.
@@ -173,9 +212,17 @@ dict_repeat(lzma_dict *restrict dict,
 	if (distance >= dict->pos)
 		back += dict->size - LZ_DICT_REPEAT_MAX;
 
-	// Repeat a block of data from the history. Because memcpy() is faster
-	// than copying byte by byte in a loop, the copying process gets split
-	// into two cases.
+#if LZMA_LZ_DECODER_CONFIG == 0
+	// Minimal byte-by-byte method. This might be the least bad choice
+	// if memcpy() isn't fast and there's no replacement for it below.
+	while (left-- > 0) {
+		dict->buf[dict->pos++] = dict->buf[back++];
+	}
+
+#else
+	// Because memcpy() or a similar method can be faster than copying
+	// byte by byte in a loop, the copying process is split into
+	// two cases.
 	if (distance < left) {
 		// Source and target areas overlap, thus we can't use
 		// memcpy() nor even memmove() safely.
@@ -183,9 +230,33 @@ dict_repeat(lzma_dict *restrict dict,
 			dict->buf[dict->pos++] = dict->buf[back++];
 		} while (--left > 0);
 	} else {
+#	if LZMA_LZ_DECODER_CONFIG == 1
 		memcpy(dict->buf + dict->pos, dict->buf + back, left);
 		dict->pos += left;
+
+#	elif LZMA_LZ_DECODER_CONFIG == 2
+		// This can copy up to 32 bytes more than required.
+		// (If left == 0, we still copy 32 bytes.)
+		size_t pos = dict->pos;
+		dict->pos += left;
+		do {
+			const __m128i x0 = _mm_loadu_si128(
+					(__m128i *)(dict->buf + back));
+			const __m128i x1 = _mm_loadu_si128(
+					(__m128i *)(dict->buf + back + 16));
+			back += 32;
+			_mm_storeu_si128(
+					(__m128i *)(dict->buf + pos), x0);
+			_mm_storeu_si128(
+					(__m128i *)(dict->buf + pos + 16), x1);
+			pos += 32;
+		} while (pos < dict->pos);
+
+#	else
+#		error "Invalid LZMA_LZ_DECODER_CONFIG value"
+#	endif
 	}
+#endif
 
 	// Update how full the dictionary is.
 	if (!dict->has_wrapped)
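For context, the following standalone sketch (not part of the patch) illustrates
the same overcopy idea outside of liblzma: copy in unaligned 32-byte SSE2 chunks
and allow the copy to run up to 32 bytes past the requested length, which is why
the buffers need slack space at the end (the role LZ_DICT_EXTRA plays above).
The function name copy32_overwrite and the buffer sizes are illustrative only.

/* Illustrative sketch only. Build with SSE2 enabled, e.g.
 * "cc -O2 -msse2 sketch.c" (SSE2 is always available on x86-64). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <immintrin.h>

/* Copy "left" bytes from src to dst in 32-byte SSE2 chunks. The copy may
 * read and write up to 32 bytes more than requested, so both buffers need
 * at least 32 bytes of slack after the copied region (the same purpose
 * LZ_DICT_EXTRA serves for the dictionary buffer). The source and
 * destination regions must not overlap. */
static void
copy32_overwrite(uint8_t *dst, const uint8_t *src, size_t left)
{
	size_t i = 0;
	do {
		const __m128i x0 = _mm_loadu_si128(
				(const __m128i *)(src + i));
		const __m128i x1 = _mm_loadu_si128(
				(const __m128i *)(src + i + 16));
		_mm_storeu_si128((__m128i *)(dst + i), x0);
		_mm_storeu_si128((__m128i *)(dst + i + 16), x1);
		i += 32;
	} while (i < left);
}

int
main(void)
{
	/* 273 bytes of payload (LZMA's longest match length) plus
	 * 32 bytes of slack that the loop is allowed to clobber. */
	uint8_t src[273 + 32];
	uint8_t dst[273 + 32];

	for (size_t i = 0; i < sizeof(src); ++i)
		src[i] = (uint8_t)i;

	memset(dst, 0, sizeof(dst));
	copy32_overwrite(dst, src, 273);

	/* Only the first 273 bytes are guaranteed to match; the slack
	 * bytes may hold anything. */
	printf("payload matches: %d\n", memcmp(dst, src, 273) == 0);
	return 0;
}

The trade-off shown here is the same one the patch makes: instead of computing
an exact copy length, round the work up to whole 32-byte chunks and pay for it
with a small, fixed amount of extra allocation.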