liblzma: Optimize LZ decoder slightly.

Now extra buffer space is reserved so that repeating bytes for
any single match will never need to copy from two places (both
the beginning and the end of the buffer). This simplifies
dict_repeat() and helps a little with speed.

This seems to reduce .lzma decompression time by about 2 %, so
with .xz and CRC the relative gain could be slightly smaller.
Small things still add up.
This commit is contained in:
Lasse Collin 2024-02-12 17:09:10 +02:00
parent eb518446e5
commit f3872a5947
3 changed files with 88 additions and 60 deletions

View File

@ -53,9 +53,10 @@ typedef struct {
static void static void
lz_decoder_reset(lzma_coder *coder) lz_decoder_reset(lzma_coder *coder)
{ {
coder->dict.pos = 0; coder->dict.pos = 2 * LZ_DICT_REPEAT_MAX;
coder->dict.full = 0; coder->dict.full = 0;
coder->dict.buf[coder->dict.size - 1] = '\0'; coder->dict.buf[2 * LZ_DICT_REPEAT_MAX - 1] = '\0';
coder->dict.has_wrapped = false;
coder->dict.need_reset = false; coder->dict.need_reset = false;
return; return;
} }
@ -69,8 +70,15 @@ decode_buffer(lzma_coder *coder,
{ {
while (true) { while (true) {
// Wrap the dictionary if needed. // Wrap the dictionary if needed.
if (coder->dict.pos == coder->dict.size) if (coder->dict.pos == coder->dict.size) {
coder->dict.pos = 0; // See the comment of #define LZ_DICT_REPEAT_MAX.
coder->dict.pos = LZ_DICT_REPEAT_MAX;
coder->dict.has_wrapped = true;
memcpy(coder->dict.buf, coder->dict.buf
+ coder->dict.size
- LZ_DICT_REPEAT_MAX,
LZ_DICT_REPEAT_MAX);
}
// Store the current dictionary position. It is needed to know // Store the current dictionary position. It is needed to know
// where to start copying to the out[] buffer. // where to start copying to the out[] buffer.
@ -252,21 +260,31 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
// dictionary to the output buffer, since applications are // dictionary to the output buffer, since applications are
// recommended to give aligned buffers to liblzma. // recommended to give aligned buffers to liblzma.
// //
// Reserve 2 * LZ_DICT_REPEAT_MAX bytes of extra space which is
// needed for alloc_size.
//
// Avoid integer overflow. // Avoid integer overflow.
if (lz_options.dict_size > SIZE_MAX - 15) if (lz_options.dict_size > SIZE_MAX - 15 - 2 * LZ_DICT_REPEAT_MAX)
return LZMA_MEM_ERROR; return LZMA_MEM_ERROR;
lz_options.dict_size = (lz_options.dict_size + 15) & ~((size_t)(15)); lz_options.dict_size = (lz_options.dict_size + 15) & ~((size_t)(15));
// Reserve extra space as explained in the comment
// of #define LZ_DICT_REPEAT_MAX.
const size_t alloc_size
= lz_options.dict_size + 2 * LZ_DICT_REPEAT_MAX;
// Allocate and initialize the dictionary. // Allocate and initialize the dictionary.
if (coder->dict.size != lz_options.dict_size) { if (coder->dict.size != alloc_size) {
lzma_free(coder->dict.buf, allocator); lzma_free(coder->dict.buf, allocator);
coder->dict.buf coder->dict.buf = lzma_alloc(alloc_size, allocator);
= lzma_alloc(lz_options.dict_size, allocator);
if (coder->dict.buf == NULL) if (coder->dict.buf == NULL)
return LZMA_MEM_ERROR; return LZMA_MEM_ERROR;
coder->dict.size = lz_options.dict_size; // NOTE: Yes, alloc_size, not lz_options.dict_size. The way
// coder->dict.full is updated will take care that we will
// still reject distances larger than lz_options.dict_size.
coder->dict.size = alloc_size;
} }
lz_decoder_reset(next->coder); lz_decoder_reset(next->coder);
@ -279,9 +297,12 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
const size_t copy_size = my_min(lz_options.preset_dict_size, const size_t copy_size = my_min(lz_options.preset_dict_size,
lz_options.dict_size); lz_options.dict_size);
const size_t offset = lz_options.preset_dict_size - copy_size; const size_t offset = lz_options.preset_dict_size - copy_size;
memcpy(coder->dict.buf, lz_options.preset_dict + offset, memcpy(coder->dict.buf + coder->dict.pos,
lz_options.preset_dict + offset,
copy_size); copy_size);
coder->dict.pos = copy_size;
// dict.pos isn't zero after lz_decoder_reset().
coder->dict.pos += copy_size;
coder->dict.full = copy_size; coder->dict.full = copy_size;
} }

View File

@ -16,10 +16,28 @@
#include "common.h" #include "common.h"
/// Maximum length of a match rounded up to a nice power of 2 which is
/// a good size for aligned memcpy(). The allocated dictionary buffer will
/// be 2 * LZ_DICT_REPEAT_MAX bytes larger than the actual dictionary size:
///
/// (1) Every time the decoder reaches the end of the dictionary buffer,
/// the last LZ_DICT_REPEAT_MAX bytes will be copied to the beginning.
/// This way dict_repeat() will only need to copy from one place,
/// never from both the end and beginning of the buffer.
///
/// (2) The other LZ_DICT_REPEAT_MAX bytes are kept as a buffer between
/// the oldest byte still in the dictionary and the current write
/// position. This way dict_repeat(dict, dict->size - 1, &len)
/// won't need memmove() as the copying cannot overlap.
///
/// Note that memcpy() still cannot be used if distance < len.
///
/// LZMA's longest match length is 273 so pick a multiple of 16 above that.
#define LZ_DICT_REPEAT_MAX 288
typedef struct { typedef struct {
/// Pointer to the dictionary buffer. It can be an allocated buffer /// Pointer to the dictionary buffer.
/// internal to liblzma, or it can a be a buffer given by the
/// application when in single-call mode (not implemented yet).
uint8_t *buf; uint8_t *buf;
/// Write position in dictionary. The next byte will be written to /// Write position in dictionary. The next byte will be written to
@ -34,9 +52,16 @@ typedef struct {
/// Write limit /// Write limit
size_t limit; size_t limit;
/// Size of the dictionary /// Allocated size of buf. This is 2 * LZ_DICT_REPEAT_MAX bytes
/// larger than the actual dictionary size. This is enforced by
/// how the value for "full" is set; it can be at most
/// "size - 2 * LZ_DICT_REPEAT_MAX".
size_t size; size_t size;
/// True once the dictionary has become full and the writing position
/// has been wrapped in decode_buffer() in lz_decoder.c.
bool has_wrapped;
/// True when dictionary should be reset before decoding more data. /// True when dictionary should be reset before decoding more data.
bool need_reset; bool need_reset;
@ -102,7 +127,16 @@ static inline uint8_t
dict_get(const lzma_dict *const dict, const uint32_t distance) dict_get(const lzma_dict *const dict, const uint32_t distance)
{ {
return dict->buf[dict->pos - distance - 1 return dict->buf[dict->pos - distance - 1
+ (distance < dict->pos ? 0 : dict->size)]; + (distance < dict->pos
? 0 : dict->size - LZ_DICT_REPEAT_MAX)];
}
/// Optimized version of dict_get(dict, 0)
static inline uint8_t
dict_get0(const lzma_dict *const dict)
{
return dict->buf[dict->pos - 1];
} }
@ -131,50 +165,27 @@ dict_repeat(lzma_dict *dict, uint32_t distance, uint32_t *len)
uint32_t left = my_min(dict_avail, *len); uint32_t left = my_min(dict_avail, *len);
*len -= left; *len -= left;
size_t back = dict->pos - distance - 1;
if (distance >= dict->pos)
back += dict->size - LZ_DICT_REPEAT_MAX;
// Repeat a block of data from the history. Because memcpy() is faster // Repeat a block of data from the history. Because memcpy() is faster
// than copying byte by byte in a loop, the copying process gets split // than copying byte by byte in a loop, the copying process gets split
// into three cases. // into two cases.
if (distance < left) { if (distance < left) {
// Source and target areas overlap, thus we can't use // Source and target areas overlap, thus we can't use
// memcpy() nor even memmove() safely. // memcpy() nor even memmove() safely.
do { do {
dict->buf[dict->pos] = dict_get(dict, distance); dict->buf[dict->pos++] = dict->buf[back++];
++dict->pos;
} while (--left > 0); } while (--left > 0);
} else if (distance < dict->pos) {
// The easiest and fastest case
memcpy(dict->buf + dict->pos,
dict->buf + dict->pos - distance - 1,
left);
dict->pos += left;
} else { } else {
// The bigger the dictionary, the more rare this memcpy(dict->buf + dict->pos, dict->buf + back, left);
// case occurs. We need to "wrap" the dict, thus dict->pos += left;
// we might need two memcpy() to copy all the data.
assert(dict->full == dict->size);
const uint32_t copy_pos
= dict->pos - distance - 1 + dict->size;
uint32_t copy_size = dict->size - copy_pos;
if (copy_size < left) {
memmove(dict->buf + dict->pos, dict->buf + copy_pos,
copy_size);
dict->pos += copy_size;
copy_size = left - copy_size;
memcpy(dict->buf + dict->pos, dict->buf, copy_size);
dict->pos += copy_size;
} else {
memmove(dict->buf + dict->pos, dict->buf + copy_pos,
left);
dict->pos += left;
}
} }
// Update how full the dictionary is. // Update how full the dictionary is.
if (dict->full < dict->pos) if (!dict->has_wrapped)
dict->full = dict->pos; dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;
return unlikely(*len != 0); return unlikely(*len != 0);
} }
@ -185,8 +196,8 @@ dict_put(lzma_dict *dict, uint8_t byte)
{ {
dict->buf[dict->pos++] = byte; dict->buf[dict->pos++] = byte;
if (dict->pos > dict->full) if (!dict->has_wrapped)
dict->full = dict->pos; dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;
} }
@ -198,11 +209,7 @@ dict_put_safe(lzma_dict *dict, uint8_t byte)
if (dict->pos == dict->limit) if (dict->pos == dict->limit)
return true; return true;
dict->buf[dict->pos++] = byte; dict_put(dict, byte);
if (dict->pos > dict->full)
dict->full = dict->pos;
return false; return false;
} }
@ -226,8 +233,8 @@ dict_write(lzma_dict *restrict dict, const uint8_t *restrict in,
*left -= lzma_bufcpy(in, in_pos, in_size, *left -= lzma_bufcpy(in, in_pos, in_size,
dict->buf, &dict->pos, dict->limit); dict->buf, &dict->pos, dict->limit);
if (dict->pos > dict->full) if (!dict->has_wrapped)
dict->full = dict->pos; dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;
return; return;
} }

View File

@ -360,7 +360,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
// lc params. // lc params.
probs = literal_subcoder(coder->literal, probs = literal_subcoder(coder->literal,
literal_context_bits, literal_pos_mask, literal_context_bits, literal_pos_mask,
dict.pos, dict_get(&dict, 0)); dict.pos, dict_get0(&dict));
if (is_literal_state(state)) { if (is_literal_state(state)) {
update_literal_normal(state); update_literal_normal(state);
@ -685,7 +685,7 @@ slow:
probs = literal_subcoder(coder->literal, probs = literal_subcoder(coder->literal,
literal_context_bits, literal_pos_mask, literal_context_bits, literal_pos_mask,
dict.pos, dict_get(&dict, 0)); dict.pos, dict_get0(&dict));
symbol = 1; symbol = 1;
if (is_literal_state(state)) { if (is_literal_state(state)) {