From e0c0ee475c0800c08291ae45e0d66aa00d5ce604 Mon Sep 17 00:00:00 2001
From: Lasse Collin
Date: Mon, 12 Feb 2024 17:09:10 +0200
Subject: [PATCH] liblzma: LZMA decoder improvements.

This adds macros for bittree decoding which prepares the code for
alternative C versions and inline assembly.
---
 src/liblzma/lzma/lzma_decoder.c        | 262 ++++++++-----------------
 src/liblzma/rangecoder/range_common.h  |   4 +
 src/liblzma/rangecoder/range_decoder.h | 144 ++++++++++++--
 3 files changed, 210 insertions(+), 200 deletions(-)

diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
index 6203bae9..0788558f 100644
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@@ -24,9 +24,8 @@
 
 // Minimum number of input bytes to safely decode one LZMA symbol.
 // The worst case is that we decode 22 bits using probabilities and 26
-// direct bits. This may decode at maximum 20 bytes of input plus one
-// extra byte after the final EOPM normalization.
-#define LZMA_IN_REQUIRED 21
+// direct bits. This may decode at maximum 20 bytes of input.
+#define LZMA_IN_REQUIRED 20
 
 
 // Macros for (somewhat) size-optimized code.
@@ -73,32 +72,22 @@ do { \
 	symbol = 1; \
 	rc_if_0(ld.choice) { \
 		rc_update_0(ld.choice); \
-		rc_bit(ld.low[pos_state][symbol], , ); \
-		rc_bit(ld.low[pos_state][symbol], , ); \
-		rc_bit(ld.low[pos_state][symbol], , ); \
-		target = symbol - LEN_LOW_SYMBOLS + MATCH_LEN_MIN; \
+		rc_bittree3(ld.low[pos_state], \
+				-LEN_LOW_SYMBOLS + MATCH_LEN_MIN); \
+		target = symbol; \
 	} else { \
 		rc_update_1(ld.choice); \
 		rc_if_0(ld.choice2) { \
 			rc_update_0(ld.choice2); \
-			rc_bit(ld.mid[pos_state][symbol], , ); \
-			rc_bit(ld.mid[pos_state][symbol], , ); \
-			rc_bit(ld.mid[pos_state][symbol], , ); \
-			target = symbol - LEN_MID_SYMBOLS \
-					+ MATCH_LEN_MIN + LEN_LOW_SYMBOLS; \
+			rc_bittree3(ld.mid[pos_state], -LEN_MID_SYMBOLS \
+					+ MATCH_LEN_MIN + LEN_LOW_SYMBOLS); \
+			target = symbol; \
 		} else { \
 			rc_update_1(ld.choice2); \
-			rc_bit(ld.high[symbol], , ); \
-			rc_bit(ld.high[symbol], , ); \
-			rc_bit(ld.high[symbol], , ); \
-			rc_bit(ld.high[symbol], , ); \
-			rc_bit(ld.high[symbol], , ); \
-			rc_bit(ld.high[symbol], , ); \
-			rc_bit(ld.high[symbol], , ); \
-			rc_bit(ld.high[symbol], , ); \
-			target = symbol - LEN_HIGH_SYMBOLS \
+			rc_bittree8(ld.high, -LEN_HIGH_SYMBOLS \
 					+ MATCH_LEN_MIN \
-					+ LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS; \
+					+ LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS); \
+			target = symbol; \
 		} \
 	} \
 } while (0)
@@ -369,8 +358,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 
 		// If there is not enough room for another LZMA symbol
 		// go to Resumable mode.
-		if (rc_in_pos + LZMA_IN_REQUIRED > in_size
-				|| dict.pos == dict.limit)
+		if (unlikely(rc_in_end - rc_in_ptr < LZMA_IN_REQUIRED
+				|| dict.pos == dict.limit))
 			goto slow;
 
 		// Decode the first bit from the next LZMA symbol.
@@ -390,64 +379,14 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 			probs = literal_subcoder(coder->literal,
 					literal_context_bits, literal_pos_mask,
 					dict.pos, dict_get(&dict, 0));
-			symbol = 1;
 
 			if (is_literal_state(state)) {
 				// Decode literal without match byte.
-				// We need to decode 8 bits, so instead
-				// of looping from 1 - 8, we unroll the
-				// loop for a speed optimization.
-				rc_bit(probs[symbol], , );
-				rc_bit(probs[symbol], , );
-				rc_bit(probs[symbol], , );
-				rc_bit(probs[symbol], , );
-				rc_bit(probs[symbol], , );
-				rc_bit(probs[symbol], , );
-				rc_bit(probs[symbol], , );
-				rc_bit(probs[symbol], , );
+				rc_bittree8(probs, 0);
 			} else {
 				// Decode literal with match byte.
-				//
-				// We store the byte we compare against
-				// ("match byte") to "len" to minimize the
-				// number of variables we need to store
-				// between decoder calls.
-
-				len = (uint32_t)(dict_get(&dict, rep0)) << 1;
-
-				// The usage of "offset" allows omitting some
-				// branches, which should give tiny speed
-				// improvement on some CPUs. "offset" gets
-				// set to zero if match_bit didn't match.
-				offset = 0x100;
-
-				// Unroll the loop.
-				uint32_t match_bit;
-				uint32_t subcoder_index;
-
-#	define decode_with_match_bit \
-			match_bit = len & offset; \
-			subcoder_index = offset + match_bit + symbol; \
-			rc_bit(probs[subcoder_index], \
-					offset &= ~match_bit, \
-					offset &= match_bit)
-
-				decode_with_match_bit;
-				len <<= 1;
-				decode_with_match_bit;
-				len <<= 1;
-				decode_with_match_bit;
-				len <<= 1;
-				decode_with_match_bit;
-				len <<= 1;
-				decode_with_match_bit;
-				len <<= 1;
-				decode_with_match_bit;
-				len <<= 1;
-				decode_with_match_bit;
-				len <<= 1;
-				decode_with_match_bit;
-#	undef decode_match_bit
+				rc_matched_literal(probs,
+						dict_get(&dict, rep0));
 			}
 
 			state = next_state[state];
@@ -501,18 +440,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 
 			// The next 6 bits determine how to decode the
 			// rest of the distance.
 			probs = coder->dist_slot[get_dist_state(len)];
-			symbol = 1;
-			rc_bit(probs[symbol], , );
-			rc_bit(probs[symbol], , );
-			rc_bit(probs[symbol], , );
-			rc_bit(probs[symbol], , );
-			rc_bit(probs[symbol], , );
-			rc_bit(probs[symbol], , );
-
-			// Get rid of the highest bit that was needed for
-			// indexing of the probability array.
-			symbol -= DIST_SLOTS;
+			rc_bittree6(probs, -DIST_SLOTS);
 			assert(symbol <= 63);
 
 			if (symbol < DIST_MODEL_START) {
@@ -540,6 +469,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				assert(limit <= 5);
 				rep0 <<= limit;
 				assert(rep0 <= 96);
+
 				// -1 is fine, because we start
 				// decoding at probs[1], not probs[0].
 				// NOTE: This violates the C standard,
@@ -553,106 +483,51 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				probs = coder->pos_special + rep0
 						- symbol - 1;
 				symbol = 1;
-				offset = 0;
+				offset = 1;
 
-				switch (limit) {
-				case 5:
-					assert(offset == 0);
-					rc_bit(probs[symbol], ,
-							rep0 += 1U);
-					++offset;
-					--limit;
-				case 4:
-					rc_bit(probs[symbol], ,
-							rep0 += 1U << offset);
-					++offset;
-					--limit;
-				case 3:
-					rc_bit(probs[symbol], ,
-							rep0 += 1U << offset);
-					++offset;
-					--limit;
-				case 2:
-					rc_bit(probs[symbol], ,
-							rep0 += 1U << offset);
-					++offset;
-					--limit;
-				case 1:
-					// We need "symbol" only for
-					// indexing the probability
-					// array, thus we can use
-					// rc_bit_last() here to
-					// omit the unneeded updating
-					// of "symbol".
-					rc_bit_last(probs[symbol], ,
-							rep0 += 1U << offset);
-				}
+				// Variable number (1-5) of bits
+				// from a reverse bittree. This
+				// isn't worth manual unrolling.
+				do {
+					rc_bit_add_if_1(probs,
+							rep0, offset);
+					offset <<= 1;
+				} while (--limit > 0);
 			} else {
 				// The distance is >= 128. Decode the
 				// lower bits without probabilities
 				// except the lowest four bits.
 				assert(symbol >= 14);
 				assert(limit >= 6);
+
 				limit -= ALIGN_BITS;
 				assert(limit >= 2);
-
-				// Not worth manual unrolling
-				do {
-					rc_direct(rep0);
-				} while (--limit > 0);
+				rc_direct(rep0, limit);
 
 				// Decode the lowest four bits using
 				// probabilities.
 				rep0 <<= ALIGN_BITS;
-				symbol = 1;
+				rc_bittree_rev4(coder->pos_align);
+				rep0 += symbol;
 
-				rc_bit(coder->pos_align[symbol], ,
-						rep0 += 1);
-
-				rc_bit(coder->pos_align[symbol], ,
-						rep0 += 2);
-
-				rc_bit(coder->pos_align[symbol], ,
-						rep0 += 4);
-
-				// Like when distance [4, 127], we
-				// don't need "symbol" for anything
-				// other than indexing the probability
-				// array.
-				rc_bit_last(
-					coder->pos_align[symbol], ,
-						rep0 += 8);
-
-				if (rep0 == UINT32_MAX) {
-					///////////////////////////
-					// End of payload marker //
-					///////////////////////////
-
-					// End of payload marker was
-					// found. It may only be
-					// present if
-					//   - uncompressed size is
-					//     unknown or
-					//   - after known uncompressed
-					//     size amount of bytes has
-					//     been decompressed and
-					//     caller has indicated
-					//     that EOPM might be used
-					//     (it's not allowed in
-					//     LZMA2).
-					if (!eopm_is_valid) {
-						ret = LZMA_DATA_ERROR;
-						goto out;
-					}
-
-					// LZMA1 stream with
-					// end-of-payload marker.
-					rc_normalize();
-					ret = rc_is_finished(rc)
-						? LZMA_STREAM_END
-						: LZMA_DATA_ERROR;
-					goto out;
-				}
+				// If the end of payload marker (EOPM)
+				// is detected, jump to the safe code.
+				// The EOPM handling isn't speed
+				// critical at all.
+				//
+				// A final normalization is needed
+				// after the EOPM (there can be a
+				// dummy byte to read in some cases).
+				// If the normalization was done here
+				// in the fast code, it would need to
+				// be taken into account in the value
+				// of LZMA_IN_REQUIRED. Using the
+				// safe code allows keeping
+				// LZMA_IN_REQUIRED as 20 instead of
+				// 21.
+				if (rep0 == UINT32_MAX)
+					goto eopm;
 			}
 		}
 
@@ -948,31 +823,48 @@ slow:
 					limit -= ALIGN_BITS;
 					assert(limit >= 2);
 			case SEQ_DIRECT:
-					do {
-						rc_direct_safe(rep0,
-								SEQ_DIRECT);
-					} while (--limit > 0);
+					rc_direct_safe(rep0, limit,
+							SEQ_DIRECT);
 
 					rep0 <<= ALIGN_BITS;
-					symbol = 1;
-
-					offset = 0;
+					symbol = 0;
+					offset = 1;
 			case SEQ_ALIGN:
 					do {
-						rc_bit_safe(coder->pos_align[
-								symbol], ,
-							rep0 += 1U << offset,
+						rc_bit_last_safe(
+							coder->pos_align[
+								offset
+								+ symbol],
+							,
+							symbol += offset,
 							SEQ_ALIGN);
-					} while (++offset < ALIGN_BITS);
+						offset <<= 1;
+					} while (offset < ALIGN_SIZE);
+
+					rep0 += symbol;
 
-					// End of payload marker
 					if (rep0 == UINT32_MAX) {
+						// End of payload marker was
+						// found. It may only be
+						// present if
+						//   - uncompressed size is
+						//     unknown or
+						//   - after known uncompressed
+						//     size amount of bytes has
+						//     been decompressed and
+						//     caller has indicated
+						//     that EOPM might be used
+						//     (it's not allowed in
+						//     LZMA2).
+eopm:
 						if (!eopm_is_valid) {
 							ret = LZMA_DATA_ERROR;
 							goto out;
 						}
 
 			case SEQ_EOPM:
+						// LZMA1 stream with
+						// end-of-payload marker.
 						rc_normalize_safe(SEQ_EOPM);
 						ret = rc_is_finished(rc)
 							? LZMA_STREAM_END
diff --git a/src/liblzma/rangecoder/range_common.h b/src/liblzma/rangecoder/range_common.h
index bcfd966e..ac4dbe19 100644
--- a/src/liblzma/rangecoder/range_common.h
+++ b/src/liblzma/rangecoder/range_common.h
@@ -68,6 +68,10 @@
 ///
 /// I will be sticking to uint16_t unless some specific architectures
 /// are *much* faster (20-50 %) with uint32_t.
+///
+/// Update in 2024: The branchless C and x86-64 assembly was written so that
+/// probability is assumed to be uint16_t. (In contrast, LZMA SDK 23.01
+/// assembly supports both types.)
 typedef uint16_t probability;
 
 #endif
diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
index 5e813f56..40de80c0 100644
--- a/src/liblzma/rangecoder/range_decoder.h
+++ b/src/liblzma/rangecoder/range_decoder.h
@@ -16,6 +16,17 @@
 #include "range_common.h"
 
 
+// Negative RC_BIT_MODEL_TOTAL but the lowest RC_MOVE_BITS are flipped.
+// This is useful for updating probability variables in branchless decoding:
+//
+//     uint32_t decoded_bit = ...;
+//     probability tmp = RC_BIT_MODEL_OFFSET;
+//     tmp &= decoded_bit - 1;
+//     prob -= (prob + tmp) >> RC_MOVE_BITS;
+#define RC_BIT_MODEL_OFFSET \
+	((UINT32_C(1) << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL)
+
+
 typedef struct {
 	uint32_t range;
 	uint32_t code;
@@ -52,7 +63,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
 /// variables 'in' and 'in_size' to be defined.
 #define rc_to_local(range_decoder, in_pos) \
 	lzma_range_decoder rc = range_decoder; \
-	size_t rc_in_pos = (in_pos); \
+	const uint8_t *rc_in_ptr = in + (in_pos); \
+	const uint8_t *rc_in_end = in + in_size; \
 	uint32_t rc_bound
 
 
@@ -60,7 +72,7 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
 #define rc_from_local(range_decoder, in_pos) \
 do { \
 	range_decoder = rc; \
-	in_pos = rc_in_pos; \
+	in_pos = (size_t)(rc_in_ptr - in); \
 } while (0)
 
 
@@ -85,7 +97,7 @@ do { \
 do { \
 	if (rc.range < RC_TOP_VALUE) { \
 		rc.range <<= RC_SHIFT_BITS; \
-		rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \
+		rc.code = (rc.code << RC_SHIFT_BITS) | *rc_in_ptr++; \
 	} \
 } while (0)
 
@@ -98,12 +110,12 @@ do { \
 
 #define rc_normalize_safe(seq) \
 do { \
 	if (rc.range < RC_TOP_VALUE) { \
-		if (unlikely(rc_in_pos == in_size)) { \
+		if (rc_in_ptr == rc_in_end) { \
 			coder->sequence = seq; \
 			goto out; \
 		} \
 		rc.range <<= RC_SHIFT_BITS; \
-		rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \
+		rc.code = (rc.code << RC_SHIFT_BITS) | *rc_in_ptr++; \
 	} \
 } while (0)
@@ -133,10 +145,14 @@ do { \
 
 /// Update the range decoder state and the used probability variable to
 /// match a decoded bit of 0.
+///
+/// The x86-64 assembly uses the commented method but it seems that,
+/// at least on x86-64, the first version is slightly faster as C code.
 #define rc_update_0(prob) \
 do { \
 	rc.range = rc_bound; \
 	prob += (RC_BIT_MODEL_TOTAL - (prob)) >> RC_MOVE_BITS; \
+	/* prob -= ((prob) + RC_BIT_MODEL_OFFSET) >> RC_MOVE_BITS; */ \
 } while (0)
 
 
@@ -192,19 +208,121 @@ do { \
 		symbol = (symbol << 1) + 1; action1, \
 		seq);
 
 
-/// Decode a bit without using a probability.
-#define rc_direct(dest) \
+// Unroll fixed-sized bittree decoding.
+//
+// A compile-time constant in final_add can be used to get rid of the high bit
+// from symbol that is used for the array indexing (1U << bittree_bits).
+// final_add may also be used to add offset to the result (LZMA length
+// decoder does that).
+//
+// The reason to have final_add here is that in the asm code the addition
+// can be done for free: in x86-64 there is SBB instruction with -1 as
+// the immediate value, and final_add is combined with that value.
+#define rc_bittree_bit(prob) \
+	rc_bit(prob, , )
+
+#define rc_bittree3(probs, final_add) \
+do { \
+	symbol = 1; \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	symbol += (uint32_t)(final_add); \
+} while (0)
+
+#define rc_bittree6(probs, final_add) \
+do { \
+	symbol = 1; \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	symbol += (uint32_t)(final_add); \
+} while (0)
+
+#define rc_bittree8(probs, final_add) \
+do { \
+	symbol = 1; \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	rc_bittree_bit(probs[symbol]); \
+	symbol += (uint32_t)(final_add); \
+} while (0)
+
+
+// Fixed-sized reverse bittree
+#define rc_bittree_rev4(probs) \
+do { \
+	symbol = 0; \
+	rc_bit_last(probs[symbol + 1], , symbol += 1); \
+	rc_bit_last(probs[symbol + 2], , symbol += 2); \
+	rc_bit_last(probs[symbol + 4], , symbol += 4); \
+	rc_bit_last(probs[symbol + 8], , symbol += 8); \
+} while (0)
+
+
+// Decode one bit from variable-sized reverse bittree.
+// The loop is done in the code that uses this macro.
+#define rc_bit_add_if_1(probs, dest, value_to_add_if_1) \
+	rc_bit(probs[symbol], \
+		, \
+		dest += value_to_add_if_1);
+
+
+// Matched literal
+#define decode_with_match_bit \
+		t_match_byte <<= 1; \
+		t_match_bit = t_match_byte & t_offset; \
+		t_subcoder_index = t_offset + t_match_bit + symbol; \
+		rc_bit(probs[t_subcoder_index], \
+				t_offset &= ~t_match_bit, \
+				t_offset &= t_match_bit)
+
+#define rc_matched_literal(probs_base_var, match_byte) \
+do { \
+	uint32_t t_match_byte = (match_byte); \
+	uint32_t t_match_bit; \
+	uint32_t t_subcoder_index; \
+	uint32_t t_offset = 0x100; \
+	symbol = 1; \
+	decode_with_match_bit; \
+	decode_with_match_bit; \
+	decode_with_match_bit; \
+	decode_with_match_bit; \
+	decode_with_match_bit; \
+	decode_with_match_bit; \
+	decode_with_match_bit; \
+	decode_with_match_bit; \
+} while (0)
+
+
+/// Decode a bit without using a probability.
+//
+// NOTE: GCC 13 and Clang/LLVM 16 can, at least on x86-64, optimize the bound
+// calculation to use an arithmetic right shift so there's no need to provide
+// the alternative code which, according to C99/C11/C23 6.3.1.3-p3 isn't
+// perfectly portable: rc_bound = (uint32_t)((int32_t)rc.code >> 31);
+#define rc_direct(dest, count_var) \
 do { \
+	dest = (dest << 1) + 1; \
 	rc_normalize(); \
 	rc.range >>= 1; \
 	rc.code -= rc.range; \
 	rc_bound = UINT32_C(0) - (rc.code >> 31); \
+	dest += rc_bound; \
 	rc.code += rc.range & rc_bound; \
-	dest = (dest << 1) + (rc_bound + 1); \
-} while (0)
+} while (--count_var > 0)
 
-#define rc_direct_safe(dest, seq) \
+
+#define rc_direct_safe(dest, count_var, seq) \
 do { \
 	rc_normalize_safe(seq); \
 	rc.range >>= 1; \
@@ -212,10 +330,6 @@ do { \
 	rc_bound = UINT32_C(0) - (rc.code >> 31); \
 	rc.code += rc.range & rc_bound; \
 	dest = (dest << 1) + (rc_bound + 1); \
-} while (0)
-
-
-// NOTE: No macros are provided for bittree decoding. It seems to be simpler
-// to just write them open in the code.
+} while (--count_var > 0)
 
 #endif
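
The new rc_bittree3/rc_bittree6/rc_bittree8 macros keep the decoded bits in
"symbol" starting from an initial value of 1 and fold the removal of that
implicit high bit, plus any constant offset such as MATCH_LEN_MIN, into the
final_add argument. The standalone sketch below is not part of the patch: a
plain bit array stands in for the bits that rc_bit() would decode, and the
helper name bittree_symbol is made up, but the index arithmetic matches what
the macros do.

/* Illustrative only -- not liblzma code. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* "bits" stands in for the bits that rc_bit() would decode. */
static uint32_t
bittree_symbol(const uint32_t *bits, unsigned nbits, int32_t final_add)
{
	/* Index 0 of the probability array is never used; the leading 1 bit
	 * in "symbol" doubles as the position in the tree. */
	uint32_t symbol = 1;

	for (unsigned i = 0; i < nbits; ++i)
		symbol = (symbol << 1) + bits[i];

	/* final_add removes the implicit high bit (1 << nbits) and may also
	 * fold in a constant, like "-LEN_LOW_SYMBOLS + MATCH_LEN_MIN" in
	 * the patched len_decode(). */
	return symbol + (uint32_t)final_add;
}

int
main(void)
{
	/* Bits 1, 0, 1 give the tree index 0b1101 = 13; with final_add = -8
	 * the high bit is stripped and the decoded value is 0b101 = 5. */
	const uint32_t bits[3] = { 1, 0, 1 };
	assert(bittree_symbol(bits, 3, -8) == 5);

	/* With MATCH_LEN_MIN (2) folded in as well, the same bits give a
	 * match length of 7; LEN_LOW_SYMBOLS is 8, so this corresponds to
	 * "-LEN_LOW_SYMBOLS + MATCH_LEN_MIN" for the low length tree. */
	assert(bittree_symbol(bits, 3, -8 + 2) == 7);

	puts("bittree index arithmetic OK");
	return 0;
}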
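The comment above RC_BIT_MODEL_OFFSET sketches a branchless probability
update. The self-contained test below is not from the patch; the function
names are made up and the offset is recomputed as a signed value, but it
verifies that the single-shift update agrees with rc_update_0/rc_update_1 for
every probability and both bit values. Like the x86-64 assembly the patch
prepares for (SAR), it assumes arithmetic right shift of negative values,
which C leaves implementation-defined but mainstream compilers provide.

/* Illustrative only -- not liblzma code. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define RC_MOVE_BITS 5
#define RC_BIT_MODEL_TOTAL_BITS 11
#define RC_BIT_MODEL_TOTAL (UINT32_C(1) << RC_BIT_MODEL_TOTAL_BITS)

typedef uint16_t probability;

/* Classic branchy update, as in rc_update_0()/rc_update_1(). */
static probability
update_branchy(probability prob, uint32_t bit)
{
	if (bit == 0)
		prob += (RC_BIT_MODEL_TOTAL - prob) >> RC_MOVE_BITS;
	else
		prob -= prob >> RC_MOVE_BITS;

	return prob;
}

/* Branchless update built on the RC_BIT_MODEL_OFFSET idea: tmp is the signed
 * value of RC_BIT_MODEL_OFFSET (31 - 2048) when the bit is 0 and zero when
 * the bit is 1, so one arithmetic shift handles both cases. */
static probability
update_offset(probability prob, uint32_t bit)
{
	const int32_t offset = (int32_t)((UINT32_C(1) << RC_MOVE_BITS) - 1)
			- (int32_t)RC_BIT_MODEL_TOTAL;
	const int32_t tmp = offset & ((int32_t)bit - 1);
	int32_t p = (int32_t)prob;

	/* Relies on arithmetic right shift when p + tmp is negative. */
	p -= (p + tmp) >> RC_MOVE_BITS;

	return (probability)p;
}

int
main(void)
{
	for (uint32_t prob = 0; prob <= RC_BIT_MODEL_TOTAL; ++prob)
		for (uint32_t bit = 0; bit <= 1; ++bit)
			assert(update_branchy((probability)prob, bit)
					== update_offset((probability)prob,
						bit));

	puts("branchless probability update matches");
	return 0;
}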
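rc_matched_literal() keeps t_offset at 0x100 while the decoded bits agree
with the match byte and clears it at the first disagreement, after which the
remaining bits use the plain literal subcoders. The sketch below is
illustrative only: the byte values are arbitrary, no real range decoding
happens, and it just traces the subcoder indices that the
t_offset/t_match_bit arithmetic produces.

/* Illustrative only -- not liblzma code. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint32_t match_byte = 0xA5;	/* Byte at distance rep0. */
	const uint32_t literal = 0xA7;		/* Byte being decoded. */

	uint32_t t_match_byte = match_byte;
	uint32_t t_offset = 0x100;
	uint32_t symbol = 1;

	for (int i = 7; i >= 0; --i) {
		t_match_byte <<= 1;
		const uint32_t t_match_bit = t_match_byte & t_offset;
		const uint32_t t_subcoder_index
				= t_offset + t_match_bit + symbol;
		const uint32_t bit = (literal >> i) & 1;

		printf("bit %u from probs[0x%03X]\n",
				(unsigned)bit, (unsigned)t_subcoder_index);

		/* The same updates rc_bit() would run via its action
		 * arguments: after the first decoded bit that disagrees
		 * with the match byte, t_offset becomes 0 and the rest of
		 * the byte uses the plain literal subcoders (< 0x100). */
		if (bit == 0)
			t_offset &= ~t_match_bit;
		else
			t_offset &= t_match_bit;

		symbol = (symbol << 1) + bit;
	}

	printf("decoded literal: 0x%02X\n", (unsigned)(symbol & 0xFF));
	return 0;
}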
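The reworked rc_direct() moves the "+ 1" of the old
"(dest << 1) + (rc_bound + 1)" update to the top of the loop body and adds
rc_bound (0 or UINT32_MAX, that is, -1 modulo 2^32) separately after the
bound test. The small check below is not part of the patch; it only confirms
the two formulations give the same dest for both rc_bound values.

/* Illustrative only -- not liblzma code. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint32_t bounds[2] = { 0, UINT32_MAX };

	for (uint32_t dest = 0; dest < 1000; ++dest) {
		for (int i = 0; i < 2; ++i) {
			const uint32_t rc_bound = bounds[i];

			/* Old form: one update at the end of the macro. */
			const uint32_t old_dest
					= (dest << 1) + (rc_bound + 1);

			/* New form: "+ 1" applied up front, rc_bound
			 * (0 or -1) added after the bound test. */
			uint32_t new_dest = (dest << 1) + 1;
			new_dest += rc_bound;

			assert(old_dest == new_dest);
		}
	}

	puts("rc_direct dest update is equivalent");
	return 0;
}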