mirror of https://git.tukaani.org/xz.git
Demystified the "state" variable in LZMA code. Use the
word literal instead of char for better consistency. There are still some names with _char instead of _literal in lzma_optimum, these may be changed later. Renamed length coder variables. This commit doesn't change the program logic.
This commit is contained in:
parent
e6eb0a2675
commit
63b74d000e
|
@ -31,8 +31,6 @@
|
|||
///////////////
|
||||
|
||||
#define REP_DISTANCES 4
|
||||
#define STATES 12
|
||||
#define LIT_STATES 7
|
||||
|
||||
#define POS_SLOT_BITS 6
|
||||
#define DICT_LOG_SIZE_MAX 30
|
||||
|
@ -105,25 +103,62 @@
|
|||
// State //
|
||||
///////////
|
||||
|
||||
// Used for updating strm->data->state in both encoder and decoder.
|
||||
/// This enum is used to track which events have occurred most recently and
|
||||
/// in which order. This information is used to predict the next event.
|
||||
///
|
||||
/// Events:
|
||||
/// - Literal: One 8-bit byte
|
||||
/// - Match: Repeat a chunk of data at some distance
|
||||
/// - Long repeat: Multi-byte match at a recently seen distance
|
||||
/// - Short repeat: One-byte repeat at a recently seen distance
|
||||
///
|
||||
/// The event names are in from STATE_oldest_older_previous. REP means
|
||||
/// either short or long repeated match, and NONLIT means any non-literal.
|
||||
typedef enum {
|
||||
STATE_LIT_LIT,
|
||||
STATE_MATCH_LIT_LIT,
|
||||
STATE_REP_LIT_LIT,
|
||||
STATE_SHORTREP_LIT_LIT,
|
||||
STATE_MATCH_LIT,
|
||||
STATE_REP_LIT,
|
||||
STATE_SHORTREP_LIT,
|
||||
STATE_LIT_MATCH,
|
||||
STATE_LIT_LONGREP,
|
||||
STATE_LIT_SHORTREP,
|
||||
STATE_NONLIT_MATCH,
|
||||
STATE_NONLIT_REP,
|
||||
} lzma_lzma_state;
|
||||
|
||||
#define update_char(index) \
|
||||
index = ((index) < 4 \
|
||||
? 0 \
|
||||
: ((index) < 10 \
|
||||
? (index) - 3 \
|
||||
: (index) - 6))
|
||||
|
||||
#define update_match(index) \
|
||||
index = ((index) < LIT_STATES ? 7 : 10)
|
||||
/// Total number of states
|
||||
#define STATES 12
|
||||
|
||||
#define update_rep(index) \
|
||||
index = ((index) < LIT_STATES ? 8 : 11)
|
||||
/// The lowest 7 states indicate that the previous state was a literal.
|
||||
#define LIT_STATES 7
|
||||
|
||||
#define update_short_rep(index) \
|
||||
index = ((index) < LIT_STATES ? 9 : 11)
|
||||
|
||||
#define is_char_state(index) \
|
||||
((index) < LIT_STATES)
|
||||
/// Indicate that the latest state was a literal.
|
||||
#define update_literal(state) \
|
||||
state = ((state) <= STATE_SHORTREP_LIT_LIT \
|
||||
? STATE_LIT_LIT \
|
||||
: ((state) <= STATE_LIT_SHORTREP \
|
||||
? (state) - 3 \
|
||||
: (state) - 6))
|
||||
|
||||
/// Indicate that the latest state was a match.
|
||||
#define update_match(state) \
|
||||
state = ((state) < LIT_STATES ? STATE_LIT_MATCH : STATE_NONLIT_MATCH)
|
||||
|
||||
/// Indicate that the latest state was a long repeated match.
|
||||
#define update_long_rep(state) \
|
||||
state = ((state) < LIT_STATES ? STATE_LIT_LONGREP : STATE_NONLIT_REP)
|
||||
|
||||
/// Indicate that the latest state was a short match.
|
||||
#define update_short_rep(state) \
|
||||
state = ((state) < LIT_STATES ? STATE_LIT_SHORTREP : STATE_NONLIT_REP)
|
||||
|
||||
/// Test if the previous state was a literal.
|
||||
#define is_literal_state(state) \
|
||||
((state) < LIT_STATES)
|
||||
|
||||
#endif
|
||||
|
|
|
@ -106,7 +106,7 @@ struct lzma_coder_s {
|
|||
lzma_range_decoder rc;
|
||||
|
||||
// State
|
||||
uint32_t state;
|
||||
lzma_lzma_state state;
|
||||
uint32_t rep0; ///< Distance of the latest match
|
||||
uint32_t rep1; ///< Distance of second latest match
|
||||
uint32_t rep2; ///< Distance of third latest match
|
||||
|
@ -143,10 +143,10 @@ struct lzma_coder_s {
|
|||
probability pos_align_decoder[1 << ALIGN_BITS];
|
||||
|
||||
/// Length of a match
|
||||
lzma_length_decoder len_decoder;
|
||||
lzma_length_decoder match_len_decoder;
|
||||
|
||||
/// Length of a repeated match.
|
||||
lzma_length_decoder rep_match_len_decoder;
|
||||
lzma_length_decoder rep_len_decoder;
|
||||
|
||||
/// True when we have produced at least one byte of output since the
|
||||
/// beginning of the stream or the latest flush marker.
|
||||
|
@ -179,7 +179,7 @@ decode_dummy(const lzma_coder *restrict coder,
|
|||
coder->literal_coder, now_pos, lz_get_byte(coder->lz, 0));
|
||||
uint32_t symbol = 1;
|
||||
|
||||
if (is_char_state(state)) {
|
||||
if (is_literal_state(state)) {
|
||||
// Decode literal without match byte.
|
||||
do {
|
||||
if_bit_0(subcoder[symbol]) {
|
||||
|
@ -222,8 +222,7 @@ decode_dummy(const lzma_coder *restrict coder,
|
|||
|
||||
if_bit_0(coder->is_rep[state]) {
|
||||
update_bit_0_dummy();
|
||||
length_decode_dummy(len, coder->len_decoder, pos_state);
|
||||
update_match(state);
|
||||
length_decode_dummy(len, coder->match_len_decoder, pos_state);
|
||||
|
||||
const uint32_t len_to_pos_state = get_len_to_pos_state(len);
|
||||
uint32_t pos_slot = 0;
|
||||
|
@ -291,7 +290,7 @@ decode_dummy(const lzma_coder *restrict coder,
|
|||
}
|
||||
}
|
||||
|
||||
length_decode_dummy(len, coder->rep_match_len_decoder, pos_state);
|
||||
length_decode_dummy(len, coder->rep_len_decoder, pos_state);
|
||||
}
|
||||
} while (0);
|
||||
|
||||
|
@ -364,7 +363,7 @@ decode_real(lzma_coder *restrict coder, const uint8_t *restrict in,
|
|||
now_pos, lz_get_byte(coder->lz, 0));
|
||||
uint32_t symbol = 1;
|
||||
|
||||
if (is_char_state(state)) {
|
||||
if (is_literal_state(state)) {
|
||||
// Decode literal without match byte.
|
||||
do {
|
||||
if_bit_0(subcoder[symbol]) {
|
||||
|
@ -408,7 +407,7 @@ decode_real(lzma_coder *restrict coder, const uint8_t *restrict in,
|
|||
// decoder state, and start a new decoding loop.
|
||||
coder->lz.dict[coder->lz.pos++] = (uint8_t)(symbol);
|
||||
++now_pos;
|
||||
update_char(state);
|
||||
update_literal(state);
|
||||
has_produced_output = true;
|
||||
continue;
|
||||
}
|
||||
|
@ -429,7 +428,7 @@ decode_real(lzma_coder *restrict coder, const uint8_t *restrict in,
|
|||
// the value to distance.
|
||||
|
||||
// Decode the length of the match.
|
||||
length_decode(len, coder->len_decoder, pos_state);
|
||||
length_decode(len, coder->match_len_decoder, pos_state);
|
||||
|
||||
update_match(state);
|
||||
|
||||
|
@ -594,10 +593,10 @@ decode_real(lzma_coder *restrict coder, const uint8_t *restrict in,
|
|||
rep0 = distance;
|
||||
}
|
||||
|
||||
// Decode the length of the repeated match.
|
||||
length_decode(len, coder->rep_match_len_decoder, pos_state);
|
||||
update_long_rep(state);
|
||||
|
||||
update_rep(state);
|
||||
// Decode the length of the repeated match.
|
||||
length_decode(len, coder->rep_len_decoder, pos_state);
|
||||
}
|
||||
|
||||
|
||||
|
@ -746,23 +745,25 @@ lzma_lzma_decoder_init(lzma_next_coder *next, lzma_allocator *allocator,
|
|||
|
||||
// Len decoders (also bit/bittree)
|
||||
const uint32_t num_pos_states = 1 << next->coder->pos_bits;
|
||||
bit_reset(next->coder->len_decoder.choice);
|
||||
bit_reset(next->coder->len_decoder.choice2);
|
||||
bit_reset(next->coder->rep_match_len_decoder.choice);
|
||||
bit_reset(next->coder->rep_match_len_decoder.choice2);
|
||||
bit_reset(next->coder->match_len_decoder.choice);
|
||||
bit_reset(next->coder->match_len_decoder.choice2);
|
||||
bit_reset(next->coder->rep_len_decoder.choice);
|
||||
bit_reset(next->coder->rep_len_decoder.choice2);
|
||||
|
||||
for (uint32_t pos_state = 0; pos_state < num_pos_states; ++pos_state) {
|
||||
bittree_reset(next->coder->len_decoder.low[pos_state], LEN_LOW_BITS);
|
||||
bittree_reset(next->coder->len_decoder.mid[pos_state], LEN_MID_BITS);
|
||||
|
||||
bittree_reset(next->coder->rep_match_len_decoder.low[pos_state],
|
||||
bittree_reset(next->coder->match_len_decoder.low[pos_state],
|
||||
LEN_LOW_BITS);
|
||||
bittree_reset(next->coder->rep_match_len_decoder.mid[pos_state],
|
||||
bittree_reset(next->coder->match_len_decoder.mid[pos_state],
|
||||
LEN_MID_BITS);
|
||||
|
||||
bittree_reset(next->coder->rep_len_decoder.low[pos_state],
|
||||
LEN_LOW_BITS);
|
||||
bittree_reset(next->coder->rep_len_decoder.mid[pos_state],
|
||||
LEN_MID_BITS);
|
||||
}
|
||||
|
||||
bittree_reset(next->coder->len_decoder.high, LEN_HIGH_BITS);
|
||||
bittree_reset(next->coder->rep_match_len_decoder.high, LEN_HIGH_BITS);
|
||||
bittree_reset(next->coder->match_len_decoder.high, LEN_HIGH_BITS);
|
||||
bittree_reset(next->coder->rep_len_decoder.high, LEN_HIGH_BITS);
|
||||
|
||||
next->coder->has_produced_output = false;
|
||||
|
||||
|
|
|
@ -170,7 +170,7 @@ lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
|
|||
lzma_read_match_distances(coder, &len, &num_distance_pairs);
|
||||
|
||||
bit_encode_0(coder->is_match[coder->state][0]);
|
||||
update_char(coder->state);
|
||||
update_literal(coder->state);
|
||||
|
||||
const uint8_t cur_byte = coder->lz.buffer[
|
||||
coder->lz.read_pos - coder->additional_offset];
|
||||
|
@ -244,7 +244,7 @@ lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
|
|||
probability *subcoder = literal_get_subcoder(coder->literal_coder,
|
||||
coder->now_pos, coder->previous_byte);
|
||||
|
||||
if (is_char_state(coder->state)) {
|
||||
if (is_literal_state(coder->state)) {
|
||||
literal_encode(subcoder, cur_byte);
|
||||
} else {
|
||||
const uint8_t match_byte = coder->lz.buffer[
|
||||
|
@ -254,7 +254,7 @@ lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
|
|||
literal_encode_matched(subcoder, match_byte, cur_byte);
|
||||
}
|
||||
|
||||
update_char(coder->state);
|
||||
update_literal(coder->state);
|
||||
coder->previous_byte = cur_byte;
|
||||
|
||||
} else {
|
||||
|
@ -294,16 +294,16 @@ lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
|
|||
if (len == 1) {
|
||||
update_short_rep(coder->state);
|
||||
} else {
|
||||
length_encode(coder->rep_match_len_encoder,
|
||||
length_encode(coder->rep_len_encoder,
|
||||
len - MATCH_MIN_LEN, pos_state,
|
||||
best_compression);
|
||||
update_rep(coder->state);
|
||||
update_long_rep(coder->state);
|
||||
}
|
||||
|
||||
} else {
|
||||
bit_encode_0(coder->is_rep[coder->state]);
|
||||
update_match(coder->state);
|
||||
length_encode(coder->len_encoder, len - MATCH_MIN_LEN,
|
||||
length_encode(coder->match_len_encoder, len - MATCH_MIN_LEN,
|
||||
pos_state, best_compression);
|
||||
pos -= REP_DISTANCES;
|
||||
|
||||
|
@ -364,7 +364,7 @@ lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
|
|||
|
||||
const uint32_t len = coder->lz.sequence == SEQ_FLUSH
|
||||
? LEN_SPECIAL_FLUSH : LEN_SPECIAL_EOPM;
|
||||
length_encode(coder->len_encoder, len - MATCH_MIN_LEN,
|
||||
length_encode(coder->match_len_encoder, len - MATCH_MIN_LEN,
|
||||
pos_state, best_compression);
|
||||
|
||||
const uint32_t pos_slot = (1 << POS_SLOT_BITS) - 1;
|
||||
|
|
|
@ -63,7 +63,7 @@ do { \
|
|||
#define get_rep_price(price_target, rep_index, len, state, pos_state) \
|
||||
do { \
|
||||
get_pure_rep_price(price_target, rep_index, state, pos_state); \
|
||||
price_target += length_get_price(coder->rep_match_len_encoder, \
|
||||
price_target += length_get_price(coder->rep_len_encoder, \
|
||||
(len) - MATCH_MIN_LEN, pos_state); \
|
||||
} while (0)
|
||||
|
||||
|
@ -80,7 +80,7 @@ do { \
|
|||
+ align_prices[(pos) & ALIGN_MASK]; \
|
||||
} \
|
||||
price_target += length_get_price( \
|
||||
coder->len_encoder, (len) - MATCH_MIN_LEN, pos_state); \
|
||||
coder->match_len_encoder, (len) - MATCH_MIN_LEN, pos_state); \
|
||||
} while (0)
|
||||
|
||||
|
||||
|
@ -368,7 +368,7 @@ lzma_get_optimum(lzma_coder *restrict coder,
|
|||
+ literal_get_price(
|
||||
literal_get_subcoder(coder->literal_coder,
|
||||
position, coder->previous_byte),
|
||||
!is_char_state(coder->state), match_byte, current_byte);
|
||||
!is_literal_state(coder->state), match_byte, current_byte);
|
||||
|
||||
make_as_char(coder->optimum[1]);
|
||||
|
||||
|
@ -424,7 +424,7 @@ lzma_get_optimum(lzma_coder *restrict coder,
|
|||
do {
|
||||
const uint32_t cur_and_len_price = price
|
||||
+ length_get_price(
|
||||
coder->rep_match_len_encoder,
|
||||
coder->rep_len_encoder,
|
||||
rep_len - 2, pos_state);
|
||||
|
||||
if (cur_and_len_price < coder->optimum[rep_len].price) {
|
||||
|
@ -513,7 +513,7 @@ lzma_get_optimum(lzma_coder *restrict coder,
|
|||
state = coder->optimum[coder->optimum[cur].pos_prev_2].state;
|
||||
|
||||
if (coder->optimum[cur].back_prev_2 < REP_DISTANCES)
|
||||
update_rep(state);
|
||||
update_long_rep(state);
|
||||
else
|
||||
update_match(state);
|
||||
|
||||
|
@ -521,7 +521,7 @@ lzma_get_optimum(lzma_coder *restrict coder,
|
|||
state = coder->optimum[pos_prev].state;
|
||||
}
|
||||
|
||||
update_char(state);
|
||||
update_literal(state);
|
||||
|
||||
} else {
|
||||
state = coder->optimum[pos_prev].state;
|
||||
|
@ -531,17 +531,17 @@ lzma_get_optimum(lzma_coder *restrict coder,
|
|||
if (is_short_rep(coder->optimum[cur]))
|
||||
update_short_rep(state);
|
||||
else
|
||||
update_char(state);
|
||||
update_literal(state);
|
||||
} else {
|
||||
uint32_t pos;
|
||||
if (coder->optimum[cur].prev_1_is_char && coder->optimum[cur].prev_2) {
|
||||
pos_prev = coder->optimum[cur].pos_prev_2;
|
||||
pos = coder->optimum[cur].back_prev_2;
|
||||
update_rep(state);
|
||||
update_long_rep(state);
|
||||
} else {
|
||||
pos = coder->optimum[cur].back_prev;
|
||||
if (pos < REP_DISTANCES)
|
||||
update_rep(state);
|
||||
update_long_rep(state);
|
||||
else
|
||||
update_match(state);
|
||||
}
|
||||
|
@ -582,7 +582,7 @@ lzma_get_optimum(lzma_coder *restrict coder,
|
|||
+ literal_get_price(
|
||||
literal_get_subcoder(coder->literal_coder,
|
||||
position, buf[-1]),
|
||||
!is_char_state(state), match_byte, current_byte);
|
||||
!is_literal_state(state), match_byte, current_byte);
|
||||
|
||||
bool next_is_char = false;
|
||||
|
||||
|
@ -638,7 +638,7 @@ lzma_get_optimum(lzma_coder *restrict coder,
|
|||
|
||||
if (len_test_2 >= 2) {
|
||||
uint32_t state_2 = state;
|
||||
update_char(state_2);
|
||||
update_literal(state_2);
|
||||
|
||||
const uint32_t pos_state_next = (position + 1) & pos_mask;
|
||||
const uint32_t next_rep_match_price = cur_and_1_price
|
||||
|
@ -689,7 +689,7 @@ lzma_get_optimum(lzma_coder *restrict coder,
|
|||
|
||||
do {
|
||||
const uint32_t cur_and_len_price = price
|
||||
+ length_get_price(coder->rep_match_len_encoder,
|
||||
+ length_get_price(coder->rep_len_encoder,
|
||||
len_test - 2, pos_state);
|
||||
|
||||
if (cur_and_len_price < coder->optimum[cur + len_test].price) {
|
||||
|
@ -717,12 +717,12 @@ lzma_get_optimum(lzma_coder *restrict coder,
|
|||
|
||||
if (len_test_2 >= 2) {
|
||||
uint32_t state_2 = state;
|
||||
update_rep(state_2);
|
||||
update_long_rep(state_2);
|
||||
|
||||
uint32_t pos_state_next = (position + len_test) & pos_mask;
|
||||
|
||||
const uint32_t cur_and_len_char_price = price
|
||||
+ length_get_price(coder->rep_match_len_encoder,
|
||||
+ length_get_price(coder->rep_len_encoder,
|
||||
len_test - 2, pos_state)
|
||||
+ bit_get_price_0(coder->is_match[state_2][pos_state_next])
|
||||
+ literal_get_price(
|
||||
|
@ -730,7 +730,7 @@ lzma_get_optimum(lzma_coder *restrict coder,
|
|||
position + len_test, buf[len_test - 1]),
|
||||
true, *(buf + len_test - back_offset), buf[len_test]);
|
||||
|
||||
update_char(state_2);
|
||||
update_literal(state_2);
|
||||
|
||||
pos_state_next = (position + len_test + 1) & pos_mask;
|
||||
|
||||
|
@ -801,7 +801,7 @@ lzma_get_optimum(lzma_coder *restrict coder,
|
|||
len_to_pos_state][pos_slot]
|
||||
+ align_prices[cur_back & ALIGN_MASK];
|
||||
|
||||
cur_and_len_price += length_get_price(coder->len_encoder,
|
||||
cur_and_len_price += length_get_price(coder->match_len_encoder,
|
||||
len_test - MATCH_MIN_LEN, pos_state);
|
||||
|
||||
if (cur_and_len_price < coder->optimum[cur + len_test].price) {
|
||||
|
@ -843,7 +843,7 @@ lzma_get_optimum(lzma_coder *restrict coder,
|
|||
*(buf + len_test - back_offset),
|
||||
buf[len_test]);
|
||||
|
||||
update_char(state_2);
|
||||
update_literal(state_2);
|
||||
pos_state_next = (pos_state_next + 1) & pos_mask;
|
||||
|
||||
const uint32_t next_rep_match_price
|
||||
|
|
|
@ -174,10 +174,11 @@ lzma_lzma_encoder_init(lzma_next_coder *next, lzma_allocator *allocator,
|
|||
bittree_reset(next->coder->pos_align_encoder, ALIGN_BITS);
|
||||
|
||||
// Length encoders
|
||||
length_encoder_reset(&next->coder->len_encoder, 1U << options->pos_bits,
|
||||
length_encoder_reset(&next->coder->match_len_encoder,
|
||||
1U << options->pos_bits,
|
||||
options->fast_bytes + 1 - MATCH_MIN_LEN);
|
||||
|
||||
length_encoder_reset(&next->coder->rep_match_len_encoder,
|
||||
length_encoder_reset(&next->coder->rep_len_encoder,
|
||||
1U << options->pos_bits,
|
||||
next->coder->fast_bytes + 1 - MATCH_MIN_LEN);
|
||||
|
||||
|
|
|
@ -60,7 +60,7 @@ typedef struct {
|
|||
|
||||
|
||||
typedef struct {
|
||||
uint32_t state;
|
||||
lzma_lzma_state state;
|
||||
|
||||
bool prev_1_is_char;
|
||||
bool prev_2;
|
||||
|
@ -88,7 +88,7 @@ struct lzma_coder_s {
|
|||
lzma_range_encoder rc;
|
||||
|
||||
// State
|
||||
uint32_t state;
|
||||
lzma_lzma_state state;
|
||||
uint8_t previous_byte;
|
||||
uint32_t rep_distances[REP_DISTANCES];
|
||||
|
||||
|
@ -117,8 +117,8 @@ struct lzma_coder_s {
|
|||
probability pos_align_encoder[1 << ALIGN_BITS];
|
||||
|
||||
// Length encoders
|
||||
lzma_length_encoder len_encoder;
|
||||
lzma_length_encoder rep_match_len_encoder;
|
||||
lzma_length_encoder match_len_encoder;
|
||||
lzma_length_encoder rep_len_encoder;
|
||||
|
||||
// Optimal
|
||||
lzma_optimal optimum[OPTS];
|
||||
|
|
Loading…
Reference in New Issue