diff --git a/src/liblzma/lz/lz_decoder.h b/src/liblzma/lz/lz_decoder.h
index 65acf0b1..3b41649c 100644
--- a/src/liblzma/lz/lz_decoder.h
+++ b/src/liblzma/lz/lz_decoder.h
@@ -180,12 +180,22 @@ dict_repeat(lzma_dict *dict, uint32_t distance, uint32_t *len)
 }
 
 
+static inline void
+dict_put(lzma_dict *dict, uint8_t byte)
+{
+	dict->buf[dict->pos++] = byte;
+
+	if (dict->pos > dict->full)
+		dict->full = dict->pos;
+}
+
+
 /// Puts one byte into the dictionary. Returns true if the dictionary was
 /// already full and the byte couldn't be added.
 static inline bool
-dict_put(lzma_dict *dict, uint8_t byte)
+dict_put_safe(lzma_dict *dict, uint8_t byte)
 {
-	if (unlikely(dict->pos == dict->limit))
+	if (dict->pos == dict->limit)
 		return true;
 
 	dict->buf[dict->pos++] = byte;
diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
index 2e8393d6..6203bae9 100644
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@@ -7,6 +7,7 @@
 ///
 //  Authors:    Igor Pavlov
 //              Lasse Collin
+//              Jia Tan
 //
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -21,8 +22,12 @@
 #	pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
 #endif
 
+// Minimum number of input bytes to safely decode one LZMA symbol.
+// The worst case is that we decode 22 bits using probabilities and 26
+// direct bits. This may decode at maximum 20 bytes of input plus one
+// extra byte after the final EOPM normalization.
+#define LZMA_IN_REQUIRED 21
 
-#ifdef HAVE_SMALL
 
 // Macros for (somewhat) size-optimized code.
 // This is used to decode the match length (how many bytes must be repeated
@@ -193,22 +198,26 @@ typedef struct {
 	enum {
 		SEQ_NORMALIZE,
 		SEQ_IS_MATCH,
-		seq_8(SEQ_LITERAL),
-		seq_8(SEQ_LITERAL_MATCHED),
+		SEQ_LITERAL,
+		SEQ_LITERAL_MATCHED,
 		SEQ_LITERAL_WRITE,
 		SEQ_IS_REP,
-		seq_len(SEQ_MATCH_LEN),
-		seq_6(SEQ_DIST_SLOT),
+		SEQ_MATCH_LEN_CHOICE,
+		SEQ_MATCH_LEN_CHOICE2,
+		SEQ_MATCH_LEN_BITTREE,
+		SEQ_DIST_SLOT,
 		SEQ_DIST_MODEL,
 		SEQ_DIRECT,
-		seq_4(SEQ_ALIGN),
+		SEQ_ALIGN,
 		SEQ_EOPM,
 		SEQ_IS_REP0,
 		SEQ_SHORTREP,
 		SEQ_IS_REP0_LONG,
 		SEQ_IS_REP1,
 		SEQ_IS_REP2,
-		seq_len(SEQ_REP_LEN),
+		SEQ_REP_LEN_CHOICE,
+		SEQ_REP_LEN_CHOICE2,
+		SEQ_REP_LEN_BITTREE,
 		SEQ_COPY,
 	} sequence;
 
@@ -309,8 +318,42 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		might_finish_without_eopm = true;
 	}
 
-	// The main decoder loop. The "switch" is used to restart the decoder at
-	// correct location. Once restarted, the "switch" is no longer used.
+	// Lookup table used to update the literal state.
+	// Compared to other state updates, this would need two branches.
+	// The lookup table is used by both Resumable and Non-resumable modes.
+	static const lzma_lzma_state next_state[] = {
+		STATE_LIT_LIT,
+		STATE_LIT_LIT,
+		STATE_LIT_LIT,
+		STATE_LIT_LIT,
+		STATE_MATCH_LIT_LIT,
+		STATE_REP_LIT_LIT,
+		STATE_SHORTREP_LIT_LIT,
+		STATE_MATCH_LIT,
+		STATE_REP_LIT,
+		STATE_SHORTREP_LIT,
+		STATE_MATCH_LIT,
+		STATE_REP_LIT
+	};
+
+	// The main decoder loop. The "switch" is used to resume the decoder at
+	// correct location. Once resumed, the "switch" is no longer used.
+	// The decoder loops is split into two modes:
+	//
+	// 1 - Non-resumable mode (fast). This is used when it is guaranteed
+	//     there is enough input to decode the next symbol. If the output
+	//     limit is reached, then the decoder loop will save the place
+	//     for the resumable mode to continue. This mode is not used if
+	//     HAVE_SMALL is defined. This is faster than Resumable mode
+	//     because it reduces the number of branches needed and allows
+	//     for more compiler optimizations.
+	//
+	// 2 - Resumable mode (slow). This is used when a previous decoder
+	//     loop did not have enough space in the input or output buffers
+	//     to complete. It uses sequence enum values to set remind
+	//     coder->sequence where to resume in the decoder loop. This
+	//     is the only mode used when HAVE_SMALL is defined.
+
 	switch (coder->sequence)
 	while (true) {
 		// Calculate new pos_state. This is skipped on the first loop
@@ -318,13 +361,440 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		// variables.
 		pos_state = dict.pos & pos_mask;
 
+#ifndef HAVE_SMALL
+
+		///////////////////////////////
+		// Non-resumable Mode (fast) //
+		///////////////////////////////
+
+		// If there is not enough room for another LZMA symbol
+		// go to Resumable mode.
+		if (rc_in_pos + LZMA_IN_REQUIRED > in_size
+			|| dict.pos == dict.limit)
+			goto slow;
+
+		// Decode the first bit from the next LZMA symbol.
+		// If the bit is a 0, then we handle it as a literal.
+		// If the bit is a 1, then it is a match of previously
+		// decoded data.
+		rc_if_0(coder->is_match[state][pos_state]) {
+			/////////////////////
+			// Decode literal. //
+			/////////////////////
+
+			// Update the RC that we have decoded a 0.
+			rc_update_0(coder->is_match[state][pos_state]);
+
+			// Get the correct probability array from lp and
+			// lc params.
+			probs = literal_subcoder(coder->literal,
+					literal_context_bits, literal_pos_mask,
+					dict.pos, dict_get(&dict, 0));
+			symbol = 1;
+
+			if (is_literal_state(state)) {
+				// Decode literal without match byte.
+				// We need to decode 8 bits, so instead
+				// of looping from 1 - 8, we unroll the
+				// loop for a speed optimization.
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+			} else {
+				// Decode literal with match byte.
+				//
+				// We store the byte we compare against
+				// ("match byte") to "len" to minimize the
+				// number of variables we need to store
+				// between decoder calls.
+
+				len = (uint32_t)(dict_get(&dict, rep0)) << 1;
+
+				// The usage of "offset" allows omitting some
+				// branches, which should give tiny speed
+				// improvement on some CPUs. "offset" gets
+				// set to zero if match_bit didn't match.
+				offset = 0x100;
+
+				// Unroll the loop.
+				uint32_t match_bit;
+				uint32_t subcoder_index;
+
+#	define decode_with_match_bit \
+			match_bit = len & offset; \
+			subcoder_index = offset + match_bit + symbol; \
+			rc_bit(probs[subcoder_index], \
+					offset &= ~match_bit, \
+					offset &= match_bit)
+
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+#	undef decode_match_bit
+			}
+
+			state = next_state[state];
+
+			// Write decoded literal to dictionary
+			dict_put(&dict, symbol);
+			continue;
+		}
+
+		///////////////////
+		// Decode match. //
+		///////////////////
+
+		// Instead of a new byte we are going to decode a
+		// distance-length pair. The distance represents how far
+		// back in the dictionary to begin copying. The length
+		// represents how many bytes to copy.
+
+		rc_update_1(coder->is_match[state][pos_state]);
+
+		rc_if_0(coder->is_rep[state]) {
+			///////////////////
+			// Simple match. //
+			///////////////////
+
+			// Not a repeated match. In this case,
+			// the length (how many bytes to copy) must be
+			// decoded first. Then, the distance (where to
+			// start copying) is decoded.
+			//
+			// This is also how we know when we are done
+			// decoding. If the distance decodes to UINT32_MAX,
+			// then we know to stop decoding (end of payload
+			// marker).
+
+			rc_update_0(coder->is_rep[state]);
+			update_match(state);
+
+			// The latest three match distances are kept in
+			// memory in case there are repeated matches.
+			rep3 = rep2;
+			rep2 = rep1;
+			rep1 = rep0;
+
+			// Decode the length of the match.
+			len_decode_fast(len, coder->match_len_decoder,
+					pos_state);
+
+			// Next, decode the distance into rep0.
+
+			// The next 6 bits determine how to decode the
+			// rest of the distance.
+			probs = coder->dist_slot[get_dist_state(len)];
+			symbol = 1;
+
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+
+			// Get rid of the highest bit that was needed for
+			// indexing of the probability array.
+			symbol -= DIST_SLOTS;
+			assert(symbol <= 63);
+
+			if (symbol < DIST_MODEL_START) {
+				// If the decoded symbol is < DIST_MODEL_START
+				// then we use its value directly as the
+				// match distance. No other bits are needed.
+				// The only possible distance values
+				// are [0, 3].
+				rep0 = symbol;
+			} else {
+				// Use the first two bits of symbol as the
+				// highest bits of the match distance.
+
+				// "limit" represents the number of low bits
+				// to decode.
+				limit = (symbol >> 1) - 1;
+				assert(limit >= 1 && limit <= 30);
+				rep0 = 2 + (symbol & 1);
+
+				if (symbol < DIST_MODEL_END) {
+					// When symbol is > DIST_MODEL_START,
+					// but symbol < DIST_MODEL_END, then
+					// it can decode distances between
+					// [4, 127].
+					assert(limit <= 5);
+					rep0 <<= limit;
+					assert(rep0 <= 96);
+					// -1 is fine, because we start
+					// decoding at probs[1], not probs[0].
+					// NOTE: This violates the C standard,
+					// since we are doing pointer
+					// arithmetic past the beginning of
+					// the array.
+					assert((int32_t)(rep0 - symbol - 1)
+							>= -1);
+					assert((int32_t)(rep0 - symbol - 1)
+							<= 82);
+					probs = coder->pos_special + rep0
+							- symbol - 1;
+					symbol = 1;
+					offset = 0;
+
+					switch (limit) {
+					case 5:
+						assert(offset == 0);
+						rc_bit(probs[symbol], ,
+							rep0 += 1U);
+						++offset;
+						--limit;
+					case 4:
+						rc_bit(probs[symbol], ,
+							rep0 += 1U << offset);
+						++offset;
+						--limit;
+					case 3:
+						rc_bit(probs[symbol], ,
+							rep0 += 1U << offset);
+						++offset;
+						--limit;
+					case 2:
+						rc_bit(probs[symbol], ,
+							rep0 += 1U << offset);
+						++offset;
+						--limit;
+					case 1:
+						// We need "symbol" only for
+						// indexing the probability
+						// array, thus we can use
+						// rc_bit_last() here to
+						// omit the unneeded updating
+						// of "symbol".
+						rc_bit_last(probs[symbol], ,
+							rep0 += 1U << offset);
+					}
+				} else {
+					// The distance is >= 128. Decode the
+					// lower bits without probabilities
+					// except the lowest four bits.
+					assert(symbol >= 14);
+					assert(limit >= 6);
+					limit -= ALIGN_BITS;
+					assert(limit >= 2);
+
+					// Not worth manual unrolling
+					do {
+						rc_direct(rep0);
+					} while (--limit > 0);
+
+					// Decode the lowest four bits using
+					// probabilities.
+					rep0 <<= ALIGN_BITS;
+					symbol = 1;
+
+					rc_bit(coder->pos_align[symbol], ,
+							rep0 += 1);
+
+					rc_bit(coder->pos_align[symbol], ,
+							rep0 += 2);
+
+					rc_bit(coder->pos_align[symbol], ,
+							rep0 += 4);
+
+					// Like when distance [4, 127], we
+					// don't need "symbol" for anything
+					// other than indexing the probability
+					// array.
+					rc_bit_last(
+						coder->pos_align[symbol], ,
+						rep0 += 8);
+
+					if (rep0 == UINT32_MAX) {
+						///////////////////////////
+						// End of payload marker //
+						///////////////////////////
+
+						// End of payload marker was
+						// found. It may only be
+						// present if
+						//   - uncompressed size is
+						//     unknown or
+						//   - after known uncompressed
+						//     size amount of bytes has
+						//     been decompressed and
+						//     caller has indicated
+						//     that EOPM might be used
+						//     (it's not allowed in
+						//     LZMA2).
+						if (!eopm_is_valid) {
+							ret = LZMA_DATA_ERROR;
+							goto out;
+						}
+
+						// LZMA1 stream with
+						// end-of-payload marker.
+						rc_normalize();
+						ret = rc_is_finished(rc)
+							? LZMA_STREAM_END
+							: LZMA_DATA_ERROR;
+						goto out;
+					}
+				}
+			}
+
+			// Validate the distance we just decoded.
+			if (unlikely(!dict_is_distance_valid(&dict, rep0))) {
+				ret = LZMA_DATA_ERROR;
+				goto out;
+			}
+
+		} else {
+			rc_update_1(coder->is_rep[state]);
+
+			/////////////////////
+			// Repeated match. //
+			/////////////////////
+
+			// The match distance is a value that we have decoded
+			// recently. The latest four match distances are
+			// available as rep0, rep1, rep2 and rep3. We will
+			// now decode which of them is the new distance.
+			//
+			// There cannot be a match if we haven't produced
+			// any output, so check that first.
+			if (unlikely(!dict_is_distance_valid(&dict, 0))) {
+				ret = LZMA_DATA_ERROR;
+				goto out;
+			}
+
+			rc_if_0(coder->is_rep0[state]) {
+				rc_update_0(coder->is_rep0[state]);
+				// The distance is rep0.
+
+				// Decode the next bit to determine if 1 byte
+				// should be copied from rep0 distance or
+				// if the number of bytes needs to be decoded.
+
+				// If the next bit is 0, then it is a
+				// "Short Rep Match" and only 1 bit is copied.
+				// Otherwise, the length of the match is
+				// decoded after the "else" statement.
+				rc_if_0(coder->is_rep0_long[state][pos_state]) {
+					rc_update_0(coder->is_rep0_long[
+							state][pos_state]);
+
+					update_short_rep(state);
+					dict_put(&dict, dict_get(&dict, rep0));
+					continue;
+				}
+
+				// Repeating more than one byte at
+				// distance of rep0.
+				rc_update_1(coder->is_rep0_long[
+						state][pos_state]);
+
+			} else {
+				rc_update_1(coder->is_rep0[state]);
+
+				// The distance is rep1, rep2 or rep3. Once
+				// we find out which one of these three, it
+				// is stored to rep0 and rep1, rep2 and rep3
+				// are updated accordingly. There is no
+				// "Short Rep Match" option, so the length
+				// of the match must always be decoded next.
+				rc_if_0(coder->is_rep1[state]) {
+					// The distance is rep1.
+					rc_update_0(coder->is_rep1[state]);
+
+					const uint32_t distance = rep1;
+					rep1 = rep0;
+					rep0 = distance;
+
+				} else {
+					rc_update_1(coder->is_rep1[state]);
+
+					rc_if_0(coder->is_rep2[state]) {
+						// The distance is rep2.
+						rc_update_0(coder->is_rep2[
+								state]);
+
+						const uint32_t distance = rep2;
+						rep2 = rep1;
+						rep1 = rep0;
+						rep0 = distance;
+
+					} else {
+						// The distance is rep3.
+						rc_update_1(coder->is_rep2[
+								state]);
+
+						const uint32_t distance = rep3;
+						rep3 = rep2;
+						rep2 = rep1;
+						rep1 = rep0;
+						rep0 = distance;
+					}
+				}
+			}
+
+			update_long_rep(state);
+
+			// Decode the length of the repeated match.
+			len_decode_fast(len, coder->rep_len_decoder,
+					pos_state);
+		}
+
+		/////////////////////////////////
+		// Repeat from history buffer. //
+		/////////////////////////////////
+
+		// The length is always between these limits. There is no way
+		// to trigger the algorithm to set len outside this range.
+		assert(len >= MATCH_LEN_MIN);
+		assert(len <= MATCH_LEN_MAX);
+
+		// Repeat len bytes from distance of rep0.
+		if (unlikely(dict_repeat(&dict, rep0, &len))) {
+			coder->sequence = SEQ_COPY;
+			goto out;
+		}
+
+		continue;
+
+slow:
+#endif
+	///////////////////////////
+	// Resumable Mode (slow) //
+	///////////////////////////
+
+	// This is very similar to Non-resumable Mode, so most of the
+	// comments are not repeated. The main differences are:
+	// - case labels are used to resume at the correct location.
+	// - Loops are not unrolled.
+	// - Range coder macros take an extra sequence argument
+	//   so they can save to coder->sequence the location to
+	//   resume in case there is not enough input.
 	case SEQ_NORMALIZE:
 	case SEQ_IS_MATCH:
 		if (unlikely(might_finish_without_eopm
 				&& dict.pos == dict.limit)) {
 			// In rare cases there is a useless byte that needs
 			// to be read anyway.
-			rc_normalize(SEQ_NORMALIZE);
+			rc_normalize_safe(SEQ_NORMALIZE);
 
 			// If the range decoder state is such that we can
 			// be at the end of the LZMA stream, then the
@@ -347,10 +817,12 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 			eopm_is_valid = true;
 		}
 
-		rc_if_0(coder->is_match[state][pos_state], SEQ_IS_MATCH) {
-			rc_update_0(coder->is_match[state][pos_state]);
+		rc_if_0_safe(coder->is_match[state][pos_state], SEQ_IS_MATCH) {
+			/////////////////////
+			// Decode literal. //
+			/////////////////////
 
-			// It's a literal i.e. a single 8-bit byte.
+			rc_update_0(coder->is_match[state][pos_state]);
 
 			probs = literal_subcoder(coder->literal,
 					literal_context_bits, literal_pos_mask,
@@ -359,37 +831,19 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 
 			if (is_literal_state(state)) {
 				// Decode literal without match byte.
-#ifdef HAVE_SMALL
+				// The "slow" version does not unroll
+				// the loop.
 	case SEQ_LITERAL:
 				do {
-					rc_bit(probs[symbol], , , SEQ_LITERAL);
+					rc_bit_safe(probs[symbol], , ,
+							SEQ_LITERAL);
 				} while (symbol < (1 << 8));
-#else
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL0);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL1);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL2);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL3);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL4);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL5);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL6);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL7);
-#endif
 			} else {
 				// Decode literal with match byte.
-				//
-				// We store the byte we compare against
-				// ("match byte") to "len" to minimize the
-				// number of variables we need to store
-				// between decoder calls.
 				len = (uint32_t)(dict_get(&dict, rep0)) << 1;
 
-				// The usage of "offset" allows omitting some
-				// branches, which should give tiny speed
-				// improvement on some CPUs. "offset" gets
-				// set to zero if match_bit didn't match.
 				offset = 0x100;
 
-#ifdef HAVE_SMALL
 	case SEQ_LITERAL_MATCHED:
 				do {
 					const uint32_t match_bit
@@ -398,7 +852,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 							= offset + match_bit
 							+ symbol;
 
-					rc_bit(probs[subcoder_index],
+					rc_bit_safe(probs[subcoder_index],
 							offset &= ~match_bit,
 							offset &= match_bit,
 							SEQ_LITERAL_MATCHED);
@@ -411,61 +865,12 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					len <<= 1;
 
 				} while (symbol < (1 << 8));
-#else
-				// Unroll the loop.
-				uint32_t match_bit;
-				uint32_t subcoder_index;
-
-#	define d(seq) \
-		case seq: \
-			match_bit = len & offset; \
-			subcoder_index = offset + match_bit + symbol; \
-			rc_bit(probs[subcoder_index], \
-					offset &= ~match_bit, \
-					offset &= match_bit, \
-					seq)
-
-				d(SEQ_LITERAL_MATCHED0);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED1);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED2);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED3);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED4);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED5);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED6);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED7);
-#	undef d
-#endif
 			}
 
-			//update_literal(state);
-			// Use a lookup table to update to literal state,
-			// since compared to other state updates, this would
-			// need two branches.
-			static const lzma_lzma_state next_state[] = {
-				STATE_LIT_LIT,
-				STATE_LIT_LIT,
-				STATE_LIT_LIT,
-				STATE_LIT_LIT,
-				STATE_MATCH_LIT_LIT,
-				STATE_REP_LIT_LIT,
-				STATE_SHORTREP_LIT_LIT,
-				STATE_MATCH_LIT,
-				STATE_REP_LIT,
-				STATE_SHORTREP_LIT,
-				STATE_MATCH_LIT,
-				STATE_REP_LIT
-			};
 			state = next_state[state];
 
 	case SEQ_LITERAL_WRITE:
-			if (unlikely(dict_put(&dict, symbol))) {
+			if (dict_put_safe(&dict, symbol)) {
 				coder->sequence = SEQ_LITERAL_WRITE;
 				goto out;
 			}
@@ -473,64 +878,47 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 			continue;
 		}
 
-		// Instead of a new byte we are going to get a byte range
-		// (distance and length) which will be repeated from our
-		// output history.
+		///////////////////
+		// Decode match. //
+		///////////////////
 
 		rc_update_1(coder->is_match[state][pos_state]);
 
 	case SEQ_IS_REP:
-		rc_if_0(coder->is_rep[state], SEQ_IS_REP) {
-			// Not a repeated match
+		rc_if_0_safe(coder->is_rep[state], SEQ_IS_REP) {
+			///////////////////
+			// Simple match. //
+			///////////////////
+
 			rc_update_0(coder->is_rep[state]);
 			update_match(state);
 
-			// The latest three match distances are kept in
-			// memory in case there are repeated matches.
 			rep3 = rep2;
 			rep2 = rep1;
 			rep1 = rep0;
 
-			// Decode the length of the match.
 			len_decode(len, coder->match_len_decoder,
 					pos_state, SEQ_MATCH_LEN);
 
-			// Prepare to decode the highest two bits of the
-			// match distance.
 			probs = coder->dist_slot[get_dist_state(len)];
 			symbol = 1;
 
-#ifdef HAVE_SMALL
 	case SEQ_DIST_SLOT:
 			do {
-				rc_bit(probs[symbol], , , SEQ_DIST_SLOT);
+				rc_bit_safe(probs[symbol], , , SEQ_DIST_SLOT);
 			} while (symbol < DIST_SLOTS);
-#else
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT0);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT1);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT2);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT3);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT4);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT5);
-#endif
-			// Get rid of the highest bit that was needed for
-			// indexing of the probability array.
+
 			symbol -= DIST_SLOTS;
 			assert(symbol <= 63);
 
 			if (symbol < DIST_MODEL_START) {
-				// Match distances [0, 3] have only two bits.
 				rep0 = symbol;
 			} else {
-				// Decode the lowest [1, 29] bits of
-				// the match distance.
 				limit = (symbol >> 1) - 1;
 				assert(limit >= 1 && limit <= 30);
 				rep0 = 2 + (symbol & 1);
 
 				if (symbol < DIST_MODEL_END) {
-					// Prepare to decode the low bits for
-					// a distance of [4, 127].
 					assert(limit <= 5);
 					rep0 <<= limit;
 					assert(rep0 <= 96);
@@ -549,118 +937,43 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					symbol = 1;
 					offset = 0;
 	case SEQ_DIST_MODEL:
-#ifdef HAVE_SMALL
 					do {
-						rc_bit(probs[symbol], ,
+						rc_bit_safe(probs[symbol], ,
 							rep0 += 1U << offset,
 							SEQ_DIST_MODEL);
 					} while (++offset < limit);
-#else
-					switch (limit) {
-					case 5:
-						assert(offset == 0);
-						rc_bit(probs[symbol], ,
-							rep0 += 1U,
-							SEQ_DIST_MODEL);
-						++offset;
-						--limit;
-					case 4:
-						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
-						++offset;
-						--limit;
-					case 3:
-						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
-						++offset;
-						--limit;
-					case 2:
-						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
-						++offset;
-						--limit;
-					case 1:
-						// We need "symbol" only for
-						// indexing the probability
-						// array, thus we can use
-						// rc_bit_last() here to omit
-						// the unneeded updating of
-						// "symbol".
-						rc_bit_last(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
-					}
-#endif
 				} else {
-					// The distance is >= 128. Decode the
-					// lower bits without probabilities
-					// except the lowest four bits.
 					assert(symbol >= 14);
 					assert(limit >= 6);
 					limit -= ALIGN_BITS;
 					assert(limit >= 2);
 	case SEQ_DIRECT:
-					// Not worth manual unrolling
 					do {
-						rc_direct(rep0, SEQ_DIRECT);
+						rc_direct_safe(rep0,
+								SEQ_DIRECT);
 					} while (--limit > 0);
 
-					// Decode the lowest four bits using
-					// probabilities.
 					rep0 <<= ALIGN_BITS;
 					symbol = 1;
-#ifdef HAVE_SMALL
+
 					offset = 0;
 	case SEQ_ALIGN:
 					do {
-						rc_bit(coder->pos_align[
+						rc_bit_safe(coder->pos_align[
 								symbol], ,
 							rep0 += 1U << offset,
 							SEQ_ALIGN);
 					} while (++offset < ALIGN_BITS);
-#else
-	case SEQ_ALIGN0:
-					rc_bit(coder->pos_align[symbol], ,
-							rep0 += 1, SEQ_ALIGN0);
-	case SEQ_ALIGN1:
-					rc_bit(coder->pos_align[symbol], ,
-							rep0 += 2, SEQ_ALIGN1);
-	case SEQ_ALIGN2:
-					rc_bit(coder->pos_align[symbol], ,
-							rep0 += 4, SEQ_ALIGN2);
-	case SEQ_ALIGN3:
-					// Like in SEQ_DIST_MODEL, we don't
-					// need "symbol" for anything else
-					// than indexing the probability array.
-					rc_bit_last(coder->pos_align[symbol], ,
-							rep0 += 8, SEQ_ALIGN3);
-#endif
 
+					// End of payload marker
 					if (rep0 == UINT32_MAX) {
-						// End of payload marker was
-						// found. It may only be
-						// present if
-						//   - uncompressed size is
-						//     unknown or
-						//   - after known uncompressed
-						//     size amount of bytes has
-						//     been decompressed and
-						//     caller has indicated
-						//     that EOPM might be used
-						//     (it's not allowed in
-						//     LZMA2).
 						if (!eopm_is_valid) {
 							ret = LZMA_DATA_ERROR;
 							goto out;
 						}
 
 	case SEQ_EOPM:
-						// LZMA1 stream with
-						// end-of-payload marker.
-						rc_normalize(SEQ_EOPM);
+						rc_normalize_safe(SEQ_EOPM);
 						ret = rc_is_finished(rc)
 							? LZMA_STREAM_END
 							: LZMA_DATA_ERROR;
@@ -669,36 +982,30 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				}
 			}
 
-			// Validate the distance we just decoded.
 			if (unlikely(!dict_is_distance_valid(&dict, rep0))) {
 				ret = LZMA_DATA_ERROR;
 				goto out;
 			}
 
 		} else {
+			/////////////////////
+			// Repeated match. //
+			/////////////////////
+
 			rc_update_1(coder->is_rep[state]);
 
-			// Repeated match
-			//
-			// The match distance is a value that we have had
-			// earlier. The latest four match distances are
-			// available as rep0, rep1, rep2 and rep3. We will
-			// now decode which of them is the new distance.
-			//
-			// There cannot be a match if we haven't produced
-			// any output, so check that first.
 			if (unlikely(!dict_is_distance_valid(&dict, 0))) {
 				ret = LZMA_DATA_ERROR;
 				goto out;
 			}
 
 	case SEQ_IS_REP0:
-			rc_if_0(coder->is_rep0[state], SEQ_IS_REP0) {
+			rc_if_0_safe(coder->is_rep0[state], SEQ_IS_REP0) {
 				rc_update_0(coder->is_rep0[state]);
-				// The distance is rep0.
 
 	case SEQ_IS_REP0_LONG:
-				rc_if_0(coder->is_rep0_long[state][pos_state],
+				rc_if_0_safe(coder->is_rep0_long
+						[state][pos_state],
 						SEQ_IS_REP0_LONG) {
 					rc_update_0(coder->is_rep0_long[
 							state][pos_state]);
@@ -706,8 +1013,9 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					update_short_rep(state);
 
 	case SEQ_SHORTREP:
-					if (unlikely(dict_put(&dict, dict_get(
-							&dict, rep0)))) {
+					if (dict_put_safe(&dict,
+							dict_get(&dict,
+							rep0))) {
 						coder->sequence = SEQ_SHORTREP;
 						goto out;
 					}
@@ -715,8 +1023,6 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					continue;
 				}
 
-				// Repeating more than one byte at
-				// distance of rep0.
 				rc_update_1(coder->is_rep0_long[
 						state][pos_state]);
 
@@ -724,11 +1030,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				rc_update_1(coder->is_rep0[state]);
 
 	case SEQ_IS_REP1:
-				// The distance is rep1, rep2 or rep3. Once
-				// we find out which one of these three, it
-				// is stored to rep0 and rep1, rep2 and rep3
-				// are updated accordingly.
-				rc_if_0(coder->is_rep1[state], SEQ_IS_REP1) {
+				rc_if_0_safe(coder->is_rep1[state], SEQ_IS_REP1) {
 					rc_update_0(coder->is_rep1[state]);
 
 					const uint32_t distance = rep1;
@@ -738,7 +1040,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				} else {
 					rc_update_1(coder->is_rep1[state]);
 	case SEQ_IS_REP2:
-					rc_if_0(coder->is_rep2[state],
+					rc_if_0_safe(coder->is_rep2[state],
 							SEQ_IS_REP2) {
 						rc_update_0(coder->is_rep2[
 								state]);
@@ -763,7 +1065,6 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 
 			update_long_rep(state);
 
-			// Decode the length of the repeated match.
 			len_decode(len, coder->rep_len_decoder,
 					pos_state, SEQ_REP_LEN);
 		}
@@ -772,13 +1073,10 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		// Repeat from history buffer. //
 		/////////////////////////////////
 
-		// The length is always between these limits. There is no way
-		// to trigger the algorithm to set len outside this range.
 		assert(len >= MATCH_LEN_MIN);
 		assert(len <= MATCH_LEN_MAX);
 
 	case SEQ_COPY:
-		// Repeat len bytes from distance of rep0.
 		if (unlikely(dict_repeat(&dict, rep0, &len))) {
 			coder->sequence = SEQ_COPY;
 			goto out;