mirror of https://git.tukaani.org/xz.git
liblzma: Add optional autodetection of LZMA end marker.
Turns out that this is needed for .lzma files as the spec in LZMA SDK says that end marker may be present even if the size is stored in the header. Such files are rare but exist in the real world. The code in liblzma is so old that the spec didn't exist in LZMA SDK back then and I had understood that such files weren't possible (the lzma tool in LZMA SDK didn't create such files). This modifies the internal API so that LZMA decoder can be told if EOPM is allowed even when the uncompressed size is known. It's allowed with .lzma and not with other uses. Thanks to Karl Beldan for reporting the problem.
This commit is contained in:
parent
0c0f8e9761
commit
9595a3119b
|
@ -40,7 +40,11 @@ The .lzma File Format
|
||||||
|
|
||||||
0.2. Changes
|
0.2. Changes
|
||||||
|
|
||||||
Last modified: 2011-04-12 11:55+0300
|
Last modified: 2022-07-13 21:00+0300
|
||||||
|
|
||||||
|
Compared to the previous version (2011-04-12 11:55+0300)
|
||||||
|
the section 1.1.3 was modified to allow End of Payload Marker
|
||||||
|
with a known Uncompressed Size.
|
||||||
|
|
||||||
|
|
||||||
1. File Format
|
1. File Format
|
||||||
|
@ -129,7 +133,10 @@ The .lzma File Format
|
||||||
Uncompressed Size is stored as unsigned 64-bit little endian
|
Uncompressed Size is stored as unsigned 64-bit little endian
|
||||||
integer. A special value of 0xFFFF_FFFF_FFFF_FFFF indicates
|
integer. A special value of 0xFFFF_FFFF_FFFF_FFFF indicates
|
||||||
that Uncompressed Size is unknown. End of Payload Marker (*)
|
that Uncompressed Size is unknown. End of Payload Marker (*)
|
||||||
is used if and only if Uncompressed Size is unknown.
|
is used if Uncompressed Size is unknown. End of Payload Marker
|
||||||
|
is allowed but rarely used if Uncompressed Size is known.
|
||||||
|
XZ Utils 5.2.5 and older don't support .lzma files that have
|
||||||
|
End of Payload Marker together with a known Uncompressed Size.
|
||||||
|
|
||||||
XZ Utils rejects files whose Uncompressed Size field specifies
|
XZ Utils rejects files whose Uncompressed Size field specifies
|
||||||
a known size that is 256 GiB or more. This is to reject false
|
a known size that is 256 GiB or more. This is to reject false
|
||||||
|
|
|
@ -146,7 +146,7 @@ alone_decode(void *coder_ptr, const lzma_allocator *allocator,
|
||||||
|
|
||||||
// Use a hack to set the uncompressed size.
|
// Use a hack to set the uncompressed size.
|
||||||
lzma_lz_decoder_uncompressed(coder->next.coder,
|
lzma_lz_decoder_uncompressed(coder->next.coder,
|
||||||
coder->uncompressed_size);
|
coder->uncompressed_size, true);
|
||||||
|
|
||||||
coder->sequence = SEQ_CODE;
|
coder->sequence = SEQ_CODE;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -108,7 +108,7 @@ microlzma_decode(void *coder_ptr, const lzma_allocator *allocator,
|
||||||
// Use a hack to set the uncompressed size.
|
// Use a hack to set the uncompressed size.
|
||||||
if (coder->uncomp_size_is_exact)
|
if (coder->uncomp_size_is_exact)
|
||||||
lzma_lz_decoder_uncompressed(coder->lzma.coder,
|
lzma_lz_decoder_uncompressed(coder->lzma.coder,
|
||||||
coder->uncomp_size);
|
coder->uncomp_size, false);
|
||||||
|
|
||||||
// Pass one dummy 0x00 byte to the LZMA decoder since that
|
// Pass one dummy 0x00 byte to the LZMA decoder since that
|
||||||
// is what it expects the first byte to be.
|
// is what it expects the first byte to be.
|
||||||
|
|
|
@ -304,8 +304,14 @@ lzma_lz_decoder_memusage(size_t dictionary_size)
|
||||||
|
|
||||||
|
|
||||||
extern void
|
extern void
|
||||||
lzma_lz_decoder_uncompressed(void *coder_ptr, lzma_vli uncompressed_size)
|
lzma_lz_decoder_uncompressed(void *coder_ptr, lzma_vli uncompressed_size,
|
||||||
|
bool allow_eopm)
|
||||||
{
|
{
|
||||||
lzma_coder *coder = coder_ptr;
|
lzma_coder *coder = coder_ptr;
|
||||||
coder->lz.set_uncompressed(coder->lz.coder, uncompressed_size);
|
|
||||||
|
if (uncompressed_size == LZMA_VLI_UNKNOWN)
|
||||||
|
allow_eopm = true;
|
||||||
|
|
||||||
|
coder->lz.set_uncompressed(coder->lz.coder, uncompressed_size,
|
||||||
|
allow_eopm);
|
||||||
}
|
}
|
||||||
|
|
|
@ -62,8 +62,10 @@ typedef struct {
|
||||||
|
|
||||||
void (*reset)(void *coder, const void *options);
|
void (*reset)(void *coder, const void *options);
|
||||||
|
|
||||||
/// Set the uncompressed size
|
/// Set the uncompressed size. If uncompressed_size == LZMA_VLI_UNKNOWN
|
||||||
void (*set_uncompressed)(void *coder, lzma_vli uncompressed_size);
|
/// then allow_eopm will always be true.
|
||||||
|
void (*set_uncompressed)(void *coder, lzma_vli uncompressed_size,
|
||||||
|
bool allow_eopm);
|
||||||
|
|
||||||
/// Free allocated resources
|
/// Free allocated resources
|
||||||
void (*end)(void *coder, const lzma_allocator *allocator);
|
void (*end)(void *coder, const lzma_allocator *allocator);
|
||||||
|
@ -91,7 +93,7 @@ extern lzma_ret lzma_lz_decoder_init(lzma_next_coder *next,
|
||||||
extern uint64_t lzma_lz_decoder_memusage(size_t dictionary_size);
|
extern uint64_t lzma_lz_decoder_memusage(size_t dictionary_size);
|
||||||
|
|
||||||
extern void lzma_lz_decoder_uncompressed(
|
extern void lzma_lz_decoder_uncompressed(
|
||||||
void *coder, lzma_vli uncompressed_size);
|
void *coder, lzma_vli uncompressed_size, bool allow_eopm);
|
||||||
|
|
||||||
|
|
||||||
//////////////////////
|
//////////////////////
|
||||||
|
|
|
@ -139,7 +139,7 @@ lzma2_decode(void *coder_ptr, lzma_dict *restrict dict,
|
||||||
coder->uncompressed_size += in[(*in_pos)++] + 1U;
|
coder->uncompressed_size += in[(*in_pos)++] + 1U;
|
||||||
coder->sequence = SEQ_COMPRESSED_0;
|
coder->sequence = SEQ_COMPRESSED_0;
|
||||||
coder->lzma.set_uncompressed(coder->lzma.coder,
|
coder->lzma.set_uncompressed(coder->lzma.coder,
|
||||||
coder->uncompressed_size);
|
coder->uncompressed_size, false);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case SEQ_COMPRESSED_0:
|
case SEQ_COMPRESSED_0:
|
||||||
|
|
|
@ -238,6 +238,11 @@ typedef struct {
|
||||||
/// payload marker is expected.
|
/// payload marker is expected.
|
||||||
lzma_vli uncompressed_size;
|
lzma_vli uncompressed_size;
|
||||||
|
|
||||||
|
/// True if end of payload marker (EOPM) is allowed even when
|
||||||
|
/// uncompressed_size is known; false if EOPM must not be present.
|
||||||
|
/// This is ignored if uncompressed_size == LZMA_VLI_UNKNOWN.
|
||||||
|
bool allow_eopm;
|
||||||
|
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// State of incomplete symbol //
|
// State of incomplete symbol //
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
|
@ -343,12 +348,19 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
|
||||||
|
|
||||||
lzma_ret ret = LZMA_OK;
|
lzma_ret ret = LZMA_OK;
|
||||||
|
|
||||||
// If uncompressed size is known, there must be no end of payload
|
// EOPM is always required (not just allowed) when
|
||||||
// marker.
|
// the uncompressed size isn't known.
|
||||||
const bool no_eopm = coder->uncompressed_size
|
bool eopm_allowed = coder->uncompressed_size == LZMA_VLI_UNKNOWN;
|
||||||
!= LZMA_VLI_UNKNOWN;
|
|
||||||
if (no_eopm && coder->uncompressed_size < dict.limit - dict.pos)
|
// If uncompressed size is known and there is enough output space
|
||||||
|
// to decode all the data, limit the available buffer space so that
|
||||||
|
// the main loop won't try to decode past the end of the stream.
|
||||||
|
bool might_finish_without_eopm = false;
|
||||||
|
if (coder->uncompressed_size != LZMA_VLI_UNKNOWN
|
||||||
|
&& coder->uncompressed_size <= dict.limit - dict.pos) {
|
||||||
dict.limit = dict.pos + (size_t)(coder->uncompressed_size);
|
dict.limit = dict.pos + (size_t)(coder->uncompressed_size);
|
||||||
|
might_finish_without_eopm = true;
|
||||||
|
}
|
||||||
|
|
||||||
// The main decoder loop. The "switch" is used to restart the decoder at
|
// The main decoder loop. The "switch" is used to restart the decoder at
|
||||||
// correct location. Once restarted, the "switch" is no longer used.
|
// correct location. Once restarted, the "switch" is no longer used.
|
||||||
|
@ -361,8 +373,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
|
||||||
|
|
||||||
case SEQ_NORMALIZE:
|
case SEQ_NORMALIZE:
|
||||||
case SEQ_IS_MATCH:
|
case SEQ_IS_MATCH:
|
||||||
if (unlikely(no_eopm && dict.pos == dict.limit))
|
if (unlikely(might_finish_without_eopm
|
||||||
break;
|
&& dict.pos == dict.limit)) {
|
||||||
|
// In rare cases there is a useless byte that needs
|
||||||
|
// to be read anyway.
|
||||||
|
rc_normalize(SEQ_NORMALIZE);
|
||||||
|
|
||||||
|
// If the range decoder state is such that we can
|
||||||
|
// be at the end of the LZMA stream, then the
|
||||||
|
// decoding is finished.
|
||||||
|
if (rc_is_finished(rc)) {
|
||||||
|
ret = LZMA_STREAM_END;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the caller hasn't allowed EOPM to be present
|
||||||
|
// together with known uncompressed size, then the
|
||||||
|
// LZMA stream is corrupt.
|
||||||
|
if (!coder->allow_eopm) {
|
||||||
|
ret = LZMA_DATA_ERROR;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Otherwise continue decoding with the expectation
|
||||||
|
// that the next LZMA symbol is EOPM.
|
||||||
|
eopm_allowed = true;
|
||||||
|
}
|
||||||
|
|
||||||
rc_if_0(coder->is_match[state][pos_state], SEQ_IS_MATCH) {
|
rc_if_0(coder->is_match[state][pos_state], SEQ_IS_MATCH) {
|
||||||
rc_update_0(coder->is_match[state][pos_state]);
|
rc_update_0(coder->is_match[state][pos_state]);
|
||||||
|
@ -658,11 +694,18 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
|
||||||
|
|
||||||
if (rep0 == UINT32_MAX) {
|
if (rep0 == UINT32_MAX) {
|
||||||
// End of payload marker was
|
// End of payload marker was
|
||||||
// found. It must not be
|
// found. It may only be
|
||||||
// present if uncompressed
|
// present if
|
||||||
// size is known.
|
// - uncompressed size is
|
||||||
if (coder->uncompressed_size
|
// unknown or
|
||||||
!= LZMA_VLI_UNKNOWN) {
|
// - after known uncompressed
|
||||||
|
// size amount of bytes has
|
||||||
|
// been decompressed and
|
||||||
|
// caller has indicated
|
||||||
|
// that EOPM might be used
|
||||||
|
// (it's not allowed in
|
||||||
|
// LZMA2).
|
||||||
|
if (!eopm_allowed) {
|
||||||
ret = LZMA_DATA_ERROR;
|
ret = LZMA_DATA_ERROR;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
@ -671,7 +714,9 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
|
||||||
// LZMA1 stream with
|
// LZMA1 stream with
|
||||||
// end-of-payload marker.
|
// end-of-payload marker.
|
||||||
rc_normalize(SEQ_EOPM);
|
rc_normalize(SEQ_EOPM);
|
||||||
ret = LZMA_STREAM_END;
|
ret = rc_is_finished(rc)
|
||||||
|
? LZMA_STREAM_END
|
||||||
|
: LZMA_DATA_ERROR;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -793,9 +838,6 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
rc_normalize(SEQ_NORMALIZE);
|
|
||||||
coder->sequence = SEQ_IS_MATCH;
|
|
||||||
|
|
||||||
out:
|
out:
|
||||||
// Save state
|
// Save state
|
||||||
|
|
||||||
|
@ -822,24 +864,21 @@ out:
|
||||||
if (coder->uncompressed_size != LZMA_VLI_UNKNOWN) {
|
if (coder->uncompressed_size != LZMA_VLI_UNKNOWN) {
|
||||||
coder->uncompressed_size -= dict.pos - dict_start;
|
coder->uncompressed_size -= dict.pos - dict_start;
|
||||||
|
|
||||||
// Since there cannot be end of payload marker if the
|
// If we have gotten all the output but the decoder wants
|
||||||
// uncompressed size was known, we check here if we
|
// to write more output, the file is corrupt. There are
|
||||||
// finished decoding.
|
// three SEQ values where output is produced.
|
||||||
if (coder->uncompressed_size == 0 && ret == LZMA_OK
|
if (coder->uncompressed_size == 0 && ret == LZMA_OK
|
||||||
&& coder->sequence != SEQ_NORMALIZE)
|
&& (coder->sequence == SEQ_LITERAL_WRITE
|
||||||
ret = coder->sequence == SEQ_IS_MATCH
|
|| coder->sequence == SEQ_SHORTREP
|
||||||
? LZMA_STREAM_END : LZMA_DATA_ERROR;
|
|| coder->sequence == SEQ_COPY))
|
||||||
|
ret = LZMA_DATA_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
// We can do an additional check in the range decoder to catch some
|
|
||||||
// corrupted files.
|
|
||||||
if (ret == LZMA_STREAM_END) {
|
if (ret == LZMA_STREAM_END) {
|
||||||
if (!rc_is_finished(coder->rc))
|
|
||||||
ret = LZMA_DATA_ERROR;
|
|
||||||
|
|
||||||
// Reset the range decoder so that it is ready to reinitialize
|
// Reset the range decoder so that it is ready to reinitialize
|
||||||
// for a new LZMA2 chunk.
|
// for a new LZMA2 chunk.
|
||||||
rc_reset(coder->rc);
|
rc_reset(coder->rc);
|
||||||
|
coder->sequence = SEQ_IS_MATCH;
|
||||||
}
|
}
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
|
@ -848,10 +887,12 @@ out:
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
lzma_decoder_uncompressed(void *coder_ptr, lzma_vli uncompressed_size)
|
lzma_decoder_uncompressed(void *coder_ptr, lzma_vli uncompressed_size,
|
||||||
|
bool allow_eopm)
|
||||||
{
|
{
|
||||||
lzma_lzma1_decoder *coder = coder_ptr;
|
lzma_lzma1_decoder *coder = coder_ptr;
|
||||||
coder->uncompressed_size = uncompressed_size;
|
coder->uncompressed_size = uncompressed_size;
|
||||||
|
coder->allow_eopm = allow_eopm;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -977,7 +1018,7 @@ lzma_decoder_init(lzma_lz_decoder *lz, const lzma_allocator *allocator,
|
||||||
lz, allocator, options, lz_options));
|
lz, allocator, options, lz_options));
|
||||||
|
|
||||||
lzma_decoder_reset(lz->coder, options);
|
lzma_decoder_reset(lz->coder, options);
|
||||||
lzma_decoder_uncompressed(lz->coder, LZMA_VLI_UNKNOWN);
|
lzma_decoder_uncompressed(lz->coder, LZMA_VLI_UNKNOWN, true);
|
||||||
|
|
||||||
return LZMA_OK;
|
return LZMA_OK;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue