xz/src/liblzma/common/block_buffer_encoder.c

355 lines
11 KiB
C
Raw Normal View History

// SPDX-License-Identifier: 0BSD
///////////////////////////////////////////////////////////////////////////////
//
/// \file block_buffer_encoder.c
/// \brief Single-call .xz Block encoder
//
// Author: Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////
#include "block_buffer_encoder.h"
#include "block_encoder.h"
#include "filter_encoder.h"
#include "lzma2_encoder.h"
#include "check.h"
/// \brief      Upper bound for the Block Header and Check fields
///
/// Estimate the maximum size of the Block Header and Check fields for
/// a Block that uses LZMA2 uncompressed chunks. We could use
/// lzma_block_header_size() but this is simpler.
///
/// The terms of the sum are, in order:
///   - 1: Block Header Size
///   - 1: Block Flags
///   - 2 * LZMA_VLI_BYTES_MAX: Compressed Size and Uncompressed Size
///   - 3: Filter Flags for LZMA2
///   - 4: CRC32
///   - LZMA_CHECK_SIZE_MAX: Check
///
/// The trailing "+ 3) & ~3" rounds the sum up to the next multiple of
/// four to take Header Padding into account.
#define HEADERS_BOUND ((1 + 1 + 2 * LZMA_VLI_BYTES_MAX + 3 + 4 \
+ LZMA_CHECK_SIZE_MAX + 3) & ~3)
/// \brief      Size of the data when stored as LZMA2 uncompressed chunks
///
/// \param      uncompressed_size   Size of the input data in bytes
///
/// \return     uncompressed_size plus the size of the chunk headers and
///             the end marker, or 0 if uncompressed_size or the result
///             would exceed COMPRESSED_SIZE_MAX.
static uint64_t
lzma2_bound(uint64_t uncompressed_size)
{
	// Reject oversized input up front so that the rounding done below
	// cannot overflow.
	if (uncompressed_size > COMPRESSED_SIZE_MAX)
		return 0;

	// Number of chunks needed, i.e. uncompressed_size divided by
	// LZMA2_CHUNK_MAX and rounded up.
	const uint64_t chunk_count
			= (uncompressed_size + LZMA2_CHUNK_MAX - 1)
			/ LZMA2_CHUNK_MAX;

	// Each chunk has a header of LZMA2_HEADER_UNCOMPRESSED bytes and
	// the end marker takes one more byte.
	const uint64_t overhead
			= chunk_count * LZMA2_HEADER_UNCOMPRESSED + 1;

	// The sum has to stay within COMPRESSED_SIZE_MAX. Comparing against
	// the remaining headroom avoids overflowing the addition itself.
	if (uncompressed_size > COMPRESSED_SIZE_MAX - overhead)
		return 0;

	return uncompressed_size + overhead;
}
/// \brief      64-bit upper bound for the output of the single-call
///             Block encoder
///
/// \param      uncompressed_size   Size of the input data in bytes
///
/// \return     HEADERS_BOUND plus the padded lzma2_bound() value, or 0 if
///             lzma2_bound() rejected uncompressed_size.
extern uint64_t
lzma_block_buffer_bound64(uint64_t uncompressed_size)
{
	// Incompressible data is always stored as uncompressed LZMA2
	// chunks, so that encoding determines the worst case.
	const uint64_t data_bound = lzma2_bound(uncompressed_size);
	if (data_bound == 0)
		return 0;

	// Block Padding: round up to the next multiple of four.
	const uint64_t padded_bound = (data_bound + 3) & ~UINT64_C(3);

	// This addition cannot overflow: lzma2_bound() already takes into
	// account the size of the headers in the Block.
	return HEADERS_BOUND + padded_bound;
}
/// \brief      size_t variant of lzma_block_buffer_bound64()
///
/// \param      uncompressed_size   Size of the input data in bytes
///
/// \return     The value from lzma_block_buffer_bound64(), or 0 if that
///             value is zero or does not fit into size_t.
extern LZMA_API(size_t)
lzma_block_buffer_bound(size_t uncompressed_size)
{
	const uint64_t bound = lzma_block_buffer_bound64(uncompressed_size);

#if SIZE_MAX < UINT64_MAX
	// size_t is narrower than 64 bits here (e.g. 32-bit systems),
	// so the 64-bit result may not be representable.
	if (bound > SIZE_MAX)
		return 0;
#endif

	return bound;
}
/// \brief      Encode a Block Header followed by the input stored as
///             LZMA2 uncompressed chunks
///
/// The caller must have set block->compressed_size to
/// lzma2_bound(in_size) before calling this. block->filters is replaced
/// only temporarily and is restored before returning.
///
/// \param      block       Block options
/// \param      in          Input data
/// \param      in_size     Size of the input data in bytes
/// \param      out         Output buffer
/// \param      out_pos     Position in out[]; advanced past the written
///                         data on success
/// \param      out_size    Usable size of out[]
///
/// \return     - LZMA_OK: Header and data were written.
///             - LZMA_BUF_ERROR: Not enough output space.
///             - LZMA_PROG_ERROR: Calculating or encoding the Block
///               Header failed.
static lzma_ret
block_encode_uncompressed(lzma_block *block, const uint8_t *in, size_t in_size,
		uint8_t *out, size_t *out_pos, size_t out_size)
{
	// Uncompressed chunks wouldn't need a dictionary at all, but LZMA2
	// always requires one. Use the minimum size to minimize the memory
	// usage of the decoder.
	lzma_options_lzma lzma2 = {
		.dict_size = LZMA_DICT_SIZE_MIN,
	};

	lzma_filter filters[2] = {
		{ .id = LZMA_FILTER_LZMA2, .options = &lzma2 },
		{ .id = LZMA_VLI_UNKNOWN },
	};

	// Borrow *block for the LZMA2-only filter chain while the Block
	// Header is sized and encoded. The original chain is put back at
	// a single point below, whether or not this stage succeeds.
	lzma_filter *const saved_filters = block->filters;
	block->filters = filters;

	lzma_ret ret = LZMA_OK;

	if (lzma_block_header_size(block) != LZMA_OK) {
		ret = LZMA_PROG_ERROR;
	} else {
		// The caller has already stored the lzma2_bound() result
		// in block->compressed_size so it can be reused here.
		assert(block->compressed_size == lzma2_bound(in_size));

		// compressed_size is a known valid VLI and header_size is
		// a small value so their sum will never overflow.
		if (out_size - *out_pos
				< block->header_size + block->compressed_size)
			ret = LZMA_BUF_ERROR;
		else if (lzma_block_header_encode(block, out + *out_pos)
				!= LZMA_OK)
			ret = LZMA_PROG_ERROR;
	}

	block->filters = saved_filters;

	if (ret != LZMA_OK)
		return ret;

	*out_pos += block->header_size;

	// Emit the data as a sequence of LZMA2 uncompressed chunks.
	size_t in_pos = 0;

	while (in_pos < in_size) {
		// At most LZMA2_CHUNK_MAX bytes fit in one chunk.
		const size_t chunk_size
				= my_min(in_size - in_pos, LZMA2_CHUNK_MAX);

		// Control byte: uncompressed chunk. Only the very first
		// chunk (0x01) resets the dictionary; the rest use 0x02.
		out[(*out_pos)++] = in_pos == 0 ? 0x01 : 0x02;

		// Chunk size minus one as a 16-bit big endian integer.
		out[(*out_pos)++] = (uint8_t)((chunk_size - 1) >> 8);
		out[(*out_pos)++] = (uint8_t)((chunk_size - 1) & 0xFF);

		// The payload itself.
		assert(*out_pos + chunk_size <= out_size);
		memcpy(out + *out_pos, in + in_pos, chunk_size);

		in_pos += chunk_size;
		*out_pos += chunk_size;
	}

	// End marker
	out[(*out_pos)++] = 0x00;
	assert(*out_pos <= out_size);

	return LZMA_OK;
}
/// \brief      Encode a Block using the filter chain in block->filters
///
/// The output is limited to block->compressed_size bytes of compressed
/// data, so encoding stops if the result would grow bigger than that.
/// On any failure *out_pos is restored to the value it had on entry.
///
/// \param      block       Block options; compressed_size is updated to
///                         the real compressed size on success
/// \param      allocator   Allocator passed to the raw encoder
/// \param      in          Input data
/// \param      in_size     Size of the input data in bytes
/// \param      out         Output buffer
/// \param      out_pos     Position in out[]
/// \param      out_size    Usable size of out[]
///
/// \return     - LZMA_OK: Block Header and compressed data were written.
///             - LZMA_BUF_ERROR: The output space or the size limit was
///               reached.
///             - LZMA_PROG_ERROR: Encoding the Block Header failed.
///             - Other error codes are passed through from
///               lzma_block_header_size(), lzma_raw_encoder_init(), and
///               the raw encoder.
static lzma_ret
block_encode_normal(lzma_block *block, const lzma_allocator *allocator,
		const uint8_t *in, size_t in_size,
		uint8_t *out, size_t *out_pos, size_t out_size)
{
	// The Block Header size is needed to know where the data starts.
	return_if_error(lzma_block_header_size(block));

	// There must be room for the header plus at least one more byte.
	if (out_size - *out_pos <= block->header_size)
		return LZMA_BUF_ERROR;

	// Leave a gap for the Block Header; it is written last, once the
	// compressed size is known.
	const size_t header_pos = *out_pos;
	const size_t data_pos = header_pos + block->header_size;
	*out_pos = data_pos;

	// Shrink the usable output so that the encoder stops as soon as
	// the result would be bigger than the uncompressed Block.
	if (out_size - data_pos > block->compressed_size)
		out_size = data_pos + block->compressed_size;

	// TODO: In many common cases this could be optimized to use
	// significantly less memory.
	lzma_next_coder raw_encoder = LZMA_NEXT_CODER_INIT;
	lzma_ret ret = lzma_raw_encoder_init(
			&raw_encoder, allocator, block->filters);

	if (ret == LZMA_OK) {
		size_t in_pos = 0;
		ret = raw_encoder.code(raw_encoder.coder, allocator,
				in, &in_pos, in_size, out, out_pos, out_size,
				LZMA_FINISH);
	}

	// NOTE: This needs to be run even if lzma_raw_encoder_init() failed.
	lzma_next_end(&raw_encoder, allocator);

	if (ret == LZMA_OK) {
		// The encoder ran out of output space before finishing.
		ret = LZMA_BUF_ERROR;

	} else if (ret == LZMA_STREAM_END) {
		// Everything was compressed. Record the real compressed
		// size and fill in the Block Header in the reserved gap.
		block->compressed_size = *out_pos - data_pos;

		ret = lzma_block_header_encode(block, out + header_pos);
		if (ret != LZMA_OK)
			ret = LZMA_PROG_ERROR;
	}

	// Discard partial output if something went wrong.
	if (ret != LZMA_OK)
		*out_pos = header_pos;

	return ret;
}
/// \brief      Common implementation of the single-call Block encoders
///
/// Validates the arguments, reserves output space for Block Padding and
/// the Check field, encodes the Block Header and the data, and finally
/// writes Block Padding and the Check.
///
/// \param      block       Block options. block->uncompressed_size and
///                         block->compressed_size are set here, and
///                         block->raw_check is filled in when a Check
///                         is used.
/// \param      allocator   Allocator for the normal encoder; not used
///                         when try_to_compress is false
/// \param      in          Input data; may be NULL only if in_size is 0
/// \param      in_size     Size of the input data in bytes
/// \param      out         Output buffer
/// \param      out_pos     Position in out[]; advanced past the encoded
///                         Block on success
/// \param      out_size    Size of out[]
/// \param      try_to_compress
///                         If true, try block->filters first and fall
///                         back to LZMA2 uncompressed chunks only when
///                         that returns LZMA_BUF_ERROR. If false, always
///                         use uncompressed chunks.
///
/// \return     - LZMA_OK: The Block was encoded.
///             - LZMA_PROG_ERROR: Invalid arguments.
///             - LZMA_OPTIONS_ERROR: block->version is not supported.
///             - LZMA_UNSUPPORTED_CHECK: block->check is not supported.
///             - LZMA_BUF_ERROR: Not enough output space.
///             - LZMA_DATA_ERROR: in_size is too big for a Block.
///             - Other error codes from the encoding helpers.
static lzma_ret
block_buffer_encode(lzma_block *block, const lzma_allocator *allocator,
		const uint8_t *in, size_t in_size,
		uint8_t *out, size_t *out_pos, size_t out_size,
		bool try_to_compress)
{
	// Validate the arguments.
	if (block == NULL || (in == NULL && in_size != 0) || out == NULL
			|| out_pos == NULL || *out_pos > out_size)
		return LZMA_PROG_ERROR;

	// The contents of the structure may depend on the version so
	// check the version before validating the contents of *block.
	if (block->version > 1)
		return LZMA_OPTIONS_ERROR;

	if ((unsigned int)(block->check) > LZMA_CHECK_ID_MAX
			|| (try_to_compress && block->filters == NULL))
		return LZMA_PROG_ERROR;

	if (!lzma_check_is_supported(block->check))
		return LZMA_UNSUPPORTED_CHECK;

	// Size of a Block has to be a multiple of four, so limit the size
	// here already. This way we don't need to check it again when adding
	// Block Padding.
	out_size -= (out_size - *out_pos) & 3;

	// Get the size of the Check field.
	const size_t check_size = lzma_check_size(block->check);
	assert(check_size != UINT32_MAX);

	// Reserve space for the Check field.
	if (out_size - *out_pos <= check_size)
		return LZMA_BUF_ERROR;

	out_size -= check_size;

	// Initialize block->uncompressed_size and calculate the worst-case
	// value for block->compressed_size.
	block->uncompressed_size = in_size;
	block->compressed_size = lzma2_bound(in_size);
	if (block->compressed_size == 0)
		return LZMA_DATA_ERROR;

	// Do the actual compression. Starting from LZMA_BUF_ERROR makes
	// the !try_to_compress case go straight to the uncompressed path.
	lzma_ret ret = LZMA_BUF_ERROR;
	if (try_to_compress)
		ret = block_encode_normal(block, allocator,
				in, in_size, out, out_pos, out_size);

	if (ret != LZMA_OK) {
		// If the error was something else than output buffer
		// becoming full, return the error now.
		if (ret != LZMA_BUF_ERROR)
			return ret;

		// The data was incompressible (at least with the options
		// given to us) or the output buffer was too small. Use the
		// uncompressed chunks of LZMA2 to wrap the data into a valid
		// Block. If we haven't been given enough output space, even
		// this may fail.
		return_if_error(block_encode_uncompressed(block, in, in_size,
				out, out_pos, out_size));
	}

	assert(*out_pos <= out_size);

	// Block Padding. No buffer overflow here, because we already adjusted
	// out_size so that (out_size - out_start) is a multiple of four.
	// Thus, if the buffer is full, the loop body can never run.
	for (size_t i = (size_t)(block->compressed_size); i & 3; ++i) {
		assert(*out_pos < out_size);
		out[(*out_pos)++] = 0x00;
	}

	// If there's no Check field, we are done now.
	if (check_size > 0) {
		// Calculate the integrity check. We reserved space for
		// the Check field earlier so we don't need to check for
		// available output space here.
		lzma_check_state check;
		lzma_check_init(&check, block->check);
		lzma_check_update(&check, block->check, in, in_size);
		lzma_check_finish(&check, block->check);

		memcpy(block->raw_check, check.buffer.u8, check_size);
		memcpy(out + *out_pos, check.buffer.u8, check_size);
		*out_pos += check_size;
	}

	return LZMA_OK;
}
/// \brief      Single-call .xz Block encoder
///
/// Thin public wrapper around block_buffer_encode() that asks it to try
/// the filter chain in block->filters before falling back to LZMA2
/// uncompressed chunks.
///
/// \return     Whatever block_buffer_encode() returns.
extern LZMA_API(lzma_ret)
lzma_block_buffer_encode(lzma_block *block, const lzma_allocator *allocator,
		const uint8_t *in, size_t in_size,
		uint8_t *out, size_t *out_pos, size_t out_size)
{
	const bool try_to_compress = true;

	return block_buffer_encode(block, allocator, in, in_size,
			out, out_pos, out_size, try_to_compress);
}
liblzma: Vaccinate against an ill patch from RHEL/CentOS 7. RHEL/CentOS 7 shipped with 5.1.2alpha, including the threaded encoder that is behind #ifdef LZMA_UNSTABLE in the API headers. In 5.1.2alpha these symbols are under XZ_5.1.2alpha in liblzma.map. API/ABI compatibility tracking isn't done between development releases so newer releases didn't have XZ_5.1.2alpha anymore. Later RHEL/CentOS 7 updated xz to 5.2.2 but they wanted to keep the exported symbols compatible with 5.1.2alpha. After checking the ABI changes it turned out that >= 5.2.0 ABI is backward compatible with the threaded encoder functions from 5.1.2alpha (but not vice versa as fixes and extensions to these functions were made between 5.1.2alpha and 5.2.0). In RHEL/CentOS 7, XZ Utils 5.2.2 was patched with xz-5.2.2-compat-libs.patch to modify liblzma.map: - XZ_5.1.2alpha was added with lzma_stream_encoder_mt and lzma_stream_encoder_mt_memusage. This matched XZ Utils 5.1.2alpha. - XZ_5.2 was replaced with XZ_5.2.2. It is clear that this was an error; the intention was to keep using XZ_5.2 (XZ_5.2.2 has never been used in XZ Utils). So XZ_5.2.2 lists all symbols that were listed under XZ_5.2 before the patch. lzma_stream_encoder_mt and _mt_memusage are included too so they are listed both here and under XZ_5.1.2alpha. The patch didn't add any __asm__(".symver ...") lines to the .c files. Thus the resulting liblzma.so exports the threaded encoder functions under XZ_5.1.2alpha only. Listing the two functions also under XZ_5.2.2 in liblzma.map has no effect without matching .symver lines. The lack of XZ_5.2 in RHEL/CentOS 7 means that binaries linked against unpatched XZ Utils 5.2.x won't run on RHEL/CentOS 7. This is unfortunate but this alone isn't too bad as the problem is contained within RHEL/CentOS 7 and doesn't affect users of other distributions. It could also be fixed internally in RHEL/CentOS 7. 
The second problem is more serious: In XZ Utils 5.2.2 the API headers don't have #ifdef LZMA_UNSTABLE for obvious reasons. This is true in RHEL/CentOS 7 version too. Thus now programs using new APIs can be compiled without an extra #define. However, the programs end up depending on symbol version XZ_5.1.2alpha (and possibly also XZ_5.2.2) instead of XZ_5.2 as they would with an unpatched XZ Utils 5.2.2. This means that such binaries won't run on other distributions shipping XZ Utils >= 5.2.0 as they don't provide XZ_5.1.2alpha or XZ_5.2.2; they only provide XZ_5.2 (and XZ_5.0). (This includes RHEL/CentOS 8 as the patch luckily isn't included there anymore with XZ Utils 5.2.4.) Binaries built by RHEL/CentOS 7 users get distributed and then people wonder why they don't run on some other distribution. Seems that people have found out about the patch and been copying it to some build scripts, seemingly curing the symptoms but actually spreading the illness further and outside RHEL/CentOS 7. The ill patch seems to be from late 2016 (RHEL 7.3) and in 2017 it had spread at least to EasyBuild. I heard about the events only recently. :-( This commit splits liblzma.map into two versions: one for GNU/Linux and another for other OSes that can use symbol versioning (FreeBSD, Solaris, maybe others). The Linux-specific file and the matching additions to .c files add full compatibility with binaries that have been built against a RHEL/CentOS-patched liblzma. Builds for OSes other than GNU/Linux won't get the vaccine as they should be immune to the problem (I really hope that no build script uses the RHEL/CentOS 7 patch outside GNU/Linux). The RHEL/CentOS compatibility symbols XZ_5.1.2alpha and XZ_5.2.2 are intentionally put *after* XZ_5.2 in liblzma_linux.map. This way if one forgets to #define HAVE_SYMBOL_VERSIONS_LINUX when building, the resulting liblzma.so.5 will have lzma_stream_encoder_mt@@XZ_5.2 since XZ_5.2 {...} is the first one that lists that function. 
Without HAVE_SYMBOL_VERSIONS_LINUX @XZ_5.1.2alpha and @XZ_5.2.2 will be missing but that's still a minor problem compared to only having lzma_stream_encoder_mt@@XZ_5.1.2alpha! The "local: *;" line was moved to XZ_5.0 so that it doesn't need to be moved around. It doesn't matter where it is put. Having two similar liblzma_*.map files is a bit silly as it is, at least for now, easily possible to generate the generic one from the Linux-specific file. But that adds extra steps and increases the risk of mistakes when supporting more than one build system. So I rather maintain two files in parallel and let validate_map.sh check that they are in sync when "make mydist" is run. This adds .symver lines for lzma_stream_encoder_mt@XZ_5.2.2 and lzma_stream_encoder_mt_memusage@XZ_5.2.2 even though these weren't exported by RHEL/CentOS 7 (only @@XZ_5.1.2alpha was for these two). I added these anyway because someone might misunderstand the RHEL/CentOS 7 patch and think that @XZ_5.2.2 (@@XZ_5.2.2) versions were exported too. At glance one could suggest using __typeof__ to copy the function prototypes when making aliases. However, this doesn't work trivially because __typeof__ won't copy attributes (lzma_nothrow, lzma_pure) and it won't change symbol visibility from hidden to default (done by LZMA_API()). Attributes could be copied with __copy__ attribute but that needs GCC 9 and a fallback method would be needed anyway. This uses __symver__ attribute with GCC >= 10 and __asm__(".symver ...") with everything else. The attribute method is required for LTO (-flto) support with GCC. Using -flto with GCC older than 10 is now broken on GNU/Linux and will not be fixed (can silently result in a broken liblzma build that has dangerously incorrect symbol versions). LTO builds with Clang seem to work with the traditional __asm__(".symver ...") method. Thanks to Boud Roukema for reporting the problem and discussing the details and testing the fix.
2022-09-04 20:23:00 +00:00
#ifdef HAVE_SYMBOL_VERSIONS_LINUX
// This is for compatibility with binaries linked against liblzma that
// has been patched with xz-5.2.2-compat-libs.patch from RHEL/CentOS 7.
//
// Two versioned names are declared for the same function:
//   - lzma_block_uncomp_encode@XZ_5.2.2 is the compatibility symbol. It is
//     declared as an alias of lzma_block_uncomp_encode_52 and thus has no
//     body of its own.
//   - lzma_block_uncomp_encode@@XZ_5.2 is bound to
//     lzma_block_uncomp_encode_52, which is the function defined right
//     after this #ifdef block.
//
// NOTE(review): How LZMA_SYMVER_API attaches the version string to the
// declared function is defined outside this file; confirm there.
LZMA_SYMVER_API("lzma_block_uncomp_encode@XZ_5.2.2",
lzma_ret, lzma_block_uncomp_encode_522)(lzma_block *block,
const uint8_t *in, size_t in_size,
uint8_t *out, size_t *out_pos, size_t out_size)
lzma_nothrow lzma_attr_warn_unused_result
__attribute__((__alias__("lzma_block_uncomp_encode_52")));
LZMA_SYMVER_API("lzma_block_uncomp_encode@@XZ_5.2",
lzma_ret, lzma_block_uncomp_encode_52)(lzma_block *block,
const uint8_t *in, size_t in_size,
uint8_t *out, size_t *out_pos, size_t out_size)
lzma_nothrow lzma_attr_warn_unused_result;
// Rename the definition that follows so that its body is emitted under
// the name lzma_block_uncomp_encode_52 declared above.
#define lzma_block_uncomp_encode lzma_block_uncomp_encode_52
#endif
/// \brief      Single-call .xz Block encoder that never compresses
///
/// Thin public wrapper around block_buffer_encode() that makes it store
/// the data as LZMA2 uncompressed chunks without trying block->filters.
///
/// \return     Whatever block_buffer_encode() returns.
extern LZMA_API(lzma_ret)
lzma_block_uncomp_encode(lzma_block *block,
		const uint8_t *in, size_t in_size,
		uint8_t *out, size_t *out_pos, size_t out_size)
{
	const bool try_to_compress = false;

	// The uncompressed path won't allocate any memory from heap,
	// so no lzma_allocator is needed and NULL is passed instead.
	return block_buffer_encode(block, NULL, in, in_size,
			out, out_pos, out_size, try_to_compress);
}