From da1718f266fcfc091e7bf08aae1bc986d0e6cc6b Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Sun, 25 May 2014 21:45:56 +0300 Subject: [PATCH] liblzma: Use lzma_alloc_zero() in LZ encoder initialization. This avoids a memzero() call for a newly-allocated memory, which can be expensive when encoding small streams with an over-sized dictionary. To avoid using lzma_alloc_zero() for memory that doesn't need to be zeroed, lzma_mf.son is now allocated separately, which requires handling it separately in normalize() too. Thanks to Vincenzo Innocente for reporting the problem. --- src/liblzma/lz/lz_encoder.c | 86 ++++++++++++++++++---------------- src/liblzma/lz/lz_encoder.h | 2 +- src/liblzma/lz/lz_encoder_mf.c | 33 +++++++------ 3 files changed, 64 insertions(+), 57 deletions(-) diff --git a/src/liblzma/lz/lz_encoder.c b/src/liblzma/lz/lz_encoder.c index 1f9ecfd4..76954e4d 100644 --- a/src/liblzma/lz/lz_encoder.c +++ b/src/liblzma/lz/lz_encoder.c @@ -326,25 +326,22 @@ lz_encoder_prepare(lzma_mf *mf, const lzma_allocator *allocator, hs += HASH_4_SIZE; */ - // If the above code calculating hs is modified, make sure that - // this assertion stays valid (UINT32_MAX / 5 is not strictly the - // exact limit). If it doesn't, you need to calculate that - // hash_size_sum + sons_count cannot overflow. - assert(hs < UINT32_MAX / 5); - - const uint32_t old_count = mf->hash_size_sum + mf->sons_count; - mf->hash_size_sum = hs; + const uint32_t old_hash_count = mf->hash_count; + const uint32_t old_sons_count = mf->sons_count; + mf->hash_count = hs; mf->sons_count = mf->cyclic_size; if (is_bt) mf->sons_count *= 2; - const uint32_t new_count = mf->hash_size_sum + mf->sons_count; - // Deallocate the old hash array if it exists and has different size // than what is needed now. - if (old_count != new_count) { + if (old_hash_count != mf->hash_count + || old_sons_count != mf->sons_count) { lzma_free(mf->hash, allocator); mf->hash = NULL; + + lzma_free(mf->son, allocator); + mf->son = NULL; } // Maximum number of match finder cycles @@ -382,44 +379,49 @@ lz_encoder_init(lzma_mf *mf, const lzma_allocator *allocator, mf->write_pos = 0; mf->pending = 0; - // Allocate match finder's hash array. - const size_t alloc_count = mf->hash_size_sum + mf->sons_count; - #if UINT32_MAX >= SIZE_MAX / 4 // Check for integer overflow. (Huge dictionaries are not // possible on 32-bit CPU.) - if (alloc_count > SIZE_MAX / sizeof(uint32_t)) + if (mf->hash_count > SIZE_MAX / sizeof(uint32_t) + || mf->sons_count > SIZE_MAX / sizeof(uint32_t)) return true; #endif + // Allocate and initialize the hash table. Since EMPTY_HASH_VALUE + // is zero, we can use lzma_alloc_zero() or memzero() for mf->hash. + // + // We don't need to initialize mf->son, but not doing that may + // make Valgrind complain in normalization (see normalize() in + // lz_encoder_mf.c). Skipping the initialization is *very* good + // when big dictionary is used but only small amount of data gets + // actually compressed: most of the mf->son won't get actually + // allocated by the kernel, so we avoid wasting RAM and improve + // initialization speed a lot. if (mf->hash == NULL) { - mf->hash = lzma_alloc(alloc_count * sizeof(uint32_t), + mf->hash = lzma_alloc_zero(mf->hash_count * sizeof(uint32_t), allocator); - if (mf->hash == NULL) + mf->son = lzma_alloc(mf->sons_count * sizeof(uint32_t), + allocator); + + if (mf->hash == NULL || mf->son == NULL) { + lzma_free(mf->hash, allocator); + mf->hash = NULL; + + lzma_free(mf->son, allocator); + mf->son = NULL; + return true; + } + } else { +/* + for (uint32_t i = 0; i < mf->hash_count; ++i) + mf->hash[i] = EMPTY_HASH_VALUE; +*/ + memzero(mf->hash, mf->hash_count * sizeof(uint32_t)); } - mf->son = mf->hash + mf->hash_size_sum; mf->cyclic_pos = 0; - // Initialize the hash table. Since EMPTY_HASH_VALUE is zero, we - // can use memset(). -/* - for (uint32_t i = 0; i < hash_size_sum; ++i) - mf->hash[i] = EMPTY_HASH_VALUE; -*/ - memzero(mf->hash, (size_t)(mf->hash_size_sum) * sizeof(uint32_t)); - - // We don't need to initialize mf->son, but not doing that will - // make Valgrind complain in normalization (see normalize() in - // lz_encoder_mf.c). - // - // Skipping this initialization is *very* good when big dictionary is - // used but only small amount of data gets actually compressed: most - // of the mf->hash won't get actually allocated by the kernel, so - // we avoid wasting RAM and improve initialization speed a lot. - //memzero(mf->son, (size_t)(mf->sons_count) * sizeof(uint32_t)); - // Handle preset dictionary. if (lz_options->preset_dict != NULL && lz_options->preset_dict_size > 0) { @@ -446,7 +448,8 @@ lzma_lz_encoder_memusage(const lzma_lz_options *lz_options) lzma_mf mf = { .buffer = NULL, .hash = NULL, - .hash_size_sum = 0, + .son = NULL, + .hash_count = 0, .sons_count = 0, }; @@ -455,9 +458,8 @@ lzma_lz_encoder_memusage(const lzma_lz_options *lz_options) return UINT64_MAX; // Calculate the memory usage. - return (uint64_t)(mf.hash_size_sum + mf.sons_count) - * sizeof(uint32_t) - + (uint64_t)(mf.size) + sizeof(lzma_coder); + return ((uint64_t)(mf.hash_count) + mf.sons_count) * sizeof(uint32_t) + + mf.size + sizeof(lzma_coder); } @@ -466,6 +468,7 @@ lz_encoder_end(lzma_coder *coder, const lzma_allocator *allocator) { lzma_next_end(&coder->next, allocator); + lzma_free(coder->mf.son, allocator); lzma_free(coder->mf.hash, allocator); lzma_free(coder->mf.buffer, allocator); @@ -523,7 +526,8 @@ lzma_lz_encoder_init(lzma_next_coder *next, const lzma_allocator *allocator, next->coder->mf.buffer = NULL; next->coder->mf.hash = NULL; - next->coder->mf.hash_size_sum = 0; + next->coder->mf.son = NULL; + next->coder->mf.hash_count = 0; next->coder->mf.sons_count = 0; next->coder->next = LZMA_NEXT_CODER_INIT; diff --git a/src/liblzma/lz/lz_encoder.h b/src/liblzma/lz/lz_encoder.h index d11d4a95..dad9c6b2 100644 --- a/src/liblzma/lz/lz_encoder.h +++ b/src/liblzma/lz/lz_encoder.h @@ -119,7 +119,7 @@ struct lzma_mf_s { lzma_action action; /// Number of elements in hash[] - uint32_t hash_size_sum; + uint32_t hash_count; /// Number of elements in son[] uint32_t sons_count; diff --git a/src/liblzma/lz/lz_encoder_mf.c b/src/liblzma/lz/lz_encoder_mf.c index f82a1c1d..bf787f45 100644 --- a/src/liblzma/lz/lz_encoder_mf.c +++ b/src/liblzma/lz/lz_encoder_mf.c @@ -116,24 +116,27 @@ normalize(lzma_mf *mf) = (MUST_NORMALIZE_POS - mf->cyclic_size); // & (~(UINT32_C(1) << 10) - 1); - const uint32_t count = mf->hash_size_sum + mf->sons_count; - uint32_t *hash = mf->hash; - - for (uint32_t i = 0; i < count; ++i) { + for (uint32_t i = 0; i < mf->hash_count; ++i) { // If the distance is greater than the dictionary size, // we can simply mark the hash element as empty. - // - // NOTE: Only the first mf->hash_size_sum elements are - // initialized for sure. There may be uninitialized elements - // in mf->son. Since we go through both mf->hash and - // mf->son here in normalization, Valgrind may complain - // that the "if" below depends on uninitialized value. In - // this case it is safe to ignore the warning. See also the - // comments in lz_encoder_init() in lz_encoder.c. - if (hash[i] <= subvalue) - hash[i] = EMPTY_HASH_VALUE; + if (mf->hash[i] <= subvalue) + mf->hash[i] = EMPTY_HASH_VALUE; else - hash[i] -= subvalue; + mf->hash[i] -= subvalue; + } + + for (uint32_t i = 0; i < mf->sons_count; ++i) { + // Do the same for mf->son. + // + // NOTE: There may be uninitialized elements in mf->son. + // Valgrind may complain that the "if" below depends on + // an uninitialized value. In this case it is safe to ignore + // the warning. See also the comments in lz_encoder_init() + // in lz_encoder.c. + if (mf->son[i] <= subvalue) + mf->son[i] = EMPTY_HASH_VALUE; + else + mf->son[i] -= subvalue; } // Update offset to match the new locations.