From da1718f266fcfc091e7bf08aae1bc986d0e6cc6b Mon Sep 17 00:00:00 2001
From: Lasse Collin <lasse.collin@tukaani.org>
Date: Sun, 25 May 2014 21:45:56 +0300
Subject: [PATCH] liblzma: Use lzma_alloc_zero() in LZ encoder initialization.

This avoids a memzero() call for a newly-allocated memory,
which can be expensive when encoding small streams with
an over-sized dictionary.

To avoid using lzma_alloc_zero() for memory that doesn't
need to be zeroed, lzma_mf.son is now allocated separately,
which requires handling it separately in normalize() too.

Thanks to Vincenzo Innocente for reporting the problem.
---
 src/liblzma/lz/lz_encoder.c    | 86 ++++++++++++++++++----------------
 src/liblzma/lz/lz_encoder.h    |  2 +-
 src/liblzma/lz/lz_encoder_mf.c | 33 +++++++------
 3 files changed, 64 insertions(+), 57 deletions(-)

diff --git a/src/liblzma/lz/lz_encoder.c b/src/liblzma/lz/lz_encoder.c
index 1f9ecfd4..76954e4d 100644
--- a/src/liblzma/lz/lz_encoder.c
+++ b/src/liblzma/lz/lz_encoder.c
@@ -326,25 +326,22 @@ lz_encoder_prepare(lzma_mf *mf, const lzma_allocator *allocator,
 		hs += HASH_4_SIZE;
 */
 
-	// If the above code calculating hs is modified, make sure that
-	// this assertion stays valid (UINT32_MAX / 5 is not strictly the
-	// exact limit). If it doesn't, you need to calculate that
-	// hash_size_sum + sons_count cannot overflow.
-	assert(hs < UINT32_MAX / 5);
-
-	const uint32_t old_count = mf->hash_size_sum + mf->sons_count;
-	mf->hash_size_sum = hs;
+	const uint32_t old_hash_count = mf->hash_count;
+	const uint32_t old_sons_count = mf->sons_count;
+	mf->hash_count = hs;
 	mf->sons_count = mf->cyclic_size;
 	if (is_bt)
 		mf->sons_count *= 2;
 
-	const uint32_t new_count = mf->hash_size_sum + mf->sons_count;
-
 	// Deallocate the old hash array if it exists and has different size
 	// than what is needed now.
-	if (old_count != new_count) {
+	if (old_hash_count != mf->hash_count
+			|| old_sons_count != mf->sons_count) {
 		lzma_free(mf->hash, allocator);
 		mf->hash = NULL;
+
+		lzma_free(mf->son, allocator);
+		mf->son = NULL;
 	}
 
 	// Maximum number of match finder cycles
@@ -382,44 +379,49 @@ lz_encoder_init(lzma_mf *mf, const lzma_allocator *allocator,
 	mf->write_pos = 0;
 	mf->pending = 0;
 
-	// Allocate match finder's hash array.
-	const size_t alloc_count = mf->hash_size_sum + mf->sons_count;
-
 #if UINT32_MAX >= SIZE_MAX / 4
 	// Check for integer overflow. (Huge dictionaries are not
 	// possible on 32-bit CPU.)
-	if (alloc_count > SIZE_MAX / sizeof(uint32_t))
+	if (mf->hash_count > SIZE_MAX / sizeof(uint32_t)
+			|| mf->sons_count > SIZE_MAX / sizeof(uint32_t))
 		return true;
 #endif
 
+	// Allocate and initialize the hash table. Since EMPTY_HASH_VALUE
+	// is zero, we can use lzma_alloc_zero() or memzero() for mf->hash.
+	//
+	// We don't need to initialize mf->son, but not doing that may
+	// make Valgrind complain in normalization (see normalize() in
+	// lz_encoder_mf.c). Skipping the initialization is *very* good
+	// when big dictionary is used but only small amount of data gets
+	// actually compressed: most of the mf->son won't get actually
+	// allocated by the kernel, so we avoid wasting RAM and improve
+	// initialization speed a lot.
 	if (mf->hash == NULL) {
-		mf->hash = lzma_alloc(alloc_count * sizeof(uint32_t),
+		mf->hash = lzma_alloc_zero(mf->hash_count * sizeof(uint32_t),
 				allocator);
-		if (mf->hash == NULL)
+		mf->son = lzma_alloc(mf->sons_count * sizeof(uint32_t),
+				allocator);
+
+		if (mf->hash == NULL || mf->son == NULL) {
+			lzma_free(mf->hash, allocator);
+			mf->hash = NULL;
+
+			lzma_free(mf->son, allocator);
+			mf->son = NULL;
+
 			return true;
+		}
+	} else {
+/*
+		for (uint32_t i = 0; i < mf->hash_count; ++i)
+			mf->hash[i] = EMPTY_HASH_VALUE;
+*/
+		memzero(mf->hash, mf->hash_count * sizeof(uint32_t));
 	}
 
-	mf->son = mf->hash + mf->hash_size_sum;
 	mf->cyclic_pos = 0;
 
-	// Initialize the hash table. Since EMPTY_HASH_VALUE is zero, we
-	// can use memset().
-/*
-	for (uint32_t i = 0; i < hash_size_sum; ++i)
-		mf->hash[i] = EMPTY_HASH_VALUE;
-*/
-	memzero(mf->hash, (size_t)(mf->hash_size_sum) * sizeof(uint32_t));
-
-	// We don't need to initialize mf->son, but not doing that will
-	// make Valgrind complain in normalization (see normalize() in
-	// lz_encoder_mf.c).
-	//
-	// Skipping this initialization is *very* good when big dictionary is
-	// used but only small amount of data gets actually compressed: most
-	// of the mf->hash won't get actually allocated by the kernel, so
-	// we avoid wasting RAM and improve initialization speed a lot.
-	//memzero(mf->son, (size_t)(mf->sons_count) * sizeof(uint32_t));
-
 	// Handle preset dictionary.
 	if (lz_options->preset_dict != NULL
 			&& lz_options->preset_dict_size > 0) {
@@ -446,7 +448,8 @@ lzma_lz_encoder_memusage(const lzma_lz_options *lz_options)
 	lzma_mf mf = {
 		.buffer = NULL,
 		.hash = NULL,
-		.hash_size_sum = 0,
+		.son = NULL,
+		.hash_count = 0,
 		.sons_count = 0,
 	};
 
@@ -455,9 +458,8 @@ lzma_lz_encoder_memusage(const lzma_lz_options *lz_options)
 		return UINT64_MAX;
 
 	// Calculate the memory usage.
-	return (uint64_t)(mf.hash_size_sum + mf.sons_count)
-				* sizeof(uint32_t)
-			+ (uint64_t)(mf.size) + sizeof(lzma_coder);
+	return ((uint64_t)(mf.hash_count) + mf.sons_count) * sizeof(uint32_t)
+			+ mf.size + sizeof(lzma_coder);
 }
 
 
@@ -466,6 +468,7 @@ lz_encoder_end(lzma_coder *coder, const lzma_allocator *allocator)
 {
 	lzma_next_end(&coder->next, allocator);
 
+	lzma_free(coder->mf.son, allocator);
 	lzma_free(coder->mf.hash, allocator);
 	lzma_free(coder->mf.buffer, allocator);
 
@@ -523,7 +526,8 @@ lzma_lz_encoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
 
 		next->coder->mf.buffer = NULL;
 		next->coder->mf.hash = NULL;
-		next->coder->mf.hash_size_sum = 0;
+		next->coder->mf.son = NULL;
+		next->coder->mf.hash_count = 0;
 		next->coder->mf.sons_count = 0;
 
 		next->coder->next = LZMA_NEXT_CODER_INIT;
diff --git a/src/liblzma/lz/lz_encoder.h b/src/liblzma/lz/lz_encoder.h
index d11d4a95..dad9c6b2 100644
--- a/src/liblzma/lz/lz_encoder.h
+++ b/src/liblzma/lz/lz_encoder.h
@@ -119,7 +119,7 @@ struct lzma_mf_s {
 	lzma_action action;
 
 	/// Number of elements in hash[]
-	uint32_t hash_size_sum;
+	uint32_t hash_count;
 
 	/// Number of elements in son[]
 	uint32_t sons_count;
diff --git a/src/liblzma/lz/lz_encoder_mf.c b/src/liblzma/lz/lz_encoder_mf.c
index f82a1c1d..bf787f45 100644
--- a/src/liblzma/lz/lz_encoder_mf.c
+++ b/src/liblzma/lz/lz_encoder_mf.c
@@ -116,24 +116,27 @@ normalize(lzma_mf *mf)
 			= (MUST_NORMALIZE_POS - mf->cyclic_size);
 				// & (~(UINT32_C(1) << 10) - 1);
 
-	const uint32_t count = mf->hash_size_sum + mf->sons_count;
-	uint32_t *hash = mf->hash;
-
-	for (uint32_t i = 0; i < count; ++i) {
+	for (uint32_t i = 0; i < mf->hash_count; ++i) {
 		// If the distance is greater than the dictionary size,
 		// we can simply mark the hash element as empty.
-		//
-		// NOTE: Only the first mf->hash_size_sum elements are
-		// initialized for sure. There may be uninitialized elements
-		// in mf->son. Since we go through both mf->hash and
-		// mf->son here in normalization, Valgrind may complain
-		// that the "if" below depends on uninitialized value. In
-		// this case it is safe to ignore the warning. See also the
-		// comments in lz_encoder_init() in lz_encoder.c.
-		if (hash[i] <= subvalue)
-			hash[i] = EMPTY_HASH_VALUE;
+		if (mf->hash[i] <= subvalue)
+			mf->hash[i] = EMPTY_HASH_VALUE;
 		else
-			hash[i] -= subvalue;
+			mf->hash[i] -= subvalue;
+	}
+
+	for (uint32_t i = 0; i < mf->sons_count; ++i) {
+		// Do the same for mf->son.
+		//
+		// NOTE: There may be uninitialized elements in mf->son.
+		// Valgrind may complain that the "if" below depends on
+		// an uninitialized value. In this case it is safe to ignore
+		// the warning. See also the comments in lz_encoder_init()
+		// in lz_encoder.c.
+		if (mf->son[i] <= subvalue)
+			mf->son[i] = EMPTY_HASH_VALUE;
+		else
+			mf->son[i] -= subvalue;
 	}
 
 	// Update offset to match the new locations.