xz/src/liblzma/common/index.c

///////////////////////////////////////////////////////////////////////////////
//
/// \file       index.c
/// \brief      Handling of Index
//
//  Author:     Lasse Collin
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#include "index.h"


/// Number of Records to allocate at once in the unrolled list.
#define INDEX_GROUP_SIZE 256


typedef struct lzma_index_group_s lzma_index_group;
struct lzma_index_group_s {
	/// Previous group
	lzma_index_group *prev;

	/// Next group
	lzma_index_group *next;

	/// Index of the last Record in this group
	size_t last;

	/// Unpadded Size fields as special cumulative sum relative to the
	/// beginning of the group. It's special in sense that the previous
	/// value is rounded up the next multiple of four with before
	/// calculating the new value. The total encoded size of the Blocks
	/// in the group is unpadded_sums[last] rounded up to the next
	/// multiple of four.
	///
	/// For example, if the Unpadded Sizes are 39, 57, and 81, the stored
	/// values are 39, 97 (40 + 57), and 181 (100 + 181). The total
	/// encoded size of these Blocks is 184.
	///
	/// This encoding is nice from point of view of lzma_index_locate().
	lzma_vli unpadded_sums[INDEX_GROUP_SIZE];

	/// Uncompressed Size fields as cumulative sum relative to the
	/// beginning of the group. The uncompressed size of the group is
	/// uncompressed_sums[last].
	lzma_vli uncompressed_sums[INDEX_GROUP_SIZE];

	/// True if the Record is padding
	bool paddings[INDEX_GROUP_SIZE];
};


struct lzma_index_s {
	/// Total size of the Blocks and padding
	lzma_vli total_size;

	/// Uncompressed size of the Stream
	lzma_vli uncompressed_size;

	/// Number of non-padding records. This is needed for Index encoder.
	lzma_vli count;

	/// Size of the List of Records field; this is updated every time
	/// a new non-padding Record is added.
	lzma_vli index_list_size;

	/// First group of Records
	lzma_index_group *head;

	/// Last group of Records
	lzma_index_group *tail;

	/// Tracking the read position
	struct {
		/// Group where the current read position is.
		lzma_index_group *group;

		/// The most recently read Record in *group
		size_t record;

		/// Uncompressed offset of the beginning of *group relative
		/// to the beginning of the Stream
		lzma_vli uncompressed_offset;

		/// Compressed offset of the beginning of *group relative
		/// to the beginning of the Stream
		lzma_vli stream_offset;
	} current;

	/// Information about earlier Indexes when multiple Indexes have
	/// been combined.
	struct {
		/// Sum of the Record counts of the all but the last Stream.
		lzma_vli count;

		/// Sum of the List of Records fields of all but the last
		/// Stream. This is needed when a new Index is concatenated
		/// to this lzma_index structure.
		lzma_vli index_list_size;

		/// Total size of all but the last Stream and all Stream
		/// Padding fields.
		lzma_vli streams_size;
	} old;
};


extern LZMA_API(lzma_vli)
lzma_index_memusage(lzma_vli count)
{
	if (count > LZMA_VLI_MAX)
		return UINT64_MAX;

	return sizeof(lzma_index) + (count + INDEX_GROUP_SIZE - 1)
			/ INDEX_GROUP_SIZE * sizeof(lzma_index_group);
}


static void
free_index_list(lzma_index *i, lzma_allocator *allocator)
{
	lzma_index_group *g = i->head;

	while (g != NULL) {
		lzma_index_group *tmp = g->next;
		lzma_free(g, allocator);
		g = tmp;
	}

	return;
}


extern LZMA_API(lzma_index *)
lzma_index_init(lzma_index *i, lzma_allocator *allocator)
{
	if (i == NULL) {
		i = lzma_alloc(sizeof(lzma_index), allocator);
		if (i == NULL)
			return NULL;
	} else {
		free_index_list(i, allocator);
	}

	i->total_size = 0;
	i->uncompressed_size = 0;
	i->count = 0;
	i->index_list_size = 0;
	i->head = NULL;
	i->tail = NULL;
	i->current.group = NULL;
	i->old.count = 0;
	i->old.index_list_size = 0;
	i->old.streams_size = 0;

	return i;
}


extern LZMA_API(void)
lzma_index_end(lzma_index *i, lzma_allocator *allocator)
{
	if (i != NULL) {
		free_index_list(i, allocator);
		lzma_free(i, allocator);
	}

	return;
}


extern LZMA_API(lzma_vli)
lzma_index_count(const lzma_index *i)
{
	return i->count;
}


extern LZMA_API(lzma_vli)
lzma_index_size(const lzma_index *i)
{
	return index_size(i->count, i->index_list_size);
}


extern LZMA_API(lzma_vli)
lzma_index_total_size(const lzma_index *i)
{
	return i->total_size;
}


extern LZMA_API(lzma_vli)
lzma_index_stream_size(const lzma_index *i)
{
	// Stream Header + Blocks + Index + Stream Footer
	return LZMA_STREAM_HEADER_SIZE + i->total_size
			+ index_size(i->count, i->index_list_size)
			+ LZMA_STREAM_HEADER_SIZE;
}


extern LZMA_API(lzma_vli)
lzma_index_file_size(const lzma_index *i)
{
	// If multiple Streams are concatenated, the Stream Header, Index,
	// and Stream Footer fields of all but the last Stream are already
	// included in old.streams_size. Thus, we need to calculate only the
	// size of the last Index, not all Indexes.
	return i->old.streams_size + LZMA_STREAM_HEADER_SIZE + i->total_size
			+ index_size(i->count - i->old.count,
				i->index_list_size - i->old.index_list_size)
			+ LZMA_STREAM_HEADER_SIZE;
}


extern LZMA_API(lzma_vli)
lzma_index_uncompressed_size(const lzma_index *i)
{
	return i->uncompressed_size;
}


extern uint32_t
lzma_index_padding_size(const lzma_index *i)
{
	return (LZMA_VLI_C(4)
		- index_size_unpadded(i->count, i->index_list_size)) & 3;
}


/// Appends a new Record to the Index. If needed, this allocates a new
/// Record group.
static lzma_ret
index_append_real(lzma_index *i, lzma_allocator *allocator,
		lzma_vli unpadded_size, lzma_vli uncompressed_size,
		bool is_padding)
{
	// Add the new record.
	if (i->tail == NULL || i->tail->last == INDEX_GROUP_SIZE - 1) {
		// Allocate a new group.
		lzma_index_group *g = lzma_alloc(sizeof(lzma_index_group),
				allocator);
		if (g == NULL)
			return LZMA_MEM_ERROR;

		// Initialize the group and set its first record.
		g->prev = i->tail;
		g->next = NULL;
		g->last = 0;
		g->unpadded_sums[0] = unpadded_size;
		g->uncompressed_sums[0] = uncompressed_size;
		g->paddings[0] = is_padding;

		// If this is the first group, make it the head.
		if (i->head == NULL)
			i->head = g;
		else
			i->tail->next = g;

		// Make it the new tail.
		i->tail = g;

	} else {
		// i->tail has space left for at least one record.
		i->tail->unpadded_sums[i->tail->last + 1]
				= unpadded_size + vli_ceil4(
					i->tail->unpadded_sums[i->tail->last]);
		i->tail->uncompressed_sums[i->tail->last + 1]
				= i->tail->uncompressed_sums[i->tail->last]
					+ uncompressed_size;
		i->tail->paddings[i->tail->last + 1] = is_padding;
		++i->tail->last;
	}

	return LZMA_OK;
}


extern LZMA_API(lzma_ret)
lzma_index_append(lzma_index *i, lzma_allocator *allocator,
		lzma_vli unpadded_size, lzma_vli uncompressed_size)
{
	if (unpadded_size < UNPADDED_SIZE_MIN
			|| unpadded_size > UNPADDED_SIZE_MAX
			|| uncompressed_size > LZMA_VLI_MAX)
		return LZMA_PROG_ERROR;

	// This looks a bit ugly. We want to first validate that the Index
	// and Stream stay in valid limits after adding this Record. After
	// validating, we may need to allocate a new lzma_index_group (it's
	// slightly more correct to validate before allocating, YMMV).
	lzma_ret ret;

	// First update the overall info so we can validate it.
	const lzma_vli index_list_size_add = lzma_vli_size(unpadded_size)
			+ lzma_vli_size(uncompressed_size);

	const lzma_vli total_size = vli_ceil4(unpadded_size);

	i->total_size += total_size;
	i->uncompressed_size += uncompressed_size;
	++i->count;
	i->index_list_size += index_list_size_add;

	if (i->total_size > LZMA_VLI_MAX
			|| i->uncompressed_size > LZMA_VLI_MAX
			|| lzma_index_size(i) > LZMA_BACKWARD_SIZE_MAX
			|| lzma_index_file_size(i) > LZMA_VLI_MAX)
		ret = LZMA_DATA_ERROR; // Would grow past the limits.
	else
		ret = index_append_real(i, allocator, unpadded_size,
				uncompressed_size, false);

	if (ret != LZMA_OK) {
		// Something went wrong. Undo the updates.
		i->total_size -= total_size;
		i->uncompressed_size -= uncompressed_size;
		--i->count;
		i->index_list_size -= index_list_size_add;
	}

	return ret;
}


/// Initialize i->current to point to the first Record.
static bool
init_current(lzma_index *i)
{
	if (i->head == NULL) {
		assert(i->count == 0);
		return true;
	}

	assert(i->count > 0);

	i->current.group = i->head;
	i->current.record = 0;
	i->current.stream_offset = LZMA_STREAM_HEADER_SIZE;
	i->current.uncompressed_offset = 0;

	return false;
}


/// Go backward to the previous group.
static void
previous_group(lzma_index *i)
{
	assert(i->current.group->prev != NULL);

	// Go to the previous group first.
	i->current.group = i->current.group->prev;
	i->current.record = i->current.group->last;

	// Then update the offsets.
	i->current.stream_offset -= vli_ceil4(i->current.group->unpadded_sums[
			i->current.group->last]);
	i->current.uncompressed_offset -= i->current.group->uncompressed_sums[
			i->current.group->last];

	return;
}


/// Go forward to the next group.
static void
next_group(lzma_index *i)
{
	assert(i->current.group->next != NULL);

	// Update the offsets first.
	i->current.stream_offset += vli_ceil4(i->current.group->unpadded_sums[
			i->current.group->last]);
	i->current.uncompressed_offset += i->current.group
			->uncompressed_sums[i->current.group->last];

	// Then go to the next group.
	i->current.record = 0;
	i->current.group = i->current.group->next;

	return;
}


/// Set *info from i->current.
static void
set_info(const lzma_index *i, lzma_index_record *info)
{
	// First copy the cumulative sizes from the current Record of the
	// current group.
	info->unpadded_size
			= i->current.group->unpadded_sums[i->current.record];
	info->total_size = vli_ceil4(info->unpadded_size);
	info->uncompressed_size = i->current.group->uncompressed_sums[
			i->current.record];

	// Copy the start offsets of this group.
	info->stream_offset = i->current.stream_offset;
	info->uncompressed_offset = i->current.uncompressed_offset;

	// If it's not the first Record in this group, we need to do some
	// adjustements.
	if (i->current.record > 0) {
		// Since the _sums[] are cumulative, we substract the sums of
		// the previous Record to get the sizes of the current Record,
		// and add the sums of the previous Record to the offsets.
		// With unpadded_sums[] we need to take into account that it
		// uses a bit weird way to do the cumulative summing
		const lzma_vli total_sum
				= vli_ceil4(i->current.group->unpadded_sums[
					i->current.record - 1]);

		const lzma_vli uncompressed_sum = i->current.group
				->uncompressed_sums[i->current.record - 1];

		info->total_size -= total_sum;
		info->unpadded_size -= total_sum;
		info->uncompressed_size -= uncompressed_sum;

		info->stream_offset += total_sum;
		info->uncompressed_offset += uncompressed_sum;
	}

	return;
}


extern LZMA_API(lzma_bool)
lzma_index_read(lzma_index *i, lzma_index_record *info)
{
	if (i->current.group == NULL) {
		// We are at the beginning of the Record list. Set up
		// i->current point at the first Record. Return if there
		// are no Records.
		if (init_current(i))
			return true;
	} else do {
		// Try to go the next Record.
		if (i->current.record < i->current.group->last)
			++i->current.record;
		else if (i->current.group->next == NULL)
			return true;
		else
			next_group(i);
	} while (i->current.group->paddings[i->current.record]);

	// We found a new Record. Set the information to *info.
	set_info(i, info);

	return false;
}


extern LZMA_API(void)
lzma_index_rewind(lzma_index *i)
{
	i->current.group = NULL;
	return;
}


extern LZMA_API(lzma_bool)
lzma_index_locate(lzma_index *i, lzma_index_record *info, lzma_vli target)
{
	// Check if it is possible to fullfill the request.
	if (target >= i->uncompressed_size)
		return true;

	// Now we know that we will have an answer. Initialize the current
	// read position if needed.
	if (i->current.group == NULL && init_current(i))
		return true;

	// Locate the group where the wanted Block is. First search forward.
	while (i->current.uncompressed_offset <= target) {
		// If the first uncompressed byte of the next group is past
		// the target offset, it has to be this or an earlier group.
		if (i->current.uncompressed_offset + i->current.group
				->uncompressed_sums[i->current.group->last]
				> target)
			break;

		// Go forward to the next group.
		next_group(i);
	}

	// Then search backward.
	while (i->current.uncompressed_offset > target)
		previous_group(i);

	// Now the target Block is somewhere in i->current.group. Offsets
	// in groups are relative to the beginning of the group, thus
	// we must adjust the target before starting the search loop.
	assert(target >= i->current.uncompressed_offset);
	target -= i->current.uncompressed_offset;

	// Use binary search to locate the exact Record. It is the first
	// Record whose uncompressed_sums[] value is greater than target.
	// This is because we want the rightmost Record that fullfills the
	// search criterion. It is possible that there are empty Blocks or
	// padding, we don't want to return them.
	size_t left = 0;
	size_t right = i->current.group->last;

	while (left < right) {
		const size_t pos = left + (right - left) / 2;
		if (i->current.group->uncompressed_sums[pos] <= target)
			left = pos + 1;
		else
			right = pos;
	}

	i->current.record = left;

#ifndef NDEBUG
	// The found Record must not be padding or have zero uncompressed size.
	assert(!i->current.group->paddings[i->current.record]);

	if (i->current.record == 0)
		assert(i->current.group->uncompressed_sums[0] > 0);
	else
		assert(i->current.group->uncompressed_sums[i->current.record]
				- i->current.group->uncompressed_sums[
					i->current.record - 1] > 0);
#endif

	set_info(i, info);

	return false;
}


extern LZMA_API(lzma_ret)
lzma_index_cat(lzma_index *restrict dest, lzma_index *restrict src,
		lzma_allocator *allocator, lzma_vli padding)
{
	if (dest == NULL || src == NULL || dest == src
			|| padding > LZMA_VLI_MAX)
		return LZMA_PROG_ERROR;

	// Check that the combined size of the Indexes stays within limits.
	{
		const lzma_vli dest_size = index_size_unpadded(
				dest->count, dest->index_list_size);
		const lzma_vli src_size = index_size_unpadded(
				src->count, src->index_list_size);
		if (vli_ceil4(dest_size + src_size) > LZMA_BACKWARD_SIZE_MAX)
			return LZMA_DATA_ERROR;
	}

	// Check that the combined size of the "files" (combined total
	// encoded sizes) stays within limits.
	{
		const lzma_vli dest_size = lzma_index_file_size(dest);
		const lzma_vli src_size = lzma_index_file_size(src);
		if (dest_size + src_size > LZMA_VLI_MAX
				|| dest_size + src_size + padding
					> LZMA_VLI_MAX)
			return LZMA_DATA_ERROR;
	}

	// Add a padding Record to take into account the size of
	// Index + Stream Footer + Stream Padding + Stream Header.
	//
	// NOTE: This cannot overflow, because Index Size is always
	// far smaller than LZMA_VLI_MAX, and adding two VLIs
	// (Index Size and padding) doesn't overflow.
	padding += index_size(dest->count - dest->old.count,
				dest->index_list_size
					- dest->old.index_list_size)
			+ LZMA_STREAM_HEADER_SIZE * 2;

	// While the above cannot overflow, but it may become an invalid VLI.
	if (padding > LZMA_VLI_MAX)
		return LZMA_DATA_ERROR;

	// Add the padding Record.
	{
		lzma_ret ret;

		// First update the info so we can validate it.
		dest->old.streams_size += padding;

		if (dest->old.streams_size > LZMA_VLI_MAX
				|| lzma_index_file_size(dest) > LZMA_VLI_MAX)
			ret = LZMA_DATA_ERROR; // Would grow past the limits.
		else
			ret = index_append_real(dest, allocator,
					padding, 0, true);

		// If something went wrong, undo the updated value and return
		// the error.
		if (ret != LZMA_OK) {
			dest->old.streams_size -= padding;
			return ret;
		}
	}

	// Avoid wasting lots of memory if src->head has only a few records
	// that fit into dest->tail. That is, combine two groups if possible.
	//
	// NOTE: We know that dest->tail != NULL since we just appended
	// a padding Record. But we don't know about src->head.
	if (src->head != NULL && src->head->last + 1
			<= INDEX_GROUP_SIZE - dest->tail->last - 1) {
		// Copy the first Record.
		dest->tail->unpadded_sums[dest->tail->last + 1]
				= vli_ceil4(dest->tail->unpadded_sums[
					dest->tail->last])
				+ src->head->unpadded_sums[0];

		dest->tail->uncompressed_sums[dest->tail->last + 1]
			= dest->tail->uncompressed_sums[dest->tail->last]
				+ src->head->uncompressed_sums[0];

		dest->tail->paddings[dest->tail->last + 1]
				= src->head->paddings[0];

		++dest->tail->last;

		// Copy the rest.
		for (size_t i = 1; i < src->head->last; ++i) {
			dest->tail->unpadded_sums[dest->tail->last + 1]
				= vli_ceil4(dest->tail->unpadded_sums[
						dest->tail->last])
					+ src->head->unpadded_sums[i + 1]
					- src->head->unpadded_sums[i];

			dest->tail->uncompressed_sums[dest->tail->last + 1]
				= dest->tail->uncompressed_sums[
						dest->tail->last]
					+ src->head->uncompressed_sums[i + 1]
					- src->head->uncompressed_sums[i];

			dest->tail->paddings[dest->tail->last + 1]
				= src->head->paddings[i + 1];

			++dest->tail->last;
		}

		// Free the head group of *src. Don't bother updating prev
		// pointers since those won't be used for anything before
		// we deallocate the whole *src structure.
		lzma_index_group *tmp = src->head;
		src->head = src->head->next;
		lzma_free(tmp, allocator);
	}

	// If there are groups left in *src, join them as is. Note that if we
	// are combining already combined Indexes, src->head can be non-NULL
	// even if we just combined the old src->head to dest->tail.
	if (src->head != NULL) {
		src->head->prev = dest->tail;
		dest->tail->next = src->head;
		dest->tail = src->tail;
	}

	// Update information about earlier Indexes. Only the last Index
	// from *src won't be counted in dest->old. The last Index is left
	// open and can be even appended with lzma_index_append().
	dest->old.count = dest->count + src->old.count;
	dest->old.index_list_size
			= dest->index_list_size + src->old.index_list_size;
	dest->old.streams_size += src->old.streams_size;

	// Update overall information.
	dest->total_size += src->total_size;
	dest->uncompressed_size += src->uncompressed_size;
	dest->count += src->count;
	dest->index_list_size += src->index_list_size;

	// *src has nothing left but the base structure.
	lzma_free(src, allocator);

	return LZMA_OK;
}


extern LZMA_API(lzma_index *)
lzma_index_dup(const lzma_index *src, lzma_allocator *allocator)
{
	lzma_index *dest = lzma_alloc(sizeof(lzma_index), allocator);
	if (dest == NULL)
		return NULL;

	// Copy the base structure except the pointers.
	*dest = *src;
	dest->head = NULL;
	dest->tail = NULL;
	dest->current.group = NULL;

	// Copy the Records.
	const lzma_index_group *src_group = src->head;
	while (src_group != NULL) {
		// Allocate a new group.
		lzma_index_group *dest_group = lzma_alloc(
				sizeof(lzma_index_group), allocator);
		if (dest_group == NULL) {
			lzma_index_end(dest, allocator);
			return NULL;
		}

		// Set the pointers.
		dest_group->prev = dest->tail;
		dest_group->next = NULL;

		if (dest->head == NULL)
			dest->head = dest_group;
		else
			dest->tail->next = dest_group;

		dest->tail = dest_group;

		dest_group->last = src_group->last;

		// Copy the arrays so that we don't read uninitialized memory.
		const size_t count = src_group->last + 1;
		memcpy(dest_group->unpadded_sums, src_group->unpadded_sums,
				sizeof(lzma_vli) * count);
		memcpy(dest_group->uncompressed_sums,
				src_group->uncompressed_sums,
				sizeof(lzma_vli) * count);
		memcpy(dest_group->paddings, src_group->paddings,
				sizeof(bool) * count);

		// Copy also the read position.
		if (src_group == src->current.group)
			dest->current.group = dest->tail;

		src_group = src_group->next;
	}

	return dest;
}


extern LZMA_API(lzma_bool)
lzma_index_equal(const lzma_index *a, const lzma_index *b)
{
	// No point to compare more if the pointers are the same.
	if (a == b)
		return true;

	// Compare the basic properties.
	if (a->total_size != b->total_size
			|| a->uncompressed_size != b->uncompressed_size
			|| a->index_list_size != b->index_list_size
			|| a->count != b->count)
		return false;

	// Compare the Records.
	const lzma_index_group *ag = a->head;
	const lzma_index_group *bg = b->head;
	while (ag != NULL && bg != NULL) {
		const size_t count = ag->last + 1;
		if (ag->last != bg->last
				|| memcmp(ag->unpadded_sums,
					bg->unpadded_sums,
					sizeof(lzma_vli) * count) != 0
				|| memcmp(ag->uncompressed_sums,
					bg->uncompressed_sums,
					sizeof(lzma_vli) * count) != 0
				|| memcmp(ag->paddings, bg->paddings,
					sizeof(bool) * count) != 0)
			return false;

		ag = ag->next;
		bg = bg->next;
	}

	return ag == NULL && bg == NULL;
}