Create sparse files by default when decompressing into a regular file.

Sparse file creation can be disabled with --no-sparse.
I don't promise yet that the name of this option won't
change before 5.0.0. It's possible that the code that
checks when it is safe to use sparse output on stdout
is not good enough, and that a more flexible command line
option will be needed to configure sparse file handling.
This commit is contained in:
Lasse Collin 2009-11-25 11:19:20 +02:00
parent 37de544414
commit 465d1b0d65
6 changed files with 272 additions and 56 deletions

View File

@ -43,6 +43,7 @@ parse_real(args_info *args, int argc, char **argv)
OPT_LZMA1, OPT_LZMA1,
OPT_LZMA2, OPT_LZMA2,
OPT_NO_SPARSE,
OPT_FILES, OPT_FILES,
OPT_FILES0, OPT_FILES0,
OPT_INFO_MEMORY, OPT_INFO_MEMORY,
@ -65,6 +66,7 @@ parse_real(args_info *args, int argc, char **argv)
{ "force", no_argument, NULL, 'f' }, { "force", no_argument, NULL, 'f' },
{ "stdout", no_argument, NULL, 'c' }, { "stdout", no_argument, NULL, 'c' },
{ "to-stdout", no_argument, NULL, 'c' }, { "to-stdout", no_argument, NULL, 'c' },
{ "no-sparse", no_argument, NULL, OPT_NO_SPARSE },
{ "suffix", required_argument, NULL, 'S' }, { "suffix", required_argument, NULL, 'S' },
// { "recursive", no_argument, NULL, 'r' }, // TODO // { "recursive", no_argument, NULL, 'r' }, // TODO
{ "files", optional_argument, NULL, OPT_FILES }, { "files", optional_argument, NULL, OPT_FILES },
@ -339,6 +341,10 @@ parse_real(args_info *args, int argc, char **argv)
break; break;
} }
case OPT_NO_SPARSE:
io_no_sparse();
break;
case OPT_FILES: case OPT_FILES:
args->files_delim = '\n'; args->files_delim = '\n';

View File

@ -33,8 +33,8 @@ static lzma_stream strm = LZMA_STREAM_INIT;
static lzma_filter filters[LZMA_FILTERS_MAX + 1]; static lzma_filter filters[LZMA_FILTERS_MAX + 1];
/// Input and output buffers /// Input and output buffers
static uint8_t in_buf[IO_BUFFER_SIZE]; static io_buf in_buf;
static uint8_t out_buf[IO_BUFFER_SIZE]; static io_buf out_buf;
/// Number of filters. Zero indicates that we are using a preset. /// Number of filters. Zero indicates that we are using a preset.
static size_t filters_count = 0; static size_t filters_count = 0;
@ -275,7 +275,7 @@ coder_set_compression_settings(void)
static bool static bool
is_format_xz(void) is_format_xz(void)
{ {
return strm.avail_in >= 6 && memcmp(in_buf, "\3757zXZ", 6) == 0; return strm.avail_in >= 6 && memcmp(in_buf.u8, "\3757zXZ", 6) == 0;
} }
@ -289,7 +289,7 @@ is_format_lzma(void)
// Decode the LZMA1 properties. // Decode the LZMA1 properties.
lzma_filter filter = { .id = LZMA_FILTER_LZMA1 }; lzma_filter filter = { .id = LZMA_FILTER_LZMA1 };
if (lzma_properties_decode(&filter, NULL, in_buf, 5) != LZMA_OK) if (lzma_properties_decode(&filter, NULL, in_buf.u8, 5) != LZMA_OK)
return false; return false;
// A hack to ditch tons of false positives: We allow only dictionary // A hack to ditch tons of false positives: We allow only dictionary
@ -317,7 +317,7 @@ is_format_lzma(void)
// Again, if someone complains, this will be reconsidered. // Again, if someone complains, this will be reconsidered.
uint64_t uncompressed_size = 0; uint64_t uncompressed_size = 0;
for (size_t i = 0; i < 8; ++i) for (size_t i = 0; i < 8; ++i)
uncompressed_size |= (uint64_t)(in_buf[5 + i]) << (i * 8); uncompressed_size |= (uint64_t)(in_buf.u8[5 + i]) << (i * 8);
if (uncompressed_size != UINT64_MAX if (uncompressed_size != UINT64_MAX
&& uncompressed_size > (UINT64_C(1) << 38)) && uncompressed_size > (UINT64_C(1) << 38))
@ -444,15 +444,16 @@ coder_normal(file_pair *pair)
// Assume that something goes wrong. // Assume that something goes wrong.
bool success = false; bool success = false;
strm.next_out = out_buf; strm.next_out = out_buf.u8;
strm.avail_out = IO_BUFFER_SIZE; strm.avail_out = IO_BUFFER_SIZE;
while (!user_abort) { while (!user_abort) {
// Fill the input buffer if it is empty and we haven't reached // Fill the input buffer if it is empty and we haven't reached
// end of file yet. // end of file yet.
if (strm.avail_in == 0 && !pair->src_eof) { if (strm.avail_in == 0 && !pair->src_eof) {
strm.next_in = in_buf; strm.next_in = in_buf.u8;
strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE); strm.avail_in = io_read(
pair, &in_buf, IO_BUFFER_SIZE);
if (strm.avail_in == SIZE_MAX) if (strm.avail_in == SIZE_MAX)
break; break;
@ -466,11 +467,11 @@ coder_normal(file_pair *pair)
// Write out if the output buffer became full. // Write out if the output buffer became full.
if (strm.avail_out == 0) { if (strm.avail_out == 0) {
if (opt_mode != MODE_TEST && io_write(pair, out_buf, if (opt_mode != MODE_TEST && io_write(pair, &out_buf,
IO_BUFFER_SIZE - strm.avail_out)) IO_BUFFER_SIZE - strm.avail_out))
break; break;
strm.next_out = out_buf; strm.next_out = out_buf.u8;
strm.avail_out = IO_BUFFER_SIZE; strm.avail_out = IO_BUFFER_SIZE;
} }
@ -487,7 +488,7 @@ coder_normal(file_pair *pair)
// when trying to get at least some useful // when trying to get at least some useful
// data out of damaged files. // data out of damaged files.
if (opt_mode != MODE_TEST && io_write(pair, if (opt_mode != MODE_TEST && io_write(pair,
out_buf, IO_BUFFER_SIZE &out_buf, IO_BUFFER_SIZE
- strm.avail_out)) - strm.avail_out))
break; break;
} }
@ -502,7 +503,7 @@ coder_normal(file_pair *pair)
// input, and thus pair->src_eof // input, and thus pair->src_eof
// becomes true. // becomes true.
strm.avail_in = io_read( strm.avail_in = io_read(
pair, in_buf, 1); pair, &in_buf, 1);
if (strm.avail_in == SIZE_MAX) if (strm.avail_in == SIZE_MAX)
break; break;
@ -579,14 +580,14 @@ coder_passthru(file_pair *pair)
if (user_abort) if (user_abort)
return false; return false;
if (io_write(pair, in_buf, strm.avail_in)) if (io_write(pair, &in_buf, strm.avail_in))
return false; return false;
strm.total_in += strm.avail_in; strm.total_in += strm.avail_in;
strm.total_out = strm.total_in; strm.total_out = strm.total_in;
message_progress_update(); message_progress_update();
strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE); strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE);
if (strm.avail_in == SIZE_MAX) if (strm.avail_in == SIZE_MAX)
return false; return false;
} }
@ -613,8 +614,8 @@ coder_run(const char *filename)
// Read the first chunk of input data. This is needed to detect // Read the first chunk of input data. This is needed to detect
// the input file type (for now, only for decompression). // the input file type (for now, only for decompression).
strm.next_in = in_buf; strm.next_in = in_buf.u8;
strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE); strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE);
switch (coder_init(pair)) { switch (coder_init(pair)) {
case CODER_INIT_NORMAL: case CODER_INIT_NORMAL:

View File

@ -37,6 +37,17 @@ static bool warn_fchown;
#endif #endif
/// If true, try to create sparse files when decompressing.
static bool try_sparse = true;
/// File status flags of standard output. This is used by io_open_dest()
/// and io_close_dest().
static int stdout_flags = 0;
static bool io_write_buf(file_pair *pair, const uint8_t *buf, size_t size);
extern void extern void
io_init(void) io_init(void)
{ {
@ -63,6 +74,14 @@ io_init(void)
} }
/// \brief      Disable creation of sparse files when decompressing
///
/// Clears the file-scope try_sparse flag, which io_open_dest() checks
/// before it enables sparse output for a destination file.
extern void
io_no_sparse(void)
{
	try_sparse = false;
}
/// \brief Unlink a file /// \brief Unlink a file
/// ///
/// This tries to verify that the file being unlinked really is the file that /// This tries to verify that the file being unlinked really is the file that
@ -498,42 +517,42 @@ io_open_dest(file_pair *pair)
#ifdef TUKLIB_DOSLIKE #ifdef TUKLIB_DOSLIKE
setmode(STDOUT_FILENO, O_BINARY); setmode(STDOUT_FILENO, O_BINARY);
#endif #endif
return false; } else {
} pair->dest_name = suffix_get_dest_name(pair->src_name);
if (pair->dest_name == NULL)
return true;
pair->dest_name = suffix_get_dest_name(pair->src_name); // If --force was used, unlink the target file first.
if (pair->dest_name == NULL) if (opt_force && unlink(pair->dest_name) && errno != ENOENT) {
return true; message_error("%s: Cannot unlink: %s",
pair->dest_name, strerror(errno));
free(pair->dest_name);
return true;
}
// If --force was used, unlink the target file first. if (opt_force && unlink(pair->dest_name) && errno != ENOENT) {
if (opt_force && unlink(pair->dest_name) && errno != ENOENT) { message_error("%s: Cannot unlink: %s",
message_error("%s: Cannot unlink: %s", pair->dest_name, strerror(errno));
pair->dest_name, strerror(errno)); free(pair->dest_name);
free(pair->dest_name); return true;
return true; }
}
if (opt_force && unlink(pair->dest_name) && errno != ENOENT) { // Open the file.
message_error("%s: Cannot unlink: %s", pair->dest_name, const int flags = O_WRONLY | O_BINARY | O_NOCTTY
strerror(errno)); | O_CREAT | O_EXCL;
free(pair->dest_name); const mode_t mode = S_IRUSR | S_IWUSR;
return true; pair->dest_fd = open(pair->dest_name, flags, mode);
}
// Open the file. if (pair->dest_fd == -1) {
const int flags = O_WRONLY | O_BINARY | O_NOCTTY | O_CREAT | O_EXCL; // Don't bother with error message if user requested
const mode_t mode = S_IRUSR | S_IWUSR; // us to exit anyway.
pair->dest_fd = open(pair->dest_name, flags, mode); if (!user_abort)
message_error("%s: %s", pair->dest_name,
strerror(errno));
if (pair->dest_fd == -1) { free(pair->dest_name);
// Don't bother with error message if user requested return true;
// us to exit anyway. }
if (!user_abort)
message_error("%s: %s", pair->dest_name,
strerror(errno));
free(pair->dest_name);
return true;
} }
// If this really fails... well, we have a safe fallback. // If this really fails... well, we have a safe fallback.
@ -545,6 +564,65 @@ io_open_dest(file_pair *pair)
#elif !defined(TUKLIB_DOSLIKE) #elif !defined(TUKLIB_DOSLIKE)
pair->dest_st.st_dev = 0; pair->dest_st.st_dev = 0;
pair->dest_st.st_ino = 0; pair->dest_st.st_ino = 0;
#endif
#ifndef TUKLIB_DOSLIKE
} else if (try_sparse && opt_mode == MODE_DECOMPRESS) {
// When writing to standard output, we need to be extra
// careful:
// - It may be connected to something else than
// a regular file.
// - We aren't necessarily writing to a new empty file
// or to the end of an existing file.
// - O_APPEND may be active.
//
// TODO: I'm keeping this disabled for DOS-like systems
// for now. FAT doesn't support sparse files, but NTFS
// does, so maybe this should be enabled on Windows after
// some testing.
// Sparse output on standard output is attempted only when stdout is
// a regular file and writing is going to start exactly at the end of
// that file. In every other case we return early, which skips the
// code below that enables sparse mode for this file pair.
if (pair->dest_fd == STDOUT_FILENO) {
	if (!S_ISREG(pair->dest_st.st_mode))
		return false;

	// Read the current file status flags of stdout.
	const int flags = fcntl(STDOUT_FILENO, F_GETFL);
	if (flags == -1)
		return false;

	if (flags & O_APPEND) {
		// Creating a sparse file is not possible
		// when O_APPEND is active (it's used by
		// shell's >> redirection). As I understand
		// it, it is safe to temporarily disable
		// O_APPEND in xz, because if someone
		// happened to write to the same file at the
		// same time, results would be bad anyway
		// (users shouldn't assume that xz uses any
		// specific block size when writing data).
		//
		// The write position may be something else
		// than the end of the file, so we must fix
		// it to start writing at the end of the file
		// to imitate O_APPEND.
		if (lseek(STDOUT_FILENO, 0, SEEK_END) == -1)
			return false;

		// Clear only O_APPEND and keep every other status
		// flag as it was. The mask must be applied to the
		// flags we just read: stdout_flags is still zero
		// at this point, so using it here would reset all
		// the other settable flags too.
		//
		// fcntl() signals failure with -1; other return
		// values indicate success.
		if (fcntl(STDOUT_FILENO, F_SETFL,
				flags & ~O_APPEND) == -1)
			return false;

		// Remember the original flags so that
		// io_close_dest() can restore them.
		stdout_flags = flags;

	} else if (lseek(STDOUT_FILENO, 0, SEEK_CUR)
			!= pair->dest_st.st_size) {
		// Writing won't start exactly at the end
		// of the file. We cannot use sparse output,
		// because it would probably corrupt the file.
		return false;
	}
}
pair->dest_try_sparse = true;
#endif #endif
} }
@ -562,6 +640,21 @@ io_open_dest(file_pair *pair)
static int static int
io_close_dest(file_pair *pair, bool success) io_close_dest(file_pair *pair, bool success)
{ {
// If io_open_dest() has disabled O_APPEND, restore it here.
if (stdout_flags != 0) {
assert(pair->dest_fd == STDOUT_FILENO);
const int fail = fcntl(STDOUT_FILENO, F_SETFL, stdout_flags);
stdout_flags = 0;
if (fail) {
message_error(_("Error restoring the O_APPEND flag "
"to standard output: %s"),
strerror(errno));
return -1;
}
}
if (pair->dest_fd == -1 || pair->dest_fd == STDOUT_FILENO) if (pair->dest_fd == -1 || pair->dest_fd == STDOUT_FILENO)
return 0; return 0;
@ -603,6 +696,8 @@ io_open(const char *src_name)
.src_fd = -1, .src_fd = -1,
.dest_fd = -1, .dest_fd = -1,
.src_eof = false, .src_eof = false,
.dest_try_sparse = false,
.dest_pending_sparse = 0,
}; };
// Block the signals, for which we have a custom signal handler, so // Block the signals, for which we have a custom signal handler, so
@ -629,6 +724,29 @@ io_open(const char *src_name)
extern void extern void
io_close(file_pair *pair, bool success) io_close(file_pair *pair, bool success)
{ {
// Take care of sparseness at the end of the output file.
if (success && pair->dest_try_sparse
&& pair->dest_pending_sparse > 0) {
// Seek forward one byte less than the size of the pending
// hole, then write one zero-byte. This way the file grows
// to its correct size. An alternative would be to use
// ftruncate() but that isn't portable enough (e.g. it
// doesn't work with FAT on Linux; FAT isn't that important
// since it doesn't support sparse files anyway, but we don't
// want to create corrupt files on it).
if (lseek(pair->dest_fd, pair->dest_pending_sparse - 1,
SEEK_CUR) == -1) {
message_error(_("%s: Seeking failed when trying "
"to create a sparse file: %s"),
pair->dest_name, strerror(errno));
success = false;
} else {
const uint8_t zero[1] = { '\0' };
if (io_write_buf(pair, zero, 1))
success = false;
}
}
signals_block(); signals_block();
if (success && pair->dest_fd != STDOUT_FILENO) if (success && pair->dest_fd != STDOUT_FILENO)
@ -651,11 +769,12 @@ io_close(file_pair *pair, bool success)
extern size_t extern size_t
io_read(file_pair *pair, uint8_t *buf, size_t size) io_read(file_pair *pair, io_buf *buf_union, size_t size)
{ {
// We use small buffers here. // We use small buffers here.
assert(size < SSIZE_MAX); assert(size < SSIZE_MAX);
uint8_t *buf = buf_union->u8;
size_t left = size; size_t left = size;
while (left > 0) { while (left > 0) {
@ -691,8 +810,21 @@ io_read(file_pair *pair, uint8_t *buf, size_t size)
} }
extern bool static bool
io_write(const file_pair *pair, const uint8_t *buf, size_t size) is_sparse(const io_buf *buf)
{
assert(IO_BUFFER_SIZE % sizeof(uint64_t) == 0);
for (size_t i = 0; i < ARRAY_SIZE(buf->u64); ++i)
if (buf->u64[i] != 0)
return false;
return true;
}
static bool
io_write_buf(file_pair *pair, const uint8_t *buf, size_t size)
{ {
assert(size < SSIZE_MAX); assert(size < SSIZE_MAX);
@ -731,3 +863,46 @@ io_write(const file_pair *pair, const uint8_t *buf, size_t size)
return false; return false;
} }
/// \brief      Writes a buffer to the destination file, possibly as a hole
///
/// When sparse output is enabled for the pair, a completely full buffer
/// that contains only zeros is not written. Its size is added to
/// pair->dest_pending_sparse instead, and the hole is skipped with lseek()
/// once the next block with real data arrives.
///
/// \param      pair    File pair having the destination file open
/// \param      buf     Buffer holding the data to be written
/// \param      size    Number of bytes in buf; at most IO_BUFFER_SIZE
///
/// \return     false on success, true if seeking or writing failed.
extern bool
io_write(file_pair *pair, const io_buf *buf, size_t size)
{
	assert(size <= IO_BUFFER_SIZE);

	// Without sparse handling the data is written out as is.
	if (!pair->dest_try_sparse)
		return io_write_buf(pair, buf->u8, size);

	// io_close() expects dest_pending_sparse > 0 if the file ends
	// with a sparse block, so an empty write must not reach the
	// lseek() below and reset the pending count.
	if (size == 0)
		return false;

	// Only completely full buffers are checked for sparseness. If the
	// block is sparse (contains only zeros), just record its size. The
	// hole is skipped when the next data block is written or when the
	// file is closed.
	if (size == IO_BUFFER_SIZE && is_sparse(buf)) {
		pair->dest_pending_sparse += size;
		return false;
	}

	// This block has real data. If a hole is pending, seek past it
	// before writing.
	const bool hole_pending = pair->dest_pending_sparse > 0;
	if (hole_pending) {
		const off_t pos = lseek(pair->dest_fd,
				pair->dest_pending_sparse, SEEK_CUR);
		if (pos == -1) {
			message_error(_("%s: Seeking failed when "
					"trying to create a sparse "
					"file: %s"), pair->dest_name,
					strerror(errno));
			return true;
		}

		pair->dest_pending_sparse = 0;
	}

	return io_write_buf(pair, buf->u8, size);
}

View File

@ -11,13 +11,22 @@
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Some systems have suboptimal BUFSIZ. Use a bit bigger value on them. // Some systems have suboptimal BUFSIZ. Use a bit bigger value on them.
// IO_BUFFER_SIZE must also be a multiple of 8 (sizeof(uint64_t)).
#if BUFSIZ <= 1024 #if BUFSIZ <= 1024
# define IO_BUFFER_SIZE 8192 # define IO_BUFFER_SIZE 8192
#else #else
# define IO_BUFFER_SIZE BUFSIZ # define IO_BUFFER_SIZE (BUFSIZ & ~7U)
#endif #endif
/// \brief      I/O buffer that can be accessed as bytes or as 64-bit words
///
/// is_sparse() accesses the buffer as uint64_t for maximum speed.
/// Use a union to make sure that the buffer is properly aligned for
/// that access.
typedef union {
/// Byte view of the buffer.
uint8_t u8[IO_BUFFER_SIZE];
/// 64-bit word view of the same memory. IO_BUFFER_SIZE has to be
/// a multiple of sizeof(uint64_t) for this view to cover the whole
/// buffer.
uint64_t u64[IO_BUFFER_SIZE / sizeof(uint64_t)];
} io_buf;
typedef struct { typedef struct {
/// Name of the source filename (as given on the command line) or /// Name of the source filename (as given on the command line) or
/// pointer to static "(stdin)" when reading from standard input. /// pointer to static "(stdin)" when reading from standard input.
@ -33,15 +42,24 @@ typedef struct {
/// File descriptor of the target file /// File descriptor of the target file
int dest_fd; int dest_fd;
/// True once end of the source file has been detected.
bool src_eof;
/// If true, we look for long chunks of zeros and try to create
/// a sparse file.
bool dest_try_sparse;
/// This is used only if dest_try_sparse is true. This holds the
/// number of zero bytes we haven't written out, because we plan
/// to make that byte range a sparse chunk.
off_t dest_pending_sparse;
/// Stat of the source file. /// Stat of the source file.
struct stat src_st; struct stat src_st;
/// Stat of the destination file. /// Stat of the destination file.
struct stat dest_st; struct stat dest_st;
/// True once end of the source file has been detected.
bool src_eof;
} file_pair; } file_pair;
@ -49,6 +67,10 @@ typedef struct {
extern void io_init(void); extern void io_init(void);
/// \brief Disable creation of sparse files when decompressing
extern void io_no_sparse(void);
/// \brief Opens a file pair /// \brief Opens a file pair
extern file_pair *io_open(const char *src_name); extern file_pair *io_open(const char *src_name);
@ -72,7 +94,7 @@ extern void io_close(file_pair *pair, bool success);
/// \return On success, number of bytes read is returned. On end of /// \return On success, number of bytes read is returned. On end of
/// file zero is returned and pair->src_eof set to true. /// file zero is returned and pair->src_eof set to true.
/// On error, SIZE_MAX is returned and error message printed. /// On error, SIZE_MAX is returned and error message printed.
extern size_t io_read(file_pair *pair, uint8_t *buf, size_t size); extern size_t io_read(file_pair *pair, io_buf *buf, size_t size);
/// \brief Writes a buffer to the destination file /// \brief Writes a buffer to the destination file
@ -83,4 +105,4 @@ extern size_t io_read(file_pair *pair, uint8_t *buf, size_t size);
/// ///
/// \return On success, zero is returned. On error, -1 is returned /// \return On success, zero is returned. On error, -1 is returned
/// and error message printed. /// and error message printed.
extern bool io_write(const file_pair *pair, const uint8_t *buf, size_t size); extern bool io_write(file_pair *pair, const io_buf *buf, size_t size);

View File

@ -1072,6 +1072,7 @@ message_help(bool long_help)
if (long_help) if (long_help)
puts(_( puts(_(
" --no-sparse do not create sparse files when decompressing\n"
" -S, --suffix=.SUF use the suffix `.SUF' on compressed files\n" " -S, --suffix=.SUF use the suffix `.SUF' on compressed files\n"
" --files=[FILE] read filenames to process from FILE; if FILE is\n" " --files=[FILE] read filenames to process from FILE; if FILE is\n"
" omitted, filenames are read from the standard input;\n" " omitted, filenames are read from the standard input;\n"

View File

@ -336,6 +336,17 @@ Write the compressed or decompressed data to standard output instead of
a file. This implies a file. This implies
.BR \-\-keep . .BR \-\-keep .
.TP .TP
.B \-\-no\-sparse
Disable creation of sparse files. By default, if decompressing into
a regular file,
.B xz
tries to make the file sparse if the decompressed data contains long
sequences of binary zeros. This also works when writing to standard output
as long as standard output is connected to a regular file and certain
additional conditions are met to make it safe. Creating sparse files may
save disk space and speed up the decompression by reducing the amount of
disk I/O.
.TP
\fB\-S\fR \fI.suf\fR, \fB\-\-suffix=\fI.suf \fB\-S\fR \fI.suf\fR, \fB\-\-suffix=\fI.suf
When compressing, use When compressing, use
.I .suf .I .suf