diff --git a/src/xz/args.c b/src/xz/args.c index 75b62205..bb6e27bb 100644 --- a/src/xz/args.c +++ b/src/xz/args.c @@ -43,6 +43,7 @@ parse_real(args_info *args, int argc, char **argv) OPT_LZMA1, OPT_LZMA2, + OPT_NO_SPARSE, OPT_FILES, OPT_FILES0, OPT_INFO_MEMORY, @@ -65,6 +66,7 @@ parse_real(args_info *args, int argc, char **argv) { "force", no_argument, NULL, 'f' }, { "stdout", no_argument, NULL, 'c' }, { "to-stdout", no_argument, NULL, 'c' }, + { "no-sparse", no_argument, NULL, OPT_NO_SPARSE }, { "suffix", required_argument, NULL, 'S' }, // { "recursive", no_argument, NULL, 'r' }, // TODO { "files", optional_argument, NULL, OPT_FILES }, @@ -339,6 +341,10 @@ parse_real(args_info *args, int argc, char **argv) break; } + case OPT_NO_SPARSE: + io_no_sparse(); + break; + case OPT_FILES: args->files_delim = '\n'; diff --git a/src/xz/coder.c b/src/xz/coder.c index 7cf6186f..d58e7e39 100644 --- a/src/xz/coder.c +++ b/src/xz/coder.c @@ -33,8 +33,8 @@ static lzma_stream strm = LZMA_STREAM_INIT; static lzma_filter filters[LZMA_FILTERS_MAX + 1]; /// Input and output buffers -static uint8_t in_buf[IO_BUFFER_SIZE]; -static uint8_t out_buf[IO_BUFFER_SIZE]; +static io_buf in_buf; +static io_buf out_buf; /// Number of filters. Zero indicates that we are using a preset. static size_t filters_count = 0; @@ -275,7 +275,7 @@ coder_set_compression_settings(void) static bool is_format_xz(void) { - return strm.avail_in >= 6 && memcmp(in_buf, "\3757zXZ", 6) == 0; + return strm.avail_in >= 6 && memcmp(in_buf.u8, "\3757zXZ", 6) == 0; } @@ -289,7 +289,7 @@ is_format_lzma(void) // Decode the LZMA1 properties. lzma_filter filter = { .id = LZMA_FILTER_LZMA1 }; - if (lzma_properties_decode(&filter, NULL, in_buf, 5) != LZMA_OK) + if (lzma_properties_decode(&filter, NULL, in_buf.u8, 5) != LZMA_OK) return false; // A hack to ditch tons of false positives: We allow only dictionary @@ -317,7 +317,7 @@ is_format_lzma(void) // Again, if someone complains, this will be reconsidered. uint64_t uncompressed_size = 0; for (size_t i = 0; i < 8; ++i) - uncompressed_size |= (uint64_t)(in_buf[5 + i]) << (i * 8); + uncompressed_size |= (uint64_t)(in_buf.u8[5 + i]) << (i * 8); if (uncompressed_size != UINT64_MAX && uncompressed_size > (UINT64_C(1) << 38)) @@ -444,15 +444,16 @@ coder_normal(file_pair *pair) // Assume that something goes wrong. bool success = false; - strm.next_out = out_buf; + strm.next_out = out_buf.u8; strm.avail_out = IO_BUFFER_SIZE; while (!user_abort) { // Fill the input buffer if it is empty and we haven't reached // end of file yet. if (strm.avail_in == 0 && !pair->src_eof) { - strm.next_in = in_buf; - strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE); + strm.next_in = in_buf.u8; + strm.avail_in = io_read( + pair, &in_buf, IO_BUFFER_SIZE); if (strm.avail_in == SIZE_MAX) break; @@ -466,11 +467,11 @@ coder_normal(file_pair *pair) // Write out if the output buffer became full. if (strm.avail_out == 0) { - if (opt_mode != MODE_TEST && io_write(pair, out_buf, + if (opt_mode != MODE_TEST && io_write(pair, &out_buf, IO_BUFFER_SIZE - strm.avail_out)) break; - strm.next_out = out_buf; + strm.next_out = out_buf.u8; strm.avail_out = IO_BUFFER_SIZE; } @@ -487,7 +488,7 @@ coder_normal(file_pair *pair) // when trying to get at least some useful // data out of damaged files. if (opt_mode != MODE_TEST && io_write(pair, - out_buf, IO_BUFFER_SIZE + &out_buf, IO_BUFFER_SIZE - strm.avail_out)) break; } @@ -502,7 +503,7 @@ coder_normal(file_pair *pair) // input, and thus pair->src_eof // becomes true. strm.avail_in = io_read( - pair, in_buf, 1); + pair, &in_buf, 1); if (strm.avail_in == SIZE_MAX) break; @@ -579,14 +580,14 @@ coder_passthru(file_pair *pair) if (user_abort) return false; - if (io_write(pair, in_buf, strm.avail_in)) + if (io_write(pair, &in_buf, strm.avail_in)) return false; strm.total_in += strm.avail_in; strm.total_out = strm.total_in; message_progress_update(); - strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE); + strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE); if (strm.avail_in == SIZE_MAX) return false; } @@ -613,8 +614,8 @@ coder_run(const char *filename) // Read the first chunk of input data. This is needed to detect // the input file type (for now, only for decompression). - strm.next_in = in_buf; - strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE); + strm.next_in = in_buf.u8; + strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE); switch (coder_init(pair)) { case CODER_INIT_NORMAL: diff --git a/src/xz/file_io.c b/src/xz/file_io.c index b79d0b77..be5db73d 100644 --- a/src/xz/file_io.c +++ b/src/xz/file_io.c @@ -37,6 +37,17 @@ static bool warn_fchown; #endif +/// If true, try to create sparse files when decompressing. +static bool try_sparse = true; + +/// File status flags of standard output. This is used by io_open_dest() +/// and io_close_dest(). +static int stdout_flags = 0; + + +static bool io_write_buf(file_pair *pair, const uint8_t *buf, size_t size); + + extern void io_init(void) { @@ -63,6 +74,14 @@ io_init(void) } +extern void +io_no_sparse(void) +{ + try_sparse = false; + return; +} + + /// \brief Unlink a file /// /// This tries to verify that the file being unlinked really is the file that @@ -498,42 +517,42 @@ io_open_dest(file_pair *pair) #ifdef TUKLIB_DOSLIKE setmode(STDOUT_FILENO, O_BINARY); #endif - return false; - } + } else { + pair->dest_name = suffix_get_dest_name(pair->src_name); + if (pair->dest_name == NULL) + return true; - pair->dest_name = suffix_get_dest_name(pair->src_name); - if (pair->dest_name == NULL) - return true; + // If --force was used, unlink the target file first. + if (opt_force && unlink(pair->dest_name) && errno != ENOENT) { + message_error("%s: Cannot unlink: %s", + pair->dest_name, strerror(errno)); + free(pair->dest_name); + return true; + } - // If --force was used, unlink the target file first. - if (opt_force && unlink(pair->dest_name) && errno != ENOENT) { - message_error("%s: Cannot unlink: %s", - pair->dest_name, strerror(errno)); - free(pair->dest_name); - return true; - } + if (opt_force && unlink(pair->dest_name) && errno != ENOENT) { + message_error("%s: Cannot unlink: %s", + pair->dest_name, strerror(errno)); + free(pair->dest_name); + return true; + } - if (opt_force && unlink(pair->dest_name) && errno != ENOENT) { - message_error("%s: Cannot unlink: %s", pair->dest_name, - strerror(errno)); - free(pair->dest_name); - return true; - } + // Open the file. + const int flags = O_WRONLY | O_BINARY | O_NOCTTY + | O_CREAT | O_EXCL; + const mode_t mode = S_IRUSR | S_IWUSR; + pair->dest_fd = open(pair->dest_name, flags, mode); - // Open the file. - const int flags = O_WRONLY | O_BINARY | O_NOCTTY | O_CREAT | O_EXCL; - const mode_t mode = S_IRUSR | S_IWUSR; - pair->dest_fd = open(pair->dest_name, flags, mode); + if (pair->dest_fd == -1) { + // Don't bother with error message if user requested + // us to exit anyway. + if (!user_abort) + message_error("%s: %s", pair->dest_name, + strerror(errno)); - if (pair->dest_fd == -1) { - // Don't bother with error message if user requested - // us to exit anyway. - if (!user_abort) - message_error("%s: %s", pair->dest_name, - strerror(errno)); - - free(pair->dest_name); - return true; + free(pair->dest_name); + return true; + } } // If this really fails... well, we have a safe fallback. @@ -545,6 +564,65 @@ io_open_dest(file_pair *pair) #elif !defined(TUKLIB_DOSLIKE) pair->dest_st.st_dev = 0; pair->dest_st.st_ino = 0; +#endif +#ifndef TUKLIB_DOSLIKE + } else if (try_sparse && opt_mode == MODE_DECOMPRESS) { + // When writing to standard output, we need to be extra + // careful: + // - It may be connected to something else than + // a regular file. + // - We aren't necessarily writing to a new empty file + // or to the end of an existing file. + // - O_APPEND may be active. + // + // TODO: I'm keeping this disabled for DOS-like systems + // for now. FAT doesn't support sparse files, but NTFS + // does, so maybe this should be enabled on Windows after + // some testing. + if (pair->dest_fd == STDOUT_FILENO) { + if (!S_ISREG(pair->dest_st.st_mode)) + return false; + + const int flags = fcntl(STDOUT_FILENO, F_GETFL); + if (flags == -1) + return false; + + if (flags & O_APPEND) { + // Creating a sparse file is not possible + // when O_APPEND is active (it's used by + // shell's >> redirection). As I understand + // it, it is safe to temporarily disable + // O_APPEND in xz, because if someone + // happened to write to the same file at the + // same time, results would be bad anyway + // (users shouldn't assume that xz uses any + // specific block size when writing data). + // + // The write position may be something else + // than the end of the file, so we must fix + // it to start writing at the end of the file + // to imitate O_APPEND. + if (lseek(STDOUT_FILENO, 0, SEEK_END) == -1) + return false; + + if (fcntl(STDOUT_FILENO, F_SETFL, + stdout_flags & ~O_APPEND)) + return false; + + // Remember the flags so that io_close_dest() + // can restore them. + stdout_flags = flags; + + } else if (lseek(STDOUT_FILENO, 0, SEEK_CUR) + != pair->dest_st.st_size) { + // Writing won't start exactly at the end + // of the file. We cannot use sparse output, + // because it would probably corrupt the file. + return false; + } + } + + pair->dest_try_sparse = true; #endif } @@ -562,6 +640,21 @@ io_open_dest(file_pair *pair) static int io_close_dest(file_pair *pair, bool success) { + // If io_open_dest() has disabled O_APPEND, restore it here. + if (stdout_flags != 0) { + assert(pair->dest_fd == STDOUT_FILENO); + + const int fail = fcntl(STDOUT_FILENO, F_SETFL, stdout_flags); + stdout_flags = 0; + + if (fail) { + message_error(_("Error restoring the O_APPEND flag " + "to standard output: %s"), + strerror(errno)); + return -1; + } + } + if (pair->dest_fd == -1 || pair->dest_fd == STDOUT_FILENO) return 0; @@ -603,6 +696,8 @@ io_open(const char *src_name) .src_fd = -1, .dest_fd = -1, .src_eof = false, + .dest_try_sparse = false, + .dest_pending_sparse = 0, }; // Block the signals, for which we have a custom signal handler, so @@ -629,6 +724,29 @@ io_open(const char *src_name) extern void io_close(file_pair *pair, bool success) { + // Take care of sparseness at the end of the output file. + if (success && pair->dest_try_sparse + && pair->dest_pending_sparse > 0) { + // Seek forward one byte less than the size of the pending + // hole, then write one zero-byte. This way the file grows + // to its correct size. An alternative would be to use + // ftruncate() but that isn't portable enough (e.g. it + // doesn't work with FAT on Linux; FAT isn't that important + // since it doesn't support sparse files anyway, but we don't + // want to create corrupt files on it). + if (lseek(pair->dest_fd, pair->dest_pending_sparse - 1, + SEEK_CUR) == -1) { + message_error(_("%s: Seeking failed when trying " + "to create a sparse file: %s"), + pair->dest_name, strerror(errno)); + success = false; + } else { + const uint8_t zero[1] = { '\0' }; + if (io_write_buf(pair, zero, 1)) + success = false; + } + } + signals_block(); if (success && pair->dest_fd != STDOUT_FILENO) @@ -651,11 +769,12 @@ io_close(file_pair *pair, bool success) extern size_t -io_read(file_pair *pair, uint8_t *buf, size_t size) +io_read(file_pair *pair, io_buf *buf_union, size_t size) { // We use small buffers here. assert(size < SSIZE_MAX); + uint8_t *buf = buf_union->u8; size_t left = size; while (left > 0) { @@ -691,8 +810,21 @@ io_read(file_pair *pair, uint8_t *buf, size_t size) } -extern bool -io_write(const file_pair *pair, const uint8_t *buf, size_t size) +static bool +is_sparse(const io_buf *buf) +{ + assert(IO_BUFFER_SIZE % sizeof(uint64_t) == 0); + + for (size_t i = 0; i < ARRAY_SIZE(buf->u64); ++i) + if (buf->u64[i] != 0) + return false; + + return true; +} + + +static bool +io_write_buf(file_pair *pair, const uint8_t *buf, size_t size) { assert(size < SSIZE_MAX); @@ -731,3 +863,46 @@ io_write(const file_pair *pair, const uint8_t *buf, size_t size) return false; } + + +extern bool +io_write(file_pair *pair, const io_buf *buf, size_t size) +{ + assert(size <= IO_BUFFER_SIZE); + + if (pair->dest_try_sparse) { + // Check if the block is sparse (contains only zeros). If it + // sparse, we just store the amount and return. We will take + // care of actually skipping over the hole when we hit the + // next data block or close the file. + // + // Since io_close() requires that dest_pending_sparse > 0 + // if the file ends with sparse block, we must also return + // if size == 0 to avoid doing the lseek(). + if (size == IO_BUFFER_SIZE) { + if (is_sparse(buf)) { + pair->dest_pending_sparse += size; + return false; + } + } else if (size == 0) { + return false; + } + + // This is not a sparse block. If we have a pending hole, + // skip it now. + if (pair->dest_pending_sparse > 0) { + if (lseek(pair->dest_fd, pair->dest_pending_sparse, + SEEK_CUR) == -1) { + message_error(_("%s: Seeking failed when " + "trying to create a sparse " + "file: %s"), pair->dest_name, + strerror(errno)); + return true; + } + + pair->dest_pending_sparse = 0; + } + } + + return io_write_buf(pair, buf->u8, size); +} diff --git a/src/xz/file_io.h b/src/xz/file_io.h index b0bbe11a..58bf7b5e 100644 --- a/src/xz/file_io.h +++ b/src/xz/file_io.h @@ -11,13 +11,22 @@ /////////////////////////////////////////////////////////////////////////////// // Some systems have suboptimal BUFSIZ. Use a bit bigger value on them. +// We also need that IO_BUFFER_SIZE is a multiple of 8 (sizeof(uint64_t)) #if BUFSIZ <= 1024 # define IO_BUFFER_SIZE 8192 #else -# define IO_BUFFER_SIZE BUFSIZ +# define IO_BUFFER_SIZE (BUFSIZ & ~7U) #endif +/// is_sparse() accesses the buffer as uint64_t for maximum speed. +/// Use an union to make sure that the buffer is properly aligned. +typedef union { + uint8_t u8[IO_BUFFER_SIZE]; + uint64_t u64[IO_BUFFER_SIZE / sizeof(uint64_t)]; +} io_buf; + + typedef struct { /// Name of the source filename (as given on the command line) or /// pointer to static "(stdin)" when reading from standard input. @@ -33,15 +42,24 @@ typedef struct { /// File descriptor of the target file int dest_fd; + /// True once end of the source file has been detected. + bool src_eof; + + /// If true, we look for long chunks of zeros and try to create + /// a sparse file. + bool dest_try_sparse; + + /// This is used only if dest_try_sparse is true. This holds the + /// number of zero bytes we haven't written out, because we plan + /// to make that byte range a sparse chunk. + off_t dest_pending_sparse; + /// Stat of the source file. struct stat src_st; /// Stat of the destination file. struct stat dest_st; - /// True once end of the source file has been detected. - bool src_eof; - } file_pair; @@ -49,6 +67,10 @@ typedef struct { extern void io_init(void); +/// \brief Disable creation of sparse files when decompressing +extern void io_no_sparse(void); + + /// \brief Opens a file pair extern file_pair *io_open(const char *src_name); @@ -72,7 +94,7 @@ extern void io_close(file_pair *pair, bool success); /// \return On success, number of bytes read is returned. On end of /// file zero is returned and pair->src_eof set to true. /// On error, SIZE_MAX is returned and error message printed. -extern size_t io_read(file_pair *pair, uint8_t *buf, size_t size); +extern size_t io_read(file_pair *pair, io_buf *buf, size_t size); /// \brief Writes a buffer to the destination file @@ -83,4 +105,4 @@ extern size_t io_read(file_pair *pair, uint8_t *buf, size_t size); /// /// \return On success, zero is returned. On error, -1 is returned /// and error message printed. -extern bool io_write(const file_pair *pair, const uint8_t *buf, size_t size); +extern bool io_write(file_pair *pair, const io_buf *buf, size_t size); diff --git a/src/xz/message.c b/src/xz/message.c index be7c3fac..4f8ca00d 100644 --- a/src/xz/message.c +++ b/src/xz/message.c @@ -1072,6 +1072,7 @@ message_help(bool long_help) if (long_help) puts(_( +" --no-sparse do not create sparse files when decompressing\n" " -S, --suffix=.SUF use the suffix `.SUF' on compressed files\n" " --files=[FILE] read filenames to process from FILE; if FILE is\n" " omitted, filenames are read from the standard input;\n" diff --git a/src/xz/xz.1 b/src/xz/xz.1 index b8115624..94aa562e 100644 --- a/src/xz/xz.1 +++ b/src/xz/xz.1 @@ -336,6 +336,17 @@ Write the compressed or decompressed data to standard output instead of a file. This implies .BR \-\-keep . .TP +.B \-\-no\-sparse +Disable creation of sparse files. By default, if decompressing into +a regular file, +.B xz +tries to make the file sparse if the decompressed data contains long +sequences of binary zeros. It works also when writing to standard output +as long as standard output is connected to a regular file, and certain +additional conditions are met to make it safe. Creating sparse files may +save disk space and speed up the decompression by reducing the amount of +disk I/O. +.TP \fB\-S\fR \fI.suf\fR, \fB\-\-suffix=\fI.suf When compressing, use .I .suf