Add tuklib_mbstr_wrap for automatic word wrapping

Automatic word wrapping makes translators' work easier and reduces errors like misaligned columns or overlong lines. Right-to-left languages and languages that don't use spaces between words will still need extra effort. (xz hasn't been translated to any RTL language so far.)
2025-04-02 05:40:58 +00:00 · 2024-12-16 18:43:52 +02:00 · 2024-12-16 18:43:52 +02:00 · ca529c3f41
commit ca529c3f41
parent 314b83ceba
6 changed files with 506 additions and 1 deletions
--- a/cmake/tuklib_mbstr.cmake
+++ b/cmake/tuklib_mbstr.cmake
@ -18,4 +18,8 @@ function(tuklib_mbstr TARGET_OR_ALL)
    # NOTE: wcwidth() requires _GNU_SOURCE or _XOPEN_SOURCE on GNU/Linux.
    check_symbol_exists(wcwidth wchar.h HAVE_WCWIDTH)
    tuklib_add_definition_if("${TARGET_OR_ALL}" HAVE_WCWIDTH)
    # NOTE: vasprintf() requires _GNU_SOURCE on GNU/Linux.
    check_symbol_exists(vasprintf stdio.h HAVE_VASPRINTF)
    tuklib_add_definition_if("${TARGET_OR_ALL}" HAVE_VASPRINTF)
 endfunction()
--- a/m4/tuklib_mbstr.m4
+++ b/m4/tuklib_mbstr.m4
@ -27,5 +27,5 @@
 AC_DEFUN_ONCE([TUKLIB_MBSTR], [
 AC_REQUIRE([TUKLIB_COMMON])
 AC_FUNC_MBRTOWC
-AC_CHECK_FUNCS([wcwidth])
+AC_CHECK_FUNCS([wcwidth vasprintf])
 ])dnl
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -30,6 +30,8 @@ EXTRA_DIST = \
 	common/tuklib_mbstr.h \
 	common/tuklib_mbstr_fw.c \
 	common/tuklib_mbstr_width.c \
 	common/tuklib_mbstr_wrap.c \
 	common/tuklib_mbstr_wrap.h \
 	common/tuklib_open_stdxxx.c \
 	common/tuklib_open_stdxxx.h \
 	common/tuklib_physmem.c \
--- a/src/common/tuklib_gettext.h
+++ b/src/common/tuklib_gettext.h
@ -68,4 +68,15 @@
 #endif
 #define N_(msgid) msgid
 // Optional: Strings that are word wrapped using tuklib_mbstr_wrap may be
 // marked with W_("foo) in the source code. xgettext can then add a comment
 // to all such strings to inform translators. The following option needs to
 // be added to XGETTEXT_OPTIONS in po/Makevars or in an equivalent place:
 //
 // '--keyword=W_:1,"This is word wrapped at spaces. The Unicode character U+00A0 works as a non-breaking space. Tab (\t) is interpret as a zero-width space (the tab itself is not displayed); U+200B is NOT supported. Manual word wrapping with \n is supported but requires care."'
 //
 // NOTE: The double-quotes in the --keyword argument above must be passed to
 // xgettext as is, thus one needs the single-quotes in Makevars.
 #define W_(msgid) _(msgid)
 #endif
--- a/src/common/tuklib_mbstr_wrap.c
+++ b/src/common/tuklib_mbstr_wrap.c
@ -0,0 +1,285 @@
 // SPDX-License-Identifier: 0BSD
 ///////////////////////////////////////////////////////////////////////////////
 //
 /// \file       tuklib_mbstr_wrap.c
 /// \brief      Word wraps a string and prints it to a FILE stream
 ///
 /// This depends on tuklib_mbstr_width.c.
 //
 //  Author:     Lasse Collin
 //
 ///////////////////////////////////////////////////////////////////////////////
 #include "tuklib_mbstr.h"
 #include "tuklib_mbstr_wrap.h"
 #include <stdarg.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 extern int
 tuklib_wraps(FILE *outfile, const struct tuklib_wrap_opt *opt, const char *str)
 {
 	// left_cont may be less than left_margin. In that case, if the first
 	// word is extremely long, it will stay on the first line even if
 	// the line then gets overlong.
 	//
 	// On the other hand, left2_cont < left2_margin isn't allowed because
 	// it could result in inconsistent behavior when a very long word
 	// comes right after a \v.
 	//
 	// It is fine to have left2_margin < left_margin although it would be
 	// an odd use case.
 	if (!(opt->left_margin < opt->right_margin
 			&& opt->left_cont < opt->right_margin
 			&& opt->left2_margin <= opt->left2_cont
 			&& opt->left2_cont < opt->right_margin))
 		return TUKLIB_WRAP_ERR_OPT;
 	// This is set to TUKLIB_WRAP_WARN_OVERLONG if one or more
 	// output lines extend past opt->right_margin columns.
 	int warn_overlong = 0;
 	// Indentation of the first output line after \n or \r.
 	// \v sets this to opt->left2_margin.
 	// \r resets this back to the original value.
 	size_t first_indent = opt->left_margin;
 	// Indentation of the output lines that occur due to word wrapping.
 	// \v sets this to opt->left2_cont and \r back to the original value.
 	size_t cont_indent = opt->left_cont;
 	// If word wrapping occurs, the newline isn't printed unless more
 	// text would be put on the continuation line. This is also used
 	// when \v needs to start on a new line.
 	bool pending_newline = false;
 	// Spaces are printed only when there is something else to put
 	// after the spaces on the line. This avoids unwanted empty lines
 	// in the output and makes it possible to ignore possible spaces
 	// before a \v character.
 	size_t pending_spaces = first_indent;
 	// Current output column. When cur_col == pending_spaces, nothing
 	// has been actually printed to the current output line.
 	size_t cur_col = pending_spaces;
 	while (true) {
 		// Number of bytes until the *next* line-break opportunity.
 		size_t len = 0;
 		// Number of columns until the *next* line-break opportunity.
 		size_t width = 0;
 		// Text between a pair of \b characters is treated as
 		// an unbreakable block even if it contains spaces.
 		// It must not contain any control characters before
 		// the closing \b.
 		bool unbreakable = false;
 		while (true) {
 			// Find the next character that we handle specially.
 			// In an unbreakable block, search only for the
 			// closing \b; if missing, the unbreakable block
 			// extends to the end of the string.
 			const size_t n = strcspn(str + len,
 					unbreakable ? "\b" : " \t\n\r\v\b");
 			// Calculate how many columns the characters need.
 			const size_t w = tuklib_mbstr_width_mem(str + len, n);
 			if (w == (size_t)-1)
 				return TUKLIB_WRAP_ERR_STR;
 			width += w;
 			len += n;
 			// \b isn't a line-break opportunity so it has to
 			// be handled here. For simplicity, empty blocks
 			// are treated as zero-width characters.
 			if (str[len] == '\b') {
 				++len;
 				unbreakable = !unbreakable;
 				continue;
 			}
 			break;
 		}
 		// Determine if adding this chunk of text would make the
 		// current output line exceed opt->right_margin columns.
 		const bool too_long = cur_col + width > opt->right_margin;
 		// Wrap the line if needed. However:
 		//
 		//   - Don't wrap if the current column is less than where
 		//     the continuation line would begin. In that case
 		//     the chunk wouldn't fit on the next line either so
 		//     we just have to produce an overlong line.
 		//
 		//   - Don't wrap if so far the line only contains spaces.
 		//     Wrapping in that case would leave a weird empty line.
 		//     NOTE: This "only contains spaces" condition is the
 		//     reason why left2_margin > left2_cont isn't allowed.
 		if (too_long && cur_col > cont_indent
 				&& cur_col > pending_spaces) {
 			// There might be trailing spaces or zero-width spaces
 			// which need to be ignored to keep the output pretty.
 			//
 			// Spaces need to be ignored because in some
 			// writing styles there are two spaces after
 			// a full stop. Example string:
 			//
 			//     "Foo bar.  Abc def."
 			//              ^
 			// If the first space after the first full stop
 			// triggers word wrapping, both spaces must be
 			// ignored. Otherwise the next line would be
 			// indented too much.
 			//
 			// Zero-width spaces are ignored the same way
 			// because they are meaningless if an adjacent
 			// character is a space.
 			while (*str == ' ' || *str == '\t')
 				++str;
 			// Don't print the newline here; only mark it as
 			// pending. This avoids an unwanted empty line if
 			// there is a \n or \r or \0 after the spaces have
 			// been ignored.
 			pending_newline = true;
 			pending_spaces = cont_indent;
 			cur_col = pending_spaces;
 			// Since str may have been incremented due to the
 			// ignored spaces, the loop needs to be restarted.
 			continue;
 		}
 		// Print the current chunk of text before the next
 		// line-break opportunity. If the chunk was empty,
 		// don't print anything so that the pending newline
 		// and pending spaces aren't printed on their own.
 		if (len > 0) {
 			if (pending_newline) {
 				pending_newline = false;
 				if (putc('\n', outfile) == EOF)
 					return TUKLIB_WRAP_ERR_IO;
 			}
 			while (pending_spaces > 0) {
 				if (putc(' ', outfile) == EOF)
 					return TUKLIB_WRAP_ERR_IO;
 				--pending_spaces;
 			}
 			for (size_t i = 0; i < len; ++i) {
 				// Ignore unbreakable block characters (\b).
 				const int c = (unsigned char)str[i];
 				if (c != '\b' && putc(c, outfile) == EOF)
 					return TUKLIB_WRAP_ERR_IO;
 			}
 			str += len;
 			cur_col += width;
 			// Remember if the line got overlong. If no other
 			// errors occur, we return warn_overlong. It might
 			// help in catching problematic strings.
 			if (too_long)
 				warn_overlong = TUKLIB_WRAP_WARN_OVERLONG;
 		}
 		// Handle the special character after the chunk of text.
 		switch (*str) {
 		case ' ':
 			// Regular space.
 			++cur_col;
 			++pending_spaces;
 			break;
 		case '\v':
 			// Set the alternative indentation settings.
 			first_indent = opt->left2_margin;
 			cont_indent = opt->left2_cont;
 			if (first_indent > cur_col) {
 				// Add one or more spaces to reach
 				// the column specified in first_indent.
 				pending_spaces += first_indent - cur_col;
 			} else {
 				// There is no room to add even one space
 				// before reaching the column first_indent.
 				pending_newline = true;
 				pending_spaces = first_indent;
 			}
 			cur_col = first_indent;
 			break;
 		case '\0': // Implicit newline at the end of the string.
 		case '\r': // Newline that also resets the effect of \v.
 		case '\n': // Newline without resetting the indentation mode.
 			if (putc('\n', outfile) == EOF)
 				return TUKLIB_WRAP_ERR_IO;
 			if (*str == '\0')
 				return warn_overlong;
 			if (*str == '\r') {
 				first_indent = opt->left_margin;
 				cont_indent = opt->left_cont;
 			}
 			pending_newline = false;
 			pending_spaces = first_indent;
 			cur_col = first_indent;
 			break;
 		}
 		// Skip the specially-handled character.
 		++str;
 	}
 }
 extern int
 tuklib_wrapf(FILE *stream, const struct tuklib_wrap_opt *opt,
 		const char *fmt, ...)
 {
 	va_list ap;
 	char *buf;
 #ifdef HAVE_VASPRINTF
 	va_start(ap, fmt);
 	const int n = vasprintf(&buf, fmt, ap);
 	va_end(ap);
 	if (n == -1)
 		return TUKLIB_WRAP_ERR_FORMAT;
 #else
 	// Fixed buffer size is dumb but in practice one shouldn't need
 	// huge strings for *formatted* output. This simple method is safe
 	// with pre-C99 vsnprintf() implementations too which don't return
 	// the required buffer size (they return -1 or buf_size - 1) or
 	// which might not null-terminate the buffer in case it's too small.
 	const size_t buf_size = 128 * 1024;
 	buf = malloc(buf_size);
 	if (buf == NULL)
 		return TUKLIB_WRAP_ERR_FORMAT;
 	va_start(ap, fmt);
 	const int n = vsnprintf(buf, buf_size, fmt, ap);
 	va_end(ap);
 	if (n <= 0 || n >= (int)(buf_size - 1)) {
 		free(buf);
 		return TUKLIB_WRAP_ERR_FORMAT;
 	}
 #endif
 	const int ret = tuklib_wraps(stream, opt, buf);
 	free(buf);
 	return ret;
 }
--- a/src/common/tuklib_mbstr_wrap.h
+++ b/src/common/tuklib_mbstr_wrap.h
@ -0,0 +1,203 @@
 // SPDX-License-Identifier: 0BSD
 ///////////////////////////////////////////////////////////////////////////////
 //
 /// \file       tuklib_mbstr_wrap.h
 /// \brief      Word wrapping for multibyte strings
 ///
 /// The word wrapping functions are intended to be usable, for example,
 /// for printing --help text in command line tools. While manually-wrapped
 /// --help text allows precise formatting, such freedom requires translators
 /// to count spaces and determine where line breaks should occur. It's
 /// tedious and error prone, and experience has shown that only some
 /// translators do it well. Automatic word wrapping is less flexible but
 /// results in polished-enough look with less effort from everyone.
 /// Right-to-left languages and languages that don't use spaces between
 /// words will still need extra effort though.
 //
 //  Author:     Lasse Collin
 //
 ///////////////////////////////////////////////////////////////////////////////
 #ifndef TUKLIB_MBSTR_WRAP_H
 #define TUKLIB_MBSTR_WRAP_H
 #include "tuklib_common.h"
 #include <stdio.h>
 TUKLIB_DECLS_BEGIN
 /// One or more output lines exceeded right_margin.
 /// This only a warning; everything was still printed successfully.
 #define TUKLIB_WRAP_WARN_OVERLONG   0x01
 /// Error writing to to the output FILE. The error flag in the FILE
 /// should have been set as well.
 #define TUKLIB_WRAP_ERR_IO          0x02
 /// Invalid options in struct tuklib_wrap_opt.
 /// Nothing was printed.
 #define TUKLIB_WRAP_ERR_OPT         0x04
 /// Invalid or unsupported multibyte character in the input string:
 /// either mbrtowc() failed or wcwidth() returned a negative value.
 #define TUKLIB_WRAP_ERR_STR         0x08
 /// Only tuklib_wrapf(): Error in converting the format string.
 /// It's either a memory allocation failure or something bad with the
 /// format string or arguments.
 #define TUKLIB_WRAP_ERR_FORMAT      0x10
 /// Options for tuklib_wraps() and tuklib_wrapf()
 struct tuklib_wrap_opt {
 	/// Indentation of the first output line after `\n` or `\r`.
 	/// This can be anything less than right_margin.
 	unsigned short left_margin;
 	/// Column where word-wrapped continuation lines start.
 	/// This can be anything less than right_margin.
 	unsigned short left_cont;
 	/// Column where the text after `\v` will start, either on the current
 	/// line (when there is room to add at least one space) or on a new
 	/// empty line.
 	unsigned short left2_margin;
 	/// Like left_cont but for text after a `\v`. However, this must
 	/// be greater than or equal to left2_margin in addition to being
 	/// less than right_margin.
 	unsigned short left2_cont;
 	/// For 80-column terminals, it is recommended to use 79 here for
 	/// maximum portability. 80 will work most of the time but it will
 	/// result in unwanted empty lines in the rare case where a terminal
 	/// moves the cursor to the beginning of the next line immediately
 	/// when the last column has been used.
 	unsigned short right_margin;
 };
 #define tuklib_wraps TUKLIB_SYMBOL(tuklib_wraps)
 extern int tuklib_wraps(FILE *stream, const struct tuklib_wrap_opt *opt,
 		const char *str);
 ///<
 /// \brief      Word wrap a multibyte string and write it to a FILE
 ///
 /// Word wrapping is done only at spaces and at the special control characters
 /// described below. Multiple consecutive spaces are handled properly: strings
 /// that have two (or more) spaces after a full sentence will look good even
 /// when the spaces occur at a word wrapping boundary. Trailing spaces are
 /// ignored at the end of a line or at the end of a string.
 ///
 /// The following control characters have been repurposed:
 ///
 ///   - `\t` = Zero-width space allows a line break without producing any
 ///            output by itself. This can be useful after hard hyphens as
 ///            hyphens aren't otherwise used for line breaking. This can also
 ///            be useful in languages that don't use spaces between words.
 ///            (The Unicode character U+200B isn't supported.)
 ///   - `\b` = Text between a pair of `\b` characters is treated as an
 ///            unbreakable block (not wrapped even if there are spaces).
 ///            For example, a non-breaking space can be done like
 ///            in `"123\b \bMiB"`. Control characters (like `\n` or `\t`)
 ///            aren't allowed before the closing `\b`. If closing `\b` is
 ///            missing, the block extends to the end of the string. Empty
 ///            blocks are treated as zero-width characters. If line breaks
 ///            are possible around an empty block (like in `"foo \b\b bar"`
 ///            or `"foo \b"`), it can result in weird output.
 ///   - `\v` = Change to alternative indentation (left2_margin).
 ///   - `\r` = Reset back to the initial indentation and add a newline.
 ///            The next line will be indented by left_margin.
 ///   - `\n` = Add a newline without resetting the effect of `\v`. The
 ///            next line will be indented by left_margin or left2_margin
 ///            (not left_cont or left2_cont).
 ///
 /// Only `\n` should appear in translatable strings. `\t` works too but
 /// even that might confuse some translators even if there is a TRANSLATORS
 /// comment explaining its meaning.
 ///
 /// To use the other control characters in messages, one should use
 /// tuklib_wrapf() with appropriate printf format string to combine
 /// translatable strings with non-translatable portions. For example:
 ///
 /// \code{.c}
 /// static const struct tuklib_wrap_opt wrap2 = { 2,  2, 22, 22, 79 };
 /// int e = 0;
 /// ...
 /// e |= tuklib_wrapf(stdout, &wrap2,
 ///                   "-h, --help\v%s\r"
 ///                   "    --version\v%s",
 ///                   W_("display this help and exit"),
 ///                   W_("display version information and exit"));
 /// ...
 /// if (e != 0) {
 ///     // Handle warning or error.
 ///     ...
 /// }
 /// \endcode
 ///
 /// Control characters other than `\n` and `\t` are unusable in
 /// translatable strings:
 ///
 ///   - Gettext tools show annoying warnings if C escape sequences other
 ///     than `\n` or `\t` are seen. (Otherwise they still work perfectly
 ///     fine though.)
 ///
 ///   - While at least Poedit and Lokalize support all escapes, some
 ///     editors only support `\n` and `\t`.
 ///
 ///   - They could confuse some translators, resulting in broken
 ///     translations.
 ///
 /// Using non-control characters would solve some issues but it wouldn't
 /// help with the unfortunate real-world issue that some translators would
 /// likely have trouble understanding a new syntax. The Gettext manual
 /// specifically warns about this, see the subheading "No unusual markup"
 /// in `info (gettext)Preparing Strings`. (While using `\t` for zero-width
 /// space is such custom markup, most translators will never need it.)
 ///
 /// Translators can use the Unicode character U+00A0 (or U+202F) if they
 /// need a non-breaking space. For example, in French a non-breaking space
 /// may be needed before colons and question marks (U+00A0 is common in
 /// real-world French PO files).
 ///
 /// Using a non-ASCII char in a string in the C code (like `"123\u00A0MiB"`)
 /// can work if one tells xgettext that input encoding is UTF-8, one
 /// ensures that the C compiler uses UTF-8 as the input charset, and one
 /// is certain that the program is *always* run under an UTF-8 locale.
 /// Unfortunately a portable program cannot make this kind of assumptions,
 /// which means that there is no pretty way to have a non-breaking space in
 /// a translatable string.
 ///
 /// Optional: To tell translators which strings are automatically word
 /// wrapped, see the macro `W_` in tuklib_gettext.h.
 ///
 /// \param      stream      Output FILE stream. For decent performance, it
 ///                         should be in buffered mode because this function
 ///                         writes the output one byte at a time with fputc().
 /// \param      opt         Word wrapping options.
 /// \param      str         Null-terminated multibyte string that is in
 ///                         the encoding used by the current locale.
 ///
 /// \return     Returns 0 on success. If an error or warning occurs, one of
 ///             TUKLIB_WRAP_* codes is returned. Those codes are powers
 ///             of two. When warning/error detection can be delayed, the
 ///             return values can be accumulated from multiple calls using
 ///             bitwise-or into a single variable which can be checked after
 ///             all strings have (hopefully) been printed.
 #define tuklib_wrapf TUKLIB_SYMBOL(tuklib_wrapf)
 extern int tuklib_wrapf(FILE *stream, const struct tuklib_wrap_opt *opt,
 		const char *fmt, ...);
 ///<
 /// \brief      Format and word-wrap a multibyte string and write it to a FILE
 ///
 /// This is like tuklib_wraps() except that this takes a printf
 /// format string.
 ///
 /// \note       On platforms that lack vasprintf(), the intermediate
 ///             result from vsnprintf() must fit into a 128 KiB buffer.
 ///             TUKLIB_WRAP_ERR_FORMAT is returned if it doesn't but
 ///             only on platforms that lack vasprintf().
 TUKLIB_DECLS_END
 #endif