From ca529c3f41a4a19a59e2e252e6dd9255f130c634 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Mon, 16 Dec 2024 18:43:52 +0200 Subject: [PATCH] Add tuklib_mbstr_wrap for automatic word wrapping Automatic word wrapping makes translators' work easier and reduces errors like misaligned columns or overlong lines. Right-to-left languages and languages that don't use spaces between words will still need extra effort. (xz hasn't been translated to any RTL language so far.) --- cmake/tuklib_mbstr.cmake | 4 + m4/tuklib_mbstr.m4 | 2 +- src/Makefile.am | 2 + src/common/tuklib_gettext.h | 11 ++ src/common/tuklib_mbstr_wrap.c | 285 +++++++++++++++++++++++++++++++++ src/common/tuklib_mbstr_wrap.h | 203 +++++++++++++++++++++++ 6 files changed, 506 insertions(+), 1 deletion(-) create mode 100644 src/common/tuklib_mbstr_wrap.c create mode 100644 src/common/tuklib_mbstr_wrap.h diff --git a/cmake/tuklib_mbstr.cmake b/cmake/tuklib_mbstr.cmake index 71e16cc5..bd234cbc 100644 --- a/cmake/tuklib_mbstr.cmake +++ b/cmake/tuklib_mbstr.cmake @@ -18,4 +18,8 @@ function(tuklib_mbstr TARGET_OR_ALL) # NOTE: wcwidth() requires _GNU_SOURCE or _XOPEN_SOURCE on GNU/Linux. check_symbol_exists(wcwidth wchar.h HAVE_WCWIDTH) tuklib_add_definition_if("${TARGET_OR_ALL}" HAVE_WCWIDTH) + + # NOTE: vasprintf() requires _GNU_SOURCE on GNU/Linux. + check_symbol_exists(vasprintf stdio.h HAVE_VASPRINTF) + tuklib_add_definition_if("${TARGET_OR_ALL}" HAVE_VASPRINTF) endfunction() diff --git a/m4/tuklib_mbstr.m4 b/m4/tuklib_mbstr.m4 index 01398347..98b1d7a3 100644 --- a/m4/tuklib_mbstr.m4 +++ b/m4/tuklib_mbstr.m4 @@ -27,5 +27,5 @@ AC_DEFUN_ONCE([TUKLIB_MBSTR], [ AC_REQUIRE([TUKLIB_COMMON]) AC_FUNC_MBRTOWC -AC_CHECK_FUNCS([wcwidth]) +AC_CHECK_FUNCS([wcwidth vasprintf]) ])dnl diff --git a/src/Makefile.am b/src/Makefile.am index e1fcb5a9..cb5aed40 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -30,6 +30,8 @@ EXTRA_DIST = \ common/tuklib_mbstr.h \ common/tuklib_mbstr_fw.c \ common/tuklib_mbstr_width.c \ + common/tuklib_mbstr_wrap.c \ + common/tuklib_mbstr_wrap.h \ common/tuklib_open_stdxxx.c \ common/tuklib_open_stdxxx.h \ common/tuklib_physmem.c \ diff --git a/src/common/tuklib_gettext.h b/src/common/tuklib_gettext.h index 4021c98f..2ee91cb2 100644 --- a/src/common/tuklib_gettext.h +++ b/src/common/tuklib_gettext.h @@ -68,4 +68,15 @@ #endif #define N_(msgid) msgid +// Optional: Strings that are word wrapped using tuklib_mbstr_wrap may be +// marked with W_("foo) in the source code. xgettext can then add a comment +// to all such strings to inform translators. The following option needs to +// be added to XGETTEXT_OPTIONS in po/Makevars or in an equivalent place: +// +// '--keyword=W_:1,"This is word wrapped at spaces. The Unicode character U+00A0 works as a non-breaking space. Tab (\t) is interpret as a zero-width space (the tab itself is not displayed); U+200B is NOT supported. Manual word wrapping with \n is supported but requires care."' +// +// NOTE: The double-quotes in the --keyword argument above must be passed to +// xgettext as is, thus one needs the single-quotes in Makevars. +#define W_(msgid) _(msgid) + #endif diff --git a/src/common/tuklib_mbstr_wrap.c b/src/common/tuklib_mbstr_wrap.c new file mode 100644 index 00000000..4cc559d2 --- /dev/null +++ b/src/common/tuklib_mbstr_wrap.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: 0BSD + +/////////////////////////////////////////////////////////////////////////////// +// +/// \file tuklib_mbstr_wrap.c +/// \brief Word wraps a string and prints it to a FILE stream +/// +/// This depends on tuklib_mbstr_width.c. +// +// Author: Lasse Collin +// +/////////////////////////////////////////////////////////////////////////////// + +#include "tuklib_mbstr.h" +#include "tuklib_mbstr_wrap.h" +#include +#include +#include +#include + + +extern int +tuklib_wraps(FILE *outfile, const struct tuklib_wrap_opt *opt, const char *str) +{ + // left_cont may be less than left_margin. In that case, if the first + // word is extremely long, it will stay on the first line even if + // the line then gets overlong. + // + // On the other hand, left2_cont < left2_margin isn't allowed because + // it could result in inconsistent behavior when a very long word + // comes right after a \v. + // + // It is fine to have left2_margin < left_margin although it would be + // an odd use case. + if (!(opt->left_margin < opt->right_margin + && opt->left_cont < opt->right_margin + && opt->left2_margin <= opt->left2_cont + && opt->left2_cont < opt->right_margin)) + return TUKLIB_WRAP_ERR_OPT; + + // This is set to TUKLIB_WRAP_WARN_OVERLONG if one or more + // output lines extend past opt->right_margin columns. + int warn_overlong = 0; + + // Indentation of the first output line after \n or \r. + // \v sets this to opt->left2_margin. + // \r resets this back to the original value. + size_t first_indent = opt->left_margin; + + // Indentation of the output lines that occur due to word wrapping. + // \v sets this to opt->left2_cont and \r back to the original value. + size_t cont_indent = opt->left_cont; + + // If word wrapping occurs, the newline isn't printed unless more + // text would be put on the continuation line. This is also used + // when \v needs to start on a new line. + bool pending_newline = false; + + // Spaces are printed only when there is something else to put + // after the spaces on the line. This avoids unwanted empty lines + // in the output and makes it possible to ignore possible spaces + // before a \v character. + size_t pending_spaces = first_indent; + + // Current output column. When cur_col == pending_spaces, nothing + // has been actually printed to the current output line. + size_t cur_col = pending_spaces; + + while (true) { + // Number of bytes until the *next* line-break opportunity. + size_t len = 0; + + // Number of columns until the *next* line-break opportunity. + size_t width = 0; + + // Text between a pair of \b characters is treated as + // an unbreakable block even if it contains spaces. + // It must not contain any control characters before + // the closing \b. + bool unbreakable = false; + + while (true) { + // Find the next character that we handle specially. + // In an unbreakable block, search only for the + // closing \b; if missing, the unbreakable block + // extends to the end of the string. + const size_t n = strcspn(str + len, + unbreakable ? "\b" : " \t\n\r\v\b"); + + // Calculate how many columns the characters need. + const size_t w = tuklib_mbstr_width_mem(str + len, n); + if (w == (size_t)-1) + return TUKLIB_WRAP_ERR_STR; + + width += w; + len += n; + + // \b isn't a line-break opportunity so it has to + // be handled here. For simplicity, empty blocks + // are treated as zero-width characters. + if (str[len] == '\b') { + ++len; + unbreakable = !unbreakable; + continue; + } + + break; + } + + // Determine if adding this chunk of text would make the + // current output line exceed opt->right_margin columns. + const bool too_long = cur_col + width > opt->right_margin; + + // Wrap the line if needed. However: + // + // - Don't wrap if the current column is less than where + // the continuation line would begin. In that case + // the chunk wouldn't fit on the next line either so + // we just have to produce an overlong line. + // + // - Don't wrap if so far the line only contains spaces. + // Wrapping in that case would leave a weird empty line. + // NOTE: This "only contains spaces" condition is the + // reason why left2_margin > left2_cont isn't allowed. + if (too_long && cur_col > cont_indent + && cur_col > pending_spaces) { + // There might be trailing spaces or zero-width spaces + // which need to be ignored to keep the output pretty. + // + // Spaces need to be ignored because in some + // writing styles there are two spaces after + // a full stop. Example string: + // + // "Foo bar. Abc def." + // ^ + // If the first space after the first full stop + // triggers word wrapping, both spaces must be + // ignored. Otherwise the next line would be + // indented too much. + // + // Zero-width spaces are ignored the same way + // because they are meaningless if an adjacent + // character is a space. + while (*str == ' ' || *str == '\t') + ++str; + + // Don't print the newline here; only mark it as + // pending. This avoids an unwanted empty line if + // there is a \n or \r or \0 after the spaces have + // been ignored. + pending_newline = true; + pending_spaces = cont_indent; + cur_col = pending_spaces; + + // Since str may have been incremented due to the + // ignored spaces, the loop needs to be restarted. + continue; + } + + // Print the current chunk of text before the next + // line-break opportunity. If the chunk was empty, + // don't print anything so that the pending newline + // and pending spaces aren't printed on their own. + if (len > 0) { + if (pending_newline) { + pending_newline = false; + if (putc('\n', outfile) == EOF) + return TUKLIB_WRAP_ERR_IO; + } + + while (pending_spaces > 0) { + if (putc(' ', outfile) == EOF) + return TUKLIB_WRAP_ERR_IO; + + --pending_spaces; + } + + for (size_t i = 0; i < len; ++i) { + // Ignore unbreakable block characters (\b). + const int c = (unsigned char)str[i]; + if (c != '\b' && putc(c, outfile) == EOF) + return TUKLIB_WRAP_ERR_IO; + } + + str += len; + cur_col += width; + + // Remember if the line got overlong. If no other + // errors occur, we return warn_overlong. It might + // help in catching problematic strings. + if (too_long) + warn_overlong = TUKLIB_WRAP_WARN_OVERLONG; + } + + // Handle the special character after the chunk of text. + switch (*str) { + case ' ': + // Regular space. + ++cur_col; + ++pending_spaces; + break; + + case '\v': + // Set the alternative indentation settings. + first_indent = opt->left2_margin; + cont_indent = opt->left2_cont; + + if (first_indent > cur_col) { + // Add one or more spaces to reach + // the column specified in first_indent. + pending_spaces += first_indent - cur_col; + } else { + // There is no room to add even one space + // before reaching the column first_indent. + pending_newline = true; + pending_spaces = first_indent; + } + + cur_col = first_indent; + break; + + case '\0': // Implicit newline at the end of the string. + case '\r': // Newline that also resets the effect of \v. + case '\n': // Newline without resetting the indentation mode. + if (putc('\n', outfile) == EOF) + return TUKLIB_WRAP_ERR_IO; + + if (*str == '\0') + return warn_overlong; + + if (*str == '\r') { + first_indent = opt->left_margin; + cont_indent = opt->left_cont; + } + + pending_newline = false; + pending_spaces = first_indent; + cur_col = first_indent; + break; + } + + // Skip the specially-handled character. + ++str; + } +} + + +extern int +tuklib_wrapf(FILE *stream, const struct tuklib_wrap_opt *opt, + const char *fmt, ...) +{ + va_list ap; + char *buf; + +#ifdef HAVE_VASPRINTF + va_start(ap, fmt); + const int n = vasprintf(&buf, fmt, ap); + va_end(ap); + if (n == -1) + return TUKLIB_WRAP_ERR_FORMAT; +#else + // Fixed buffer size is dumb but in practice one shouldn't need + // huge strings for *formatted* output. This simple method is safe + // with pre-C99 vsnprintf() implementations too which don't return + // the required buffer size (they return -1 or buf_size - 1) or + // which might not null-terminate the buffer in case it's too small. + const size_t buf_size = 128 * 1024; + buf = malloc(buf_size); + if (buf == NULL) + return TUKLIB_WRAP_ERR_FORMAT; + + va_start(ap, fmt); + const int n = vsnprintf(buf, buf_size, fmt, ap); + va_end(ap); + + if (n <= 0 || n >= (int)(buf_size - 1)) { + free(buf); + return TUKLIB_WRAP_ERR_FORMAT; + } +#endif + + const int ret = tuklib_wraps(stream, opt, buf); + free(buf); + return ret; +} diff --git a/src/common/tuklib_mbstr_wrap.h b/src/common/tuklib_mbstr_wrap.h new file mode 100644 index 00000000..e20ffdaf --- /dev/null +++ b/src/common/tuklib_mbstr_wrap.h @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: 0BSD + +/////////////////////////////////////////////////////////////////////////////// +// +/// \file tuklib_mbstr_wrap.h +/// \brief Word wrapping for multibyte strings +/// +/// The word wrapping functions are intended to be usable, for example, +/// for printing --help text in command line tools. While manually-wrapped +/// --help text allows precise formatting, such freedom requires translators +/// to count spaces and determine where line breaks should occur. It's +/// tedious and error prone, and experience has shown that only some +/// translators do it well. Automatic word wrapping is less flexible but +/// results in polished-enough look with less effort from everyone. +/// Right-to-left languages and languages that don't use spaces between +/// words will still need extra effort though. +// +// Author: Lasse Collin +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef TUKLIB_MBSTR_WRAP_H +#define TUKLIB_MBSTR_WRAP_H + +#include "tuklib_common.h" +#include + +TUKLIB_DECLS_BEGIN + +/// One or more output lines exceeded right_margin. +/// This only a warning; everything was still printed successfully. +#define TUKLIB_WRAP_WARN_OVERLONG 0x01 + +/// Error writing to to the output FILE. The error flag in the FILE +/// should have been set as well. +#define TUKLIB_WRAP_ERR_IO 0x02 + +/// Invalid options in struct tuklib_wrap_opt. +/// Nothing was printed. +#define TUKLIB_WRAP_ERR_OPT 0x04 + +/// Invalid or unsupported multibyte character in the input string: +/// either mbrtowc() failed or wcwidth() returned a negative value. +#define TUKLIB_WRAP_ERR_STR 0x08 + +/// Only tuklib_wrapf(): Error in converting the format string. +/// It's either a memory allocation failure or something bad with the +/// format string or arguments. +#define TUKLIB_WRAP_ERR_FORMAT 0x10 + +/// Options for tuklib_wraps() and tuklib_wrapf() +struct tuklib_wrap_opt { + /// Indentation of the first output line after `\n` or `\r`. + /// This can be anything less than right_margin. + unsigned short left_margin; + + /// Column where word-wrapped continuation lines start. + /// This can be anything less than right_margin. + unsigned short left_cont; + + /// Column where the text after `\v` will start, either on the current + /// line (when there is room to add at least one space) or on a new + /// empty line. + unsigned short left2_margin; + + /// Like left_cont but for text after a `\v`. However, this must + /// be greater than or equal to left2_margin in addition to being + /// less than right_margin. + unsigned short left2_cont; + + /// For 80-column terminals, it is recommended to use 79 here for + /// maximum portability. 80 will work most of the time but it will + /// result in unwanted empty lines in the rare case where a terminal + /// moves the cursor to the beginning of the next line immediately + /// when the last column has been used. + unsigned short right_margin; +}; + +#define tuklib_wraps TUKLIB_SYMBOL(tuklib_wraps) +extern int tuklib_wraps(FILE *stream, const struct tuklib_wrap_opt *opt, + const char *str); +///< +/// \brief Word wrap a multibyte string and write it to a FILE +/// +/// Word wrapping is done only at spaces and at the special control characters +/// described below. Multiple consecutive spaces are handled properly: strings +/// that have two (or more) spaces after a full sentence will look good even +/// when the spaces occur at a word wrapping boundary. Trailing spaces are +/// ignored at the end of a line or at the end of a string. +/// +/// The following control characters have been repurposed: +/// +/// - `\t` = Zero-width space allows a line break without producing any +/// output by itself. This can be useful after hard hyphens as +/// hyphens aren't otherwise used for line breaking. This can also +/// be useful in languages that don't use spaces between words. +/// (The Unicode character U+200B isn't supported.) +/// - `\b` = Text between a pair of `\b` characters is treated as an +/// unbreakable block (not wrapped even if there are spaces). +/// For example, a non-breaking space can be done like +/// in `"123\b \bMiB"`. Control characters (like `\n` or `\t`) +/// aren't allowed before the closing `\b`. If closing `\b` is +/// missing, the block extends to the end of the string. Empty +/// blocks are treated as zero-width characters. If line breaks +/// are possible around an empty block (like in `"foo \b\b bar"` +/// or `"foo \b"`), it can result in weird output. +/// - `\v` = Change to alternative indentation (left2_margin). +/// - `\r` = Reset back to the initial indentation and add a newline. +/// The next line will be indented by left_margin. +/// - `\n` = Add a newline without resetting the effect of `\v`. The +/// next line will be indented by left_margin or left2_margin +/// (not left_cont or left2_cont). +/// +/// Only `\n` should appear in translatable strings. `\t` works too but +/// even that might confuse some translators even if there is a TRANSLATORS +/// comment explaining its meaning. +/// +/// To use the other control characters in messages, one should use +/// tuklib_wrapf() with appropriate printf format string to combine +/// translatable strings with non-translatable portions. For example: +/// +/// \code{.c} +/// static const struct tuklib_wrap_opt wrap2 = { 2, 2, 22, 22, 79 }; +/// int e = 0; +/// ... +/// e |= tuklib_wrapf(stdout, &wrap2, +/// "-h, --help\v%s\r" +/// " --version\v%s", +/// W_("display this help and exit"), +/// W_("display version information and exit")); +/// ... +/// if (e != 0) { +/// // Handle warning or error. +/// ... +/// } +/// \endcode +/// +/// Control characters other than `\n` and `\t` are unusable in +/// translatable strings: +/// +/// - Gettext tools show annoying warnings if C escape sequences other +/// than `\n` or `\t` are seen. (Otherwise they still work perfectly +/// fine though.) +/// +/// - While at least Poedit and Lokalize support all escapes, some +/// editors only support `\n` and `\t`. +/// +/// - They could confuse some translators, resulting in broken +/// translations. +/// +/// Using non-control characters would solve some issues but it wouldn't +/// help with the unfortunate real-world issue that some translators would +/// likely have trouble understanding a new syntax. The Gettext manual +/// specifically warns about this, see the subheading "No unusual markup" +/// in `info (gettext)Preparing Strings`. (While using `\t` for zero-width +/// space is such custom markup, most translators will never need it.) +/// +/// Translators can use the Unicode character U+00A0 (or U+202F) if they +/// need a non-breaking space. For example, in French a non-breaking space +/// may be needed before colons and question marks (U+00A0 is common in +/// real-world French PO files). +/// +/// Using a non-ASCII char in a string in the C code (like `"123\u00A0MiB"`) +/// can work if one tells xgettext that input encoding is UTF-8, one +/// ensures that the C compiler uses UTF-8 as the input charset, and one +/// is certain that the program is *always* run under an UTF-8 locale. +/// Unfortunately a portable program cannot make this kind of assumptions, +/// which means that there is no pretty way to have a non-breaking space in +/// a translatable string. +/// +/// Optional: To tell translators which strings are automatically word +/// wrapped, see the macro `W_` in tuklib_gettext.h. +/// +/// \param stream Output FILE stream. For decent performance, it +/// should be in buffered mode because this function +/// writes the output one byte at a time with fputc(). +/// \param opt Word wrapping options. +/// \param str Null-terminated multibyte string that is in +/// the encoding used by the current locale. +/// +/// \return Returns 0 on success. If an error or warning occurs, one of +/// TUKLIB_WRAP_* codes is returned. Those codes are powers +/// of two. When warning/error detection can be delayed, the +/// return values can be accumulated from multiple calls using +/// bitwise-or into a single variable which can be checked after +/// all strings have (hopefully) been printed. + +#define tuklib_wrapf TUKLIB_SYMBOL(tuklib_wrapf) +extern int tuklib_wrapf(FILE *stream, const struct tuklib_wrap_opt *opt, + const char *fmt, ...); +///< +/// \brief Format and word-wrap a multibyte string and write it to a FILE +/// +/// This is like tuklib_wraps() except that this takes a printf +/// format string. +/// +/// \note On platforms that lack vasprintf(), the intermediate +/// result from vsnprintf() must fit into a 128 KiB buffer. +/// TUKLIB_WRAP_ERR_FORMAT is returned if it doesn't but +/// only on platforms that lack vasprintf(). + +TUKLIB_DECLS_END +#endif