Add tuklib_mbstr_wrap for automatic word wrapping

Automatic word wrapping makes translators' work easier and reduces
errors like misaligned columns or overlong lines. Right-to-left
languages and languages that don't use spaces between words will
still need extra effort. (xz hasn't been translated to any RTL
language so far.)
This commit is contained in:
Lasse Collin 2024-12-16 18:43:52 +02:00
parent 314b83ceba
commit ca529c3f41
No known key found for this signature in database
GPG Key ID: 38EE757D69184620
6 changed files with 506 additions and 1 deletions

View File

@ -18,4 +18,8 @@ function(tuklib_mbstr TARGET_OR_ALL)
# NOTE: wcwidth() requires _GNU_SOURCE or _XOPEN_SOURCE on GNU/Linux. # NOTE: wcwidth() requires _GNU_SOURCE or _XOPEN_SOURCE on GNU/Linux.
check_symbol_exists(wcwidth wchar.h HAVE_WCWIDTH) check_symbol_exists(wcwidth wchar.h HAVE_WCWIDTH)
tuklib_add_definition_if("${TARGET_OR_ALL}" HAVE_WCWIDTH) tuklib_add_definition_if("${TARGET_OR_ALL}" HAVE_WCWIDTH)
# NOTE: vasprintf() requires _GNU_SOURCE on GNU/Linux.
check_symbol_exists(vasprintf stdio.h HAVE_VASPRINTF)
tuklib_add_definition_if("${TARGET_OR_ALL}" HAVE_VASPRINTF)
endfunction() endfunction()

View File

@ -27,5 +27,5 @@
AC_DEFUN_ONCE([TUKLIB_MBSTR], [ AC_DEFUN_ONCE([TUKLIB_MBSTR], [
AC_REQUIRE([TUKLIB_COMMON]) AC_REQUIRE([TUKLIB_COMMON])
AC_FUNC_MBRTOWC AC_FUNC_MBRTOWC
AC_CHECK_FUNCS([wcwidth]) AC_CHECK_FUNCS([wcwidth vasprintf])
])dnl ])dnl

View File

@ -30,6 +30,8 @@ EXTRA_DIST = \
common/tuklib_mbstr.h \ common/tuklib_mbstr.h \
common/tuklib_mbstr_fw.c \ common/tuklib_mbstr_fw.c \
common/tuklib_mbstr_width.c \ common/tuklib_mbstr_width.c \
common/tuklib_mbstr_wrap.c \
common/tuklib_mbstr_wrap.h \
common/tuklib_open_stdxxx.c \ common/tuklib_open_stdxxx.c \
common/tuklib_open_stdxxx.h \ common/tuklib_open_stdxxx.h \
common/tuklib_physmem.c \ common/tuklib_physmem.c \

View File

@ -68,4 +68,15 @@
#endif #endif
#define N_(msgid) msgid #define N_(msgid) msgid
// Optional: Strings that are word wrapped using tuklib_mbstr_wrap may be
// marked with W_("foo") in the source code. xgettext can then add a comment
// to all such strings to inform translators. The following option needs to
// be added to XGETTEXT_OPTIONS in po/Makevars or in an equivalent place:
//
// '--keyword=W_:1,"This is word wrapped at spaces. The Unicode character U+00A0 works as a non-breaking space. Tab (\t) is interpreted as a zero-width space (the tab itself is not displayed); U+200B is NOT supported. Manual word wrapping with \n is supported but requires care."'
//
// NOTE: The double-quotes in the --keyword argument above must be passed to
// xgettext as is, thus one needs the single-quotes in Makevars.
#define W_(msgid) _(msgid)
#endif #endif

View File

@ -0,0 +1,285 @@
// SPDX-License-Identifier: 0BSD
///////////////////////////////////////////////////////////////////////////////
//
/// \file tuklib_mbstr_wrap.c
/// \brief Word wraps a string and prints it to a FILE stream
///
/// This depends on tuklib_mbstr_width.c.
//
// Author: Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////
#include "tuklib_mbstr.h"
#include "tuklib_mbstr_wrap.h"
#include <stdarg.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
// Word wraps str and writes the result to outfile one byte at a time.
//
// The input is consumed as a sequence of "chunks": runs of text that end
// at the next line-break opportunity (space, \t, \n, \r, \v, or the end
// of the string). For each chunk the function decides whether to wrap
// first, then prints the chunk, and finally acts on the delimiter that
// terminated it.
//
// Returns 0 or TUKLIB_WRAP_WARN_OVERLONG when everything was printed,
// or one of TUKLIB_WRAP_ERR_OPT, TUKLIB_WRAP_ERR_STR, TUKLIB_WRAP_ERR_IO.
extern int
tuklib_wraps(FILE *outfile, const struct tuklib_wrap_opt *opt, const char *str)
{
	// Sanity check the options before printing anything:
	//
	//   - Every indentation has to be to the left of right_margin.
	//
	//   - left_cont < left_margin is accepted. If the very first word
	//     is extremely long, it then stays on the first line and the
	//     line simply becomes overlong.
	//
	//   - left2_cont < left2_margin is rejected because a very long
	//     word right after a \v could then behave inconsistently.
	//
	//   - left2_margin < left_margin is unusual but harmless.
	const bool opt_ok = opt->left_margin < opt->right_margin
			&& opt->left_cont < opt->right_margin
			&& opt->left2_margin <= opt->left2_cont
			&& opt->left2_cont < opt->right_margin;
	if (!opt_ok)
		return TUKLIB_WRAP_ERR_OPT;

	// Value returned on success. It becomes TUKLIB_WRAP_WARN_OVERLONG
	// as soon as one output line extends past opt->right_margin.
	int result = 0;

	// Indentation used for a line that begins after \n or \r. A \v
	// switches it to opt->left2_margin and \r restores the initial value.
	size_t indent_first = opt->left_margin;

	// Indentation used for lines created by word wrapping. A \v
	// switches it to opt->left2_cont and \r restores the initial value.
	size_t indent_cont = opt->left_cont;

	// A line break caused by wrapping (or by a \v that has to begin on
	// a new line) is only remembered here. It is written out when more
	// text actually follows.
	bool newline_owed = false;

	// Spaces are buffered as a count and written only when something
	// else is going to follow them on the same line. This prevents
	// lines that consist of spaces only and allows spaces in front of
	// a \v to be dropped.
	size_t spaces_owed = indent_first;

	// Column where the next character would land. As long as
	// column == spaces_owed, the current output line is still empty.
	size_t column = spaces_owed;

	for (;;) {
		// Size of the text before the *next* line-break opportunity,
		// in bytes and in columns.
		size_t chunk_len = 0;
		size_t chunk_width = 0;

		// Measure the chunk. A pair of \b characters encloses an
		// unbreakable block: while inside such a block only the
		// closing \b is searched for, so spaces in it don't end the
		// chunk. A block that isn't closed runs to the end of the
		// string. Each \b flips in_block and is counted as part of
		// the chunk; empty blocks thus behave as zero-width text.
		for (bool in_block = false; ; in_block = !in_block) {
			const char *stops = in_block ? "\b" : " \t\n\r\v\b";
			const size_t span = strcspn(str + chunk_len, stops);

			const size_t cols = tuklib_mbstr_width_mem(
					str + chunk_len, span);
			if (cols == (size_t)-1)
				return TUKLIB_WRAP_ERR_STR;

			chunk_len += span;
			chunk_width += cols;

			// Anything other than \b is a real line-break
			// opportunity (or the end of the string).
			if (str[chunk_len] != '\b')
				break;

			++chunk_len;
		}

		// Would the chunk push the line past opt->right_margin?
		const bool overflows
				= column + chunk_width > opt->right_margin;

		// Wrap if the chunk doesn't fit, with two exceptions:
		//
		//   - If the current column isn't to the right of where the
		//     continuation line would start, the chunk cannot fit
		//     on the next line either. An overlong line is the only
		//     option.
		//
		//   - If nothing except spaces has been put on the line,
		//     wrapping would only create a strange empty line.
		//     (This rule is why left2_margin > left2_cont is
		//     rejected by the option check.)
		if (overflows && column > indent_cont
				&& column > spaces_owed) {
			// Drop spaces and zero-width spaces (\t) at the
			// wrapping point. For instance, with two spaces
			// after a full stop ("Foo bar.  Abc def."), a wrap
			// triggered by the first space must swallow the
			// second one too or the next line would be indented
			// too much. A zero-width space next to a space is
			// meaningless and is dropped the same way.
			str += strspn(str, " \t");

			// Only record that a newline is needed. If \n, \r,
			// or \0 comes next, printing it now would result in
			// an unwanted empty line.
			newline_owed = true;
			spaces_owed = indent_cont;
			column = indent_cont;

			// str may have moved, so measure again from there.
			continue;
		}

		// Output the chunk. Empty chunks are skipped entirely so
		// that the owed newline and spaces aren't printed alone.
		if (chunk_len != 0) {
			if (newline_owed) {
				newline_owed = false;
				if (putc('\n', outfile) == EOF)
					return TUKLIB_WRAP_ERR_IO;
			}

			for (; spaces_owed > 0; --spaces_owed) {
				if (putc(' ', outfile) == EOF)
					return TUKLIB_WRAP_ERR_IO;
			}

			// Copy the bytes of the chunk but leave out the \b
			// markers of unbreakable blocks.
			const char *const chunk_end = str + chunk_len;
			for (; str != chunk_end; ++str) {
				const int ch = (unsigned char)*str;
				if (ch == '\b')
					continue;

				if (putc(ch, outfile) == EOF)
					return TUKLIB_WRAP_ERR_IO;
			}

			column += chunk_width;

			// The chunk was printed even though it didn't fit.
			// Report it via the return value (unless a real
			// error occurs later) to help spot bad strings.
			if (overflows)
				result = TUKLIB_WRAP_WARN_OVERLONG;
		}

		// Act on the character that terminated the chunk. A \t
		// (zero-width space) matches none of the branches: it is
		// only a line-break opportunity and is skipped below.
		const char delim = *str;

		if (delim == ' ') {
			// Ordinary space: buffer it.
			++column;
			++spaces_owed;

		} else if (delim == '\v') {
			// Switch to the alternative indentation.
			indent_first = opt->left2_margin;
			indent_cont = opt->left2_cont;

			if (indent_first > column) {
				// Pad with at least one space so that the
				// text continues at column indent_first.
				spaces_owed += indent_first - column;
			} else {
				// The line has already reached indent_first
				// so there is no room for a separating
				// space; continue on a new line instead.
				newline_owed = true;
				spaces_owed = indent_first;
			}

			column = indent_first;

		} else if (delim == '\0' || delim == '\r' || delim == '\n') {
			// \n and \r end the line explicitly and the end of
			// the string does it implicitly.
			if (putc('\n', outfile) == EOF)
				return TUKLIB_WRAP_ERR_IO;

			if (delim == '\0')
				return result;

			// \r additionally cancels the effect of \v.
			if (delim == '\r') {
				indent_first = opt->left_margin;
				indent_cont = opt->left_cont;
			}

			newline_owed = false;
			spaces_owed = indent_first;
			column = indent_first;
		}

		// Step over the delimiter.
		++str;
	}
}
// Formats the arguments according to the printf format string fmt into
// a temporary heap buffer and passes the result to tuklib_wraps().
//
// Returns TUKLIB_WRAP_ERR_FORMAT if formatting or memory allocation
// fails; otherwise returns whatever tuklib_wraps() returns. The temporary
// buffer is always freed before returning.
extern int
tuklib_wrapf(FILE *stream, const struct tuklib_wrap_opt *opt,
		const char *fmt, ...)
{
	va_list ap;
	char *buf;

#ifdef HAVE_VASPRINTF
	va_start(ap, fmt);
	const int n = vasprintf(&buf, fmt, ap);
	va_end(ap);

	// vasprintf() reports failure with a negative return value (-1).
	// buf must not be used or freed in that case.
	if (n < 0)
		return TUKLIB_WRAP_ERR_FORMAT;
#else
	// Fixed buffer size is dumb but in practice one shouldn't need
	// huge strings for *formatted* output. This simple method is safe
	// with pre-C99 vsnprintf() implementations too which don't return
	// the required buffer size (they return -1 or buf_size - 1) or
	// which might not null-terminate the buffer in case it's too small.
	const size_t buf_size = 128 * 1024;
	buf = malloc(buf_size);
	if (buf == NULL)
		return TUKLIB_WRAP_ERR_FORMAT;

	// Start with an empty string so that the buffer is a valid
	// string even if vsnprintf() produces zero bytes of output.
	buf[0] = '\0';

	va_start(ap, fmt);
	const int n = vsnprintf(buf, buf_size, fmt, ap);
	va_end(ap);

	// A negative value means an error (or truncation with pre-C99
	// implementations). A value of buf_size - 1 or greater means that
	// the output was, or might have been, truncated.
	//
	// n == 0 (empty output) is valid: it is accepted here so that
	// the behavior matches the vasprintf() code path above, where
	// an empty string results in an empty output line.
	if (n < 0 || n >= (int)(buf_size - 1)) {
		free(buf);
		return TUKLIB_WRAP_ERR_FORMAT;
	}
#endif

	const int ret = tuklib_wraps(stream, opt, buf);
	free(buf);
	return ret;
}

View File

@ -0,0 +1,203 @@
// SPDX-License-Identifier: 0BSD
///////////////////////////////////////////////////////////////////////////////
//
/// \file tuklib_mbstr_wrap.h
/// \brief Word wrapping for multibyte strings
///
/// The word wrapping functions are intended to be usable, for example,
/// for printing --help text in command line tools. While manually-wrapped
/// --help text allows precise formatting, such freedom requires translators
/// to count spaces and determine where line breaks should occur. It's
/// tedious and error prone, and experience has shown that only some
/// translators do it well. Automatic word wrapping is less flexible but
/// results in polished-enough look with less effort from everyone.
/// Right-to-left languages and languages that don't use spaces between
/// words will still need extra effort though.
//
// Author: Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////
#ifndef TUKLIB_MBSTR_WRAP_H
#define TUKLIB_MBSTR_WRAP_H
#include "tuklib_common.h"
#include <stdio.h>
TUKLIB_DECLS_BEGIN
/// One or more output lines exceeded right_margin.
/// This is only a warning; everything was still printed successfully.
#define TUKLIB_WRAP_WARN_OVERLONG 0x01
/// Error writing to the output FILE. The error flag in the FILE
/// should have been set as well.
#define TUKLIB_WRAP_ERR_IO 0x02
/// Invalid options in struct tuklib_wrap_opt.
/// Nothing was printed.
#define TUKLIB_WRAP_ERR_OPT 0x04
/// Invalid or unsupported multibyte character in the input string:
/// either mbrtowc() failed or wcwidth() returned a negative value.
#define TUKLIB_WRAP_ERR_STR 0x08
/// Only tuklib_wrapf(): Error in converting the format string.
/// It's either a memory allocation failure or something bad with the
/// format string or arguments.
#define TUKLIB_WRAP_ERR_FORMAT 0x10
/// Options for tuklib_wraps() and tuklib_wrapf()
struct tuklib_wrap_opt {
/// Indentation of the first output line after `\n` or `\r`.
/// This can be anything less than right_margin.
unsigned short left_margin;
/// Column where word-wrapped continuation lines start.
/// This can be anything less than right_margin.
unsigned short left_cont;
/// Column where the text after `\v` will start, either on the current
/// line (when there is room to add at least one space) or on a new
/// empty line.
unsigned short left2_margin;
/// Like left_cont but for text after a `\v`. However, this must
/// be greater than or equal to left2_margin in addition to being
/// less than right_margin.
unsigned short left2_cont;
/// For 80-column terminals, it is recommended to use 79 here for
/// maximum portability. 80 will work most of the time but it will
/// result in unwanted empty lines in the rare case where a terminal
/// moves the cursor to the beginning of the next line immediately
/// when the last column has been used.
unsigned short right_margin;
};
#define tuklib_wraps TUKLIB_SYMBOL(tuklib_wraps)
extern int tuklib_wraps(FILE *stream, const struct tuklib_wrap_opt *opt,
const char *str);
///<
/// \brief Word wrap a multibyte string and write it to a FILE
///
/// Word wrapping is done only at spaces and at the special control characters
/// described below. Multiple consecutive spaces are handled properly: strings
/// that have two (or more) spaces after a full sentence will look good even
/// when the spaces occur at a word wrapping boundary. Trailing spaces are
/// ignored at the end of a line or at the end of a string.
///
/// The following control characters have been repurposed:
///
/// - `\t` = Zero-width space allows a line break without producing any
/// output by itself. This can be useful after hard hyphens as
/// hyphens aren't otherwise used for line breaking. This can also
/// be useful in languages that don't use spaces between words.
/// (The Unicode character U+200B isn't supported.)
/// - `\b` = Text between a pair of `\b` characters is treated as an
/// unbreakable block (not wrapped even if there are spaces).
/// For example, a non-breaking space can be done like
/// in `"123\b \bMiB"`. Control characters (like `\n` or `\t`)
/// aren't allowed before the closing `\b`. If closing `\b` is
/// missing, the block extends to the end of the string. Empty
/// blocks are treated as zero-width characters. If line breaks
/// are possible around an empty block (like in `"foo \b\b bar"`
/// or `"foo \b"`), it can result in weird output.
/// - `\v` = Change to alternative indentation (left2_margin).
/// - `\r` = Reset back to the initial indentation and add a newline.
/// The next line will be indented by left_margin.
/// - `\n` = Add a newline without resetting the effect of `\v`. The
/// next line will be indented by left_margin or left2_margin
/// (not left_cont or left2_cont).
///
/// Only `\n` should appear in translatable strings. `\t` works too but
/// even that might confuse some translators even if there is a TRANSLATORS
/// comment explaining its meaning.
///
/// To use the other control characters in messages, one should use
/// tuklib_wrapf() with appropriate printf format string to combine
/// translatable strings with non-translatable portions. For example:
///
/// \code{.c}
/// static const struct tuklib_wrap_opt wrap2 = { 2, 2, 22, 22, 79 };
/// int e = 0;
/// ...
/// e |= tuklib_wrapf(stdout, &wrap2,
/// "-h, --help\v%s\r"
/// " --version\v%s",
/// W_("display this help and exit"),
/// W_("display version information and exit"));
/// ...
/// if (e != 0) {
/// // Handle warning or error.
/// ...
/// }
/// \endcode
///
/// Control characters other than `\n` and `\t` are unusable in
/// translatable strings:
///
/// - Gettext tools show annoying warnings if C escape sequences other
/// than `\n` or `\t` are seen. (Otherwise they still work perfectly
/// fine though.)
///
/// - While at least Poedit and Lokalize support all escapes, some
/// editors only support `\n` and `\t`.
///
/// - They could confuse some translators, resulting in broken
/// translations.
///
/// Using non-control characters would solve some issues but it wouldn't
/// help with the unfortunate real-world issue that some translators would
/// likely have trouble understanding a new syntax. The Gettext manual
/// specifically warns about this, see the subheading "No unusual markup"
/// in `info (gettext)Preparing Strings`. (While using `\t` for zero-width
/// space is such custom markup, most translators will never need it.)
///
/// Translators can use the Unicode character U+00A0 (or U+202F) if they
/// need a non-breaking space. For example, in French a non-breaking space
/// may be needed before colons and question marks (U+00A0 is common in
/// real-world French PO files).
///
/// Using a non-ASCII char in a string in the C code (like `"123\u00A0MiB"`)
/// can work if one tells xgettext that input encoding is UTF-8, one
/// ensures that the C compiler uses UTF-8 as the input charset, and one
/// is certain that the program is *always* run under a UTF-8 locale.
/// Unfortunately a portable program cannot make these kinds of assumptions,
/// which means that there is no pretty way to have a non-breaking space in
/// a translatable string.
///
/// Optional: To tell translators which strings are automatically word
/// wrapped, see the macro `W_` in tuklib_gettext.h.
///
/// \param stream Output FILE stream. For decent performance, it
/// should be in buffered mode because this function
/// writes the output one byte at a time with putc().
/// \param opt Word wrapping options.
/// \param str Null-terminated multibyte string that is in
/// the encoding used by the current locale.
///
/// \return Returns 0 on success. If an error or warning occurs, one of
/// TUKLIB_WRAP_* codes is returned. Those codes are powers
/// of two. When warning/error detection can be delayed, the
/// return values can be accumulated from multiple calls using
/// bitwise-or into a single variable which can be checked after
/// all strings have (hopefully) been printed.
#define tuklib_wrapf TUKLIB_SYMBOL(tuklib_wrapf)
extern int tuklib_wrapf(FILE *stream, const struct tuklib_wrap_opt *opt,
const char *fmt, ...);
///<
/// \brief Format and word-wrap a multibyte string and write it to a FILE
///
/// This is like tuklib_wraps() except that this takes a printf
/// format string.
///
/// \note On platforms that lack vasprintf(), the intermediate
/// result from vsnprintf() must fit into a 128 KiB buffer.
/// TUKLIB_WRAP_ERR_FORMAT is returned if it doesn't but
/// only on platforms that lack vasprintf().
TUKLIB_DECLS_END
#endif