From b797c44c42ea54fe1c52722a2fca0c9618575598 Mon Sep 17 00:00:00 2001
From: Lasse Collin
Date: Mon, 16 Dec 2024 20:06:07 +0200
Subject: [PATCH] tuklib_mbstr_width: Change the behavior when wcwidth() is not available

If wcwidth() isn't available (Windows), previously it was assumed
that one byte == one column in the terminal. Now it is assumed that
one multibyte character == one column. This works better with UTF-8.
Languages that only use single-width characters without any combining
characters should work correctly with this.

In xz, none of po/*.po contain combining characters and only ko.po,
zh_CN.po, and zh_TW.po contain fullwidth characters. Thus, "only"
those three translations in xz are broken on Windows with the
UTF-8 code page. Broken means that column headings in xz -lvv and
(only in the master branch) strings in --long-help are misaligned,
so it's not a huge problem. I don't know if those three languages
displayed perfectly before the UTF-8 change because I hadn't tested
translations with native Windows builds before.

Fixes: 46ee0061629fb075d61d83839e14dd193337af59
---
 src/common/tuklib_mbstr_width.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/common/tuklib_mbstr_width.c b/src/common/tuklib_mbstr_width.c
index 7a8bf070..3c63dd1a 100644
--- a/src/common/tuklib_mbstr_width.c
+++ b/src/common/tuklib_mbstr_width.c
@@ -12,7 +12,7 @@
 #include "tuklib_mbstr.h"
 #include <string.h>
 
-#if defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH)
+#ifdef HAVE_MBRTOWC
 #	include <wchar.h>
 #endif
 
@@ -24,7 +24,7 @@ tuklib_mbstr_width(const char *str, size_t *bytes)
 	if (bytes != NULL)
 		*bytes = len;
 
-#if !(defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH))
+#ifndef HAVE_MBRTOWC
 	// In single-byte mode, the width of the string is the same
 	// as its length.
 	return len;
@@ -46,11 +46,20 @@ tuklib_mbstr_width(const char *str, size_t *bytes)
 
 		i += ret;
 
+#ifdef HAVE_WCWIDTH
 		const int wc_width = wcwidth(wc);
 		if (wc_width < 0)
 			return (size_t)-1;
 
 		width += (size_t)wc_width;
+#else
+		// Without wcwidth() (like in a native Windows build),
+		// assume that one multibyte char == one column. With
+		// UTF-8, this is less bad than one byte == one column.
+		// This way quite a few languages will be handled correctly
+		// in practice; CJK chars will be very wrong though.
+		++width;
+#endif
 	}
 
 	// Require that the string ends in the initial shift state.