xz/src/common/tuklib_mbstr_width.c

// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_mbstr_width.c
/// \brief      Calculate width of a multibyte string
//
//  Author:     Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////

#include "tuklib_mbstr.h"
#include <string.h>

#ifdef HAVE_MBRTOWC
#	include <wchar.h>
#endif


// Calculate how many columns the multibyte string str takes.
//
// str     Null-terminated string to measure
// bytes   If not NULL, *bytes is set to strlen(str). This is done
//         before anything else, so it is set even on failure.
//
// Returns the width of the string in columns. (size_t)-1 is returned if
// the string cannot be decoded with mbrtowc(), if wcwidth() reports
// a negative width for a character, or if the string doesn't end in
// the initial shift state. When mbrtowc() isn't available, the width
// is assumed to be equal to strlen(str).
extern size_t
tuklib_mbstr_width(const char *str, size_t *bytes)
{
	const size_t len = strlen(str);
	if (bytes != NULL)
		*bytes = len;

#ifdef HAVE_MBRTOWC
	// Decoding starts from an all-zero conversion state.
	mbstate_t state;
	memset(&state, 0, sizeof(state));

	size_t columns = 0;

	// Decode the string one multibyte character at a time and
	// sum up the widths of the decoded characters.
	for (size_t pos = 0; pos < len; ) {
		wchar_t wc;
		const size_t consumed = mbrtowc(&wc, str + pos,
				len - pos, &state);

		// A valid character consumes between one and len bytes.
		// Anything else, including the huge values that mbrtowc()
		// uses to signal errors, makes the whole call fail.
		if (consumed == 0 || consumed > len)
			return (size_t)-1;

		pos += consumed;

#ifdef HAVE_WCWIDTH
		// A negative width means that wcwidth() cannot tell the
		// width of this character, so the total is unknown too.
		const int char_columns = wcwidth(wc);
		if (char_columns < 0)
			return (size_t)-1;

		columns += (size_t)char_columns;
#else
		// wcwidth() is missing (for example, a native Windows
		// build), so count each multibyte character as one
		// column. With UTF-8 this is a better guess than one
		// byte == one column: many languages come out right,
		// although CJK characters will be badly off.
		++columns;
#endif
	}

	// The string must end in the initial shift state. This lets
	// the caller combine the string with other strings without
	// having to worry about shift states.
	return mbsinit(&state) ? columns : (size_t)-1;

#else
	// In single-byte mode, every byte is assumed to take one column,
	// so the width equals the length of the string.
	return len;
#endif
}
Add SPDX license identifier into 0BSD source code files. 2024-02-12 17:09:10 +02:00			`// SPDX-License-Identifier: 0BSD`

xz: Multiple fixes. The code assumed that printing numbers with thousand separators and decimal points would always produce only US-ASCII characters. This was used for buffer sizes (with snprintf(), no overflows) and aligning columns of the progress indicator and --list. That assumption was wrong (e.g. LC_ALL=fi_FI.UTF-8 with glibc), so multibyte character support was added in this commit. The old way is used if the operating system doesn't have enough multibyte support (e.g. lacks wcwidth()). The sizes of buffers were increased to accomodate multibyte characters. I don't know how big they should be exactly, but they aren't used for anything critical, so it's not too bad. If they still aren't big enough, I hopefully get a bug report. snprintf() takes care of avoiding buffer overflows. Some static buffers were replaced with buffers allocated on stack. double_to_str() was removed. uint64_to_str() and uint64_to_nicestr() now share the static buffer and test for thousand separator support. Integrity check names "None" and "Unknown-N" (2 <= N <= 15) were marked to be translated. I had forgot these, plus they wouldn't have worked correctly anyway before this commit, because printing tables with multibyte strings didn't work. Thanks to Marek Černocký for reporting the bug about misaligned table columns in --list output. 2010-09-10 10:30:33 +03:00			`///////////////////////////////////////////////////////////////////////////////`
			`//`
Fix comment typos in tuklib_mbstr* files. 2019-07-12 18:57:43 +03:00			`/// \file tuklib_mbstr_width.c`
xz: Multiple fixes. The code assumed that printing numbers with thousand separators and decimal points would always produce only US-ASCII characters. This was used for buffer sizes (with snprintf(), no overflows) and aligning columns of the progress indicator and --list. That assumption was wrong (e.g. LC_ALL=fi_FI.UTF-8 with glibc), so multibyte character support was added in this commit. The old way is used if the operating system doesn't have enough multibyte support (e.g. lacks wcwidth()). The sizes of buffers were increased to accomodate multibyte characters. I don't know how big they should be exactly, but they aren't used for anything critical, so it's not too bad. If they still aren't big enough, I hopefully get a bug report. snprintf() takes care of avoiding buffer overflows. Some static buffers were replaced with buffers allocated on stack. double_to_str() was removed. uint64_to_str() and uint64_to_nicestr() now share the static buffer and test for thousand separator support. Integrity check names "None" and "Unknown-N" (2 <= N <= 15) were marked to be translated. I had forgot these, plus they wouldn't have worked correctly anyway before this commit, because printing tables with multibyte strings didn't work. Thanks to Marek Černocký for reporting the bug about misaligned table columns in --list output. 2010-09-10 10:30:33 +03:00			`/// \brief Calculate width of a multibyte string`
			`//`
			`// Author: Lasse Collin`
			`//`
			`///////////////////////////////////////////////////////////////////////////////`

			`#include "tuklib_mbstr.h"`
Add missing include to tuklib_mbstr_width.c. It didn't matter in XZ Utils because sysdefs.h includes string.h anyway. 2019-07-12 18:30:46 +03:00			`#include <string.h>`
xz: Multiple fixes. The code assumed that printing numbers with thousand separators and decimal points would always produce only US-ASCII characters. This was used for buffer sizes (with snprintf(), no overflows) and aligning columns of the progress indicator and --list. That assumption was wrong (e.g. LC_ALL=fi_FI.UTF-8 with glibc), so multibyte character support was added in this commit. The old way is used if the operating system doesn't have enough multibyte support (e.g. lacks wcwidth()). The sizes of buffers were increased to accomodate multibyte characters. I don't know how big they should be exactly, but they aren't used for anything critical, so it's not too bad. If they still aren't big enough, I hopefully get a bug report. snprintf() takes care of avoiding buffer overflows. Some static buffers were replaced with buffers allocated on stack. double_to_str() was removed. uint64_to_str() and uint64_to_nicestr() now share the static buffer and test for thousand separator support. Integrity check names "None" and "Unknown-N" (2 <= N <= 15) were marked to be translated. I had forgot these, plus they wouldn't have worked correctly anyway before this commit, because printing tables with multibyte strings didn't work. Thanks to Marek Černocký for reporting the bug about misaligned table columns in --list output. 2010-09-10 10:30:33 +03:00
tuklib_mbstr_width: Change the behavior when wcwidth() is not available If wcwidth() isn't available (Windows), previously it was assumed that one byte == one column in the terminal. Now it is assumed that one multibyte character == one column. This works better with UTF-8. Languages that only use single-width characters without any combining characters should work correctly with this. In xz, none of po/*.po contain combining characters and only ko.po, zh_CN.po, and zh_TW.po contain fullwidth characters. Thus, "only" those three translations in xz are broken on Windows with the UTF-8 code page. Broken means that column headings in xz -lvv and (only in the master branch) strings in --long-help are misaligned, so it's not a huge problem. I don't know if those three languages displayed perfectly before the UTF-8 change because I hadn't tested translations with native Windows builds before. Fixes: 46ee0061629fb075d61d83839e14dd193337af59 2024-12-16 20:06:07 +02:00			`#ifdef HAVE_MBRTOWC`
xz: Multiple fixes. The code assumed that printing numbers with thousand separators and decimal points would always produce only US-ASCII characters. This was used for buffer sizes (with snprintf(), no overflows) and aligning columns of the progress indicator and --list. That assumption was wrong (e.g. LC_ALL=fi_FI.UTF-8 with glibc), so multibyte character support was added in this commit. The old way is used if the operating system doesn't have enough multibyte support (e.g. lacks wcwidth()). The sizes of buffers were increased to accomodate multibyte characters. I don't know how big they should be exactly, but they aren't used for anything critical, so it's not too bad. If they still aren't big enough, I hopefully get a bug report. snprintf() takes care of avoiding buffer overflows. Some static buffers were replaced with buffers allocated on stack. double_to_str() was removed. uint64_to_str() and uint64_to_nicestr() now share the static buffer and test for thousand separator support. Integrity check names "None" and "Unknown-N" (2 <= N <= 15) were marked to be translated. I had forgot these, plus they wouldn't have worked correctly anyway before this commit, because printing tables with multibyte strings didn't work. Thanks to Marek Černocký for reporting the bug about misaligned table columns in --list output. 2010-09-10 10:30:33 +03:00			`# include <wchar.h>`
			`#endif`


			`extern size_t`
			`tuklib_mbstr_width(const char *str, size_t *bytes)`
			`{`
			`const size_t len = strlen(str);`
			`if (bytes != NULL)`
			`*bytes = len;`

tuklib_mbstr_width: Change the behavior when wcwidth() is not available If wcwidth() isn't available (Windows), previously it was assumed that one byte == one column in the terminal. Now it is assumed that one multibyte character == one column. This works better with UTF-8. Languages that only use single-width characters without any combining characters should work correctly with this. In xz, none of po/*.po contain combining characters and only ko.po, zh_CN.po, and zh_TW.po contain fullwidth characters. Thus, "only" those three translations in xz are broken on Windows with the UTF-8 code page. Broken means that column headings in xz -lvv and (only in the master branch) strings in --long-help are misaligned, so it's not a huge problem. I don't know if those three languages displayed perfectly before the UTF-8 change because I hadn't tested translations with native Windows builds before. Fixes: 46ee0061629fb075d61d83839e14dd193337af59 2024-12-16 20:06:07 +02:00			`#ifndef HAVE_MBRTOWC`
xz: Multiple fixes. The code assumed that printing numbers with thousand separators and decimal points would always produce only US-ASCII characters. This was used for buffer sizes (with snprintf(), no overflows) and aligning columns of the progress indicator and --list. That assumption was wrong (e.g. LC_ALL=fi_FI.UTF-8 with glibc), so multibyte character support was added in this commit. The old way is used if the operating system doesn't have enough multibyte support (e.g. lacks wcwidth()). The sizes of buffers were increased to accomodate multibyte characters. I don't know how big they should be exactly, but they aren't used for anything critical, so it's not too bad. If they still aren't big enough, I hopefully get a bug report. snprintf() takes care of avoiding buffer overflows. Some static buffers were replaced with buffers allocated on stack. double_to_str() was removed. uint64_to_str() and uint64_to_nicestr() now share the static buffer and test for thousand separator support. Integrity check names "None" and "Unknown-N" (2 <= N <= 15) were marked to be translated. I had forgot these, plus they wouldn't have worked correctly anyway before this commit, because printing tables with multibyte strings didn't work. Thanks to Marek Černocký for reporting the bug about misaligned table columns in --list output. 2010-09-10 10:30:33 +03:00			`// In single-byte mode, the width of the string is the same`
			`// as its length.`
			`return len;`

			`#else`
			`mbstate_t state;`
			`memset(&state, 0, sizeof(state));`

			`size_t width = 0;`
			`size_t i = 0;`

			`// Convert one multibyte character at a time to wchar_t`
			`// and get its width using wcwidth().`
			`while (i < len) {`
			`wchar_t wc;`
			`const size_t ret = mbrtowc(&wc, str + i, len - i, &state);`
			`if (ret < 1 || ret > len)`
			`return (size_t)-1;`

			`i += ret;`

tuklib_mbstr_width: Change the behavior when wcwidth() is not available If wcwidth() isn't available (Windows), previously it was assumed that one byte == one column in the terminal. Now it is assumed that one multibyte character == one column. This works better with UTF-8. Languages that only use single-width characters without any combining characters should work correctly with this. In xz, none of po/*.po contain combining characters and only ko.po, zh_CN.po, and zh_TW.po contain fullwidth characters. Thus, "only" those three translations in xz are broken on Windows with the UTF-8 code page. Broken means that column headings in xz -lvv and (only in the master branch) strings in --long-help are misaligned, so it's not a huge problem. I don't know if those three languages displayed perfectly before the UTF-8 change because I hadn't tested translations with native Windows builds before. Fixes: 46ee0061629fb075d61d83839e14dd193337af59 2024-12-16 20:06:07 +02:00			`#ifdef HAVE_WCWIDTH`
xz: Multiple fixes. The code assumed that printing numbers with thousand separators and decimal points would always produce only US-ASCII characters. This was used for buffer sizes (with snprintf(), no overflows) and aligning columns of the progress indicator and --list. That assumption was wrong (e.g. LC_ALL=fi_FI.UTF-8 with glibc), so multibyte character support was added in this commit. The old way is used if the operating system doesn't have enough multibyte support (e.g. lacks wcwidth()). The sizes of buffers were increased to accomodate multibyte characters. I don't know how big they should be exactly, but they aren't used for anything critical, so it's not too bad. If they still aren't big enough, I hopefully get a bug report. snprintf() takes care of avoiding buffer overflows. Some static buffers were replaced with buffers allocated on stack. double_to_str() was removed. uint64_to_str() and uint64_to_nicestr() now share the static buffer and test for thousand separator support. Integrity check names "None" and "Unknown-N" (2 <= N <= 15) were marked to be translated. I had forgot these, plus they wouldn't have worked correctly anyway before this commit, because printing tables with multibyte strings didn't work. Thanks to Marek Černocký for reporting the bug about misaligned table columns in --list output. 2010-09-10 10:30:33 +03:00			`const int wc_width = wcwidth(wc);`
			`if (wc_width < 0)`
			`return (size_t)-1;`

tuklib_mbstr_width: Fix a warning from -Wsign-conversion. 2019-06-23 23:22:45 +03:00			`width += (size_t)wc_width;`
tuklib_mbstr_width: Change the behavior when wcwidth() is not available If wcwidth() isn't available (Windows), previously it was assumed that one byte == one column in the terminal. Now it is assumed that one multibyte character == one column. This works better with UTF-8. Languages that only use single-width characters without any combining characters should work correctly with this. In xz, none of po/*.po contain combining characters and only ko.po, zh_CN.po, and zh_TW.po contain fullwidth characters. Thus, "only" those three translations in xz are broken on Windows with the UTF-8 code page. Broken means that column headings in xz -lvv and (only in the master branch) strings in --long-help are misaligned, so it's not a huge problem. I don't know if those three languages displayed perfectly before the UTF-8 change because I hadn't tested translations with native Windows builds before. Fixes: 46ee0061629fb075d61d83839e14dd193337af59 2024-12-16 20:06:07 +02:00			`#else`
			`// Without wcwidth() (like in a native Windows build),`
			`// assume that one multibyte char == one column. With`
			`// UTF-8, this is less bad than one byte == one column.`
			`// This way quite a few languages will be handled correctly`
			`// in practice; CJK chars will be very wrong though.`
			`++width;`
			`#endif`
xz: Multiple fixes. The code assumed that printing numbers with thousand separators and decimal points would always produce only US-ASCII characters. This was used for buffer sizes (with snprintf(), no overflows) and aligning columns of the progress indicator and --list. That assumption was wrong (e.g. LC_ALL=fi_FI.UTF-8 with glibc), so multibyte character support was added in this commit. The old way is used if the operating system doesn't have enough multibyte support (e.g. lacks wcwidth()). The sizes of buffers were increased to accomodate multibyte characters. I don't know how big they should be exactly, but they aren't used for anything critical, so it's not too bad. If they still aren't big enough, I hopefully get a bug report. snprintf() takes care of avoiding buffer overflows. Some static buffers were replaced with buffers allocated on stack. double_to_str() was removed. uint64_to_str() and uint64_to_nicestr() now share the static buffer and test for thousand separator support. Integrity check names "None" and "Unknown-N" (2 <= N <= 15) were marked to be translated. I had forgot these, plus they wouldn't have worked correctly anyway before this commit, because printing tables with multibyte strings didn't work. Thanks to Marek Černocký for reporting the bug about misaligned table columns in --list output. 2010-09-10 10:30:33 +03:00			`}`

			`// Require that the string ends in the initial shift state.`
			`// This way the caller can combine the string with other`
			`// strings without needing to worry about the shift states.`
			`if (!mbsinit(&state))`
			`return (size_t)-1;`

			`return width;`
			`#endif`
			`}`