mirror of
				https://git.tukaani.org/xz.git
				synced 2025-11-03 23:12:57 +00:00 
			
		
		
		
	A typical use case is like this:
    printf("%s: %s\n", tuklib_mask_nonprint(filename), strerror(errno));
tuklib_mask_nonprint() may call mbrtowc() and malloc() which may modify
errno. If errno isn't preserved, the error message might be wrong if
a compiler decides to call tuklib_mask_nonprint() before strerror().
Fixes: 40e573305535960574404d2eae848b248c95ea7e
		
	
			
		
			
				
	
	
		
			163 lines
		
	
	
		
			3.8 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			163 lines
		
	
	
		
			3.8 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
// SPDX-License-Identifier: 0BSD
 | 
						|
 | 
						|
///////////////////////////////////////////////////////////////////////////////
 | 
						|
//
 | 
						|
/// \file       tuklib_mbstr_nonprint.c
 | 
						|
/// \brief      Find and replace non-printable characters with question marks
 | 
						|
//
 | 
						|
//  Author:     Lasse Collin
 | 
						|
//
 | 
						|
///////////////////////////////////////////////////////////////////////////////
 | 
						|
 | 
						|
#include "tuklib_mbstr_nonprint.h"
 | 
						|
#include <stdlib.h>
 | 
						|
#include <string.h>
 | 
						|
#include <errno.h>
 | 
						|
 | 
						|
#ifdef HAVE_MBRTOWC
 | 
						|
#	include <wchar.h>
 | 
						|
#	include <wctype.h>
 | 
						|
#else
 | 
						|
#	include <ctype.h>
 | 
						|
#endif
 | 
						|
 | 
						|
 | 
						|
// Examine the character that begins at str and report whether it is
// printable.
//
// str       Start of the character to examine.
// len       Number of bytes available at str.
// next_len  Receives the number of bytes the caller should skip to
//           reach the following character. It is set on every path.
//
// Returns true if the character is printable, false otherwise.
static bool
is_next_printable(const char *str, size_t len, size_t *next_len)
{
#ifdef HAVE_MBRTOWC
	// A zeroed conversion state is created for every call. Character
	// sets with locking shift states are assumed not to be in use,
	// which is why mbsinit() is never consulted.
	mbstate_t state;
	memset(&state, 0, sizeof(state));

	wchar_t ch;
	const size_t consumed = mbrtowc(&ch, str, len, &state);

	if (consumed == (size_t)-2) {
		// The input ends in the middle of a multibyte sequence.
		// Count the whole remainder as one non-printable character
		// that terminates the string.
		*next_len = len;
		return false;
	}

	// Deliberately broader than a comparison against (size_t)-1:
	// anything outside 1..len is rejected in case mbrtowc() returns
	// something unexpected. (size_t)-1 is SIZE_MAX and is therefore
	// always greater than len, because len comes from strlen() and
	// does not count the terminating '\0'.
	if (consumed == 0 || consumed > len) {
		// Invalid sequence: report only the first byte as
		// a non-printable single-byte character so that the next
		// call restarts decoding from the following byte.
		*next_len = 1;
		return false;
	}

	*next_len = consumed;

#	if defined(_WIN32) && !defined(__CYGWIN__)
	// wchar_t holds UTF-16 code units on Windows, so a character
	// outside the Basic Multilingual Plane (BMP) cannot fit into one
	// wchar_t. In a UTF-8 locale, UCRT's mbrtowc() reports success for
	// such input but stores the replacement character U+FFFD.
	//
	// iswprint() returns 0 for U+FFFD on Windows. Accept U+FFFD as
	// printable, which makes every non-BMP character printable too.
	if (ch == 0xFFFD)
		return true;
#	endif

	return iswprint((wint_t)ch) != 0;
#else
	// Without mbrtowc() every byte is handled as one character.
	(void)len;
	*next_len = 1;

	const int printable = isprint((unsigned char)*str);
	return printable != 0;
#endif
}
 | 
						|
 | 
						|
 | 
						|
// Scan the first len bytes of str one character at a time and report
// whether any of them is non-printable according to is_next_printable().
//
// Returns true as soon as a non-printable character is found and false
// if the end is reached without finding one (including when len is 0).
static bool
has_nonprint(const char *str, size_t len)
{
	size_t pos = 0;

	while (pos < len) {
		size_t char_len;
		const bool printable = is_next_printable(
				str + pos, len - pos, &char_len);

		if (!printable)
			return true;

		// Step over the character that was just examined.
		pos += char_len;
	}

	return false;
}
 | 
						|
 | 
						|
 | 
						|
// Report whether the null-terminated string str contains at least one
// non-printable character.
//
// errno is restored to the value it had on entry before returning, so
// the caller's errno is unaffected by the scan.
extern bool
tuklib_has_nonprint(const char *str)
{
	const int errno_on_entry = errno;

	const size_t len = strlen(str);
	const bool found = has_nonprint(str, len);

	errno = errno_on_entry;
	return found;
}
 | 
						|
 | 
						|
 | 
						|
// Return a version of str in which every non-printable character has
// been replaced with a single '?'.
//
// str  Null-terminated input string.
// mem  Points to a buffer pointer owned by this function. Whatever *mem
//      holds on entry is passed to free(). On return *mem is either NULL
//      or the newly allocated buffer that holds the masked string.
//
// Returns:
//   - str itself if it contains only printable characters;
//   - *mem (the masked copy) if something had to be replaced;
//   - the string literal "???" if memory allocation failed.
//
// errno is restored to its entry value on every path.
extern const char *
tuklib_mask_nonprint_r(const char *str, char **mem)
{
	const int errno_on_entry = errno;
	const char *result;

	// Release the buffer left over from a previous call, if any.
	free(*mem);
	*mem = NULL;

	const size_t len = strlen(str);

	if (!has_nonprint(str, len)) {
		// Nothing needs masking: hand back the input unchanged.
		result = str;
	} else {
		// len + 1 bytes always suffice. A masked multibyte character
		// shrinks to the single byte '?', so the result may end up
		// a few bytes shorter than the allocation.
		char *buf = malloc(len + 1);
		*mem = buf;

		if (buf == NULL) {
			// Returning "???" should be safer than returning
			// the unmasked string.
			result = "???";
		} else {
			char *out = buf;
			size_t pos = 0;

			while (pos < len) {
				size_t char_len;

				if (is_next_printable(str + pos, len - pos,
						&char_len)) {
					// Printable: copy all of its bytes.
					memcpy(out, str + pos, char_len);
					out += char_len;
				} else {
					// Non-printable: emit one '?' no
					// matter how many bytes it used.
					*out++ = '?';
				}

				pos += char_len;
			}

			*out = '\0';
			result = buf;
		}
	}

	errno = errno_on_entry;
	return result;
}
 | 
						|
 | 
						|
 | 
						|
// Convenience wrapper around tuklib_mask_nonprint_r() that keeps the
// buffer pointer in a function-local static variable.
//
// Because that one buffer is shared by every call, a masked string
// returned by an earlier call is freed by the next call.
extern const char *
tuklib_mask_nonprint(const char *str)
{
	static char *shared_buf = NULL;

	const char *masked = tuklib_mask_nonprint_r(str, &shared_buf);
	return masked;
}
 |