1
0
mirror of https://git.tukaani.org/xz.git synced 2025-04-03 06:10:57 +00:00
xz/src/common/tuklib_mbstr_nonprint.c
Lasse Collin 40e5733055
Add tuklib_mbstr_nonprint to mask non-printable characters
Malicious filenames or other untrusted strings may affect the state of
the terminal when such strings are printed as part of (error) messages.
Add functions that mask such characters.

It's not enough to handle only single-byte control characters.
In multibyte locales, some control characters are multibyte too. For
example, terminals interpret C1 control characters (U+0080 to U+009F),
which are encoded as two bytes each in UTF-8.

Instead of checking for control characters with iswcntrl(), this
uses iswprint() to detect printable characters. This is much stricter.
On Windows it's actually too strict as it rejects some characters that
definitely are printable.

Gnulib's quotearg would do a lot more but I hope this simpler method
is good enough here.

Thanks to Ryan Colyer for the discussion about the problems of
the earlier single-byte-only method.

Thanks to Christian Weisgerber for reporting a bug in an earlier
version of this code.

Thanks to Jeroen Roovers for a typo fix.

Closes: https://github.com/tukaani-project/xz/pull/118
2024-12-18 17:09:32 +02:00

152 lines
3.6 KiB
C

// SPDX-License-Identifier: 0BSD
///////////////////////////////////////////////////////////////////////////////
//
/// \file tuklib_mbstr_nonprint.c
/// \brief Find and replace non-printable characters with question marks
//
// Author: Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////
#include "tuklib_mbstr_nonprint.h"
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_MBRTOWC
# include <wchar.h>
# include <wctype.h>
#else
# include <ctype.h>
#endif
/// \brief      Classify the character at the start of a buffer
///
/// \param      str         Pointer to the first byte of the character
/// \param      len         Number of bytes available at str. The callers
///                         derive this from strlen(), so the terminating
///                         '\0' is never included.
/// \param[out] next_len    Set to the number of bytes the caller must
///                         advance to reach the following character.
///
/// \return     true if the character is printable, false if it is
///             non-printable or is not a valid character at all.
static bool
is_next_printable(const char *str, size_t len, size_t *next_len)
{
#ifdef HAVE_MBRTOWC
	// Every call starts from a zeroed conversion state. Character sets
	// with locking shift states are assumed not to be in use, which is
	// why mbsinit() is never consulted.
	mbstate_t state;
	memset(&state, 0, sizeof(state));

	wchar_t wide_ch;
	const size_t ret = mbrtowc(&wide_ch, str, len, &state);

	if (ret == (size_t)-2) {
		// The buffer ends in the middle of a multibyte sequence.
		// Consume everything that is left and report it as one
		// non-printable character that terminates the string.
		*next_len = len;
		return false;
	}

	// Deliberately broader than ret == (size_t)-1 so that any odd
	// return value from mbrtowc() is rejected as well. (size_t)-1 is
	// SIZE_MAX and therefore always greater than len. Zero means
	// a null character, which cannot occur inside the strlen()-derived
	// length.
	if (ret == 0 || ret > len) {
		// Invalid sequence: report only the first byte as
		// non-printable. The next call restarts decoding from
		// the byte that follows it.
		*next_len = 1;
		return false;
	}

	*next_len = ret;

#	if defined(_WIN32) && !defined(__CYGWIN__)
	// wchar_t holds UTF-16 code units on Windows, so a character
	// outside the Basic Multilingual Plane doesn't fit in one wchar_t.
	// In a UTF-8 locale, UCRT's mbrtowc() still succeeds for such input
	// but produces the replacement character U+FFFD.
	//
	// iswprint() on Windows returns 0 for U+FFFD. Accept U+FFFD as
	// printable, which makes all non-BMP characters printable too.
	if (wide_ch == 0xFFFD)
		return true;
#	endif

	return iswprint((wint_t)wide_ch) != 0;
#else
	// Single-byte fallback: every character is exactly one byte long
	// and the remaining length isn't needed.
	(void)len;

	const unsigned char byte = (unsigned char)str[0];
	*next_len = 1;
	return isprint(byte) != 0;
#endif
}
/// \brief      Scan a buffer for the first non-printable character
///
/// \param      str     Start of the buffer to scan
/// \param      len     Number of bytes to scan
///
/// \return     true as soon as is_next_printable() rejects a character,
///             false if the whole buffer was accepted (including when
///             len is zero).
static bool
has_nonprint(const char *str, size_t len)
{
	size_t pos = 0;

	while (pos < len) {
		// is_next_printable() tells how many bytes the current
		// character occupies so that we can step over it.
		size_t char_len;
		const bool printable = is_next_printable(
				str + pos, len - pos, &char_len);

		if (!printable)
			return true;

		pos += char_len;
	}

	return false;
}
/// \brief      Check if a string contains any non-printable characters
///
/// \param      str     Null-terminated string to check. Its length is
///                     determined with strlen(), so it must not be NULL.
///
/// \return     true if at least one character in str is non-printable
///             (invalid or incomplete multibyte sequences count as
///             non-printable), false otherwise.
///
/// The value of errno is preserved across the call.
extern bool
tuklib_has_nonprint(const char *str)
{
	// The scan may call mbrtowc(), which sets errno to EILSEQ when it
	// sees an invalid sequence. Callers tend to use this function while
	// building an error message that also needs the errno of an earlier
	// failure, so don't let the scan overwrite it.
	const int saved_errno = errno;

	const bool ret = has_nonprint(str, strlen(str));

	errno = saved_errno;
	return ret;
}
/// \brief      Replace non-printable characters with question marks
///
/// \param      str     Null-terminated string to mask. It must not point
///                     into the buffer currently owned by *mem because
///                     that buffer is freed before str is read.
/// \param      mem     Pointer to a pointer that holds the allocation used
///                     by the previous call (or NULL). The old allocation
///                     is always freed. On return *mem is either NULL or
///                     the newly allocated masked string, which the caller
///                     eventually releases with free() or by calling this
///                     function again.
///
/// \return     str itself if it contains only printable characters,
///             *mem if a masked copy was created, or the string literal
///             "???" if memory allocation failed.
///
/// The value of errno is preserved across the call. A typical use is
/// printf("%s: %s\n", tuklib_mask_nonprint_r(name, &mem), strerror(errno)),
/// where the order of argument evaluation is unspecified. free(), malloc(),
/// and mbrtowc() may all modify errno, so without preserving it the
/// message could describe the wrong error.
extern const char *
tuklib_mask_nonprint_r(const char *str, char **mem)
{
	const int saved_errno = errno;

	// Free the old string, if any.
	free(*mem);
	*mem = NULL;

	// If the whole input string contains only printable characters,
	// return the input string.
	const size_t len = strlen(str);
	if (!has_nonprint(str, len)) {
		errno = saved_errno;
		return str;
	}

	// Allocate memory for the masked string. Since we use the single-byte
	// character '?' to mask non-printable characters, it's possible that
	// a few bytes less memory would be needed in reality if multibyte
	// characters are masked.
	//
	// If allocation fails, return "???" because it should be safer than
	// returning the unmasked string.
	*mem = malloc(len + 1);
	if (*mem == NULL) {
		errno = saved_errno;
		return "???";
	}

	// Replace all non-printable characters with '?'. Printable
	// characters are copied as is, all of their bytes at once.
	char *dest = *mem;

	for (size_t i = 0; i < len; ) {
		size_t next_len;
		if (is_next_printable(str + i, len - i, &next_len)) {
			memcpy(dest, str + i, next_len);
			dest += next_len;
		} else {
			// One '?' per non-printable character no matter
			// how many bytes the character occupied.
			*dest++ = '?';
		}

		i += next_len;
	}

	*dest = '\0';

	errno = saved_errno;
	return *mem;
}
/// \brief      Replace non-printable characters with question marks
///
/// Convenience wrapper around tuklib_mask_nonprint_r() that keeps the
/// allocation in a function-local static pointer. That one buffer is
/// shared by every caller of this function, and a later call frees the
/// string allocated by the previous one.
///
/// \param      str     Null-terminated string to mask
///
/// \return     Whatever tuklib_mask_nonprint_r() returns for str.
extern const char *
tuklib_mask_nonprint(const char *str)
{
	static char *shared_buf = NULL;

	const char *masked = tuklib_mask_nonprint_r(str, &shared_buf);
	return masked;
}