mirror of
https://git.tukaani.org/xz.git
synced 2025-04-03 06:10:57 +00:00
Malicious filenames or other untrusted strings may affect the state of the terminal when such strings are printed as part of (error) messages. Add functions that mask such characters. It's not enough to handle only single-byte control characters. In multibyte locales, some control characters are multibyte too; for example, terminals interpret C1 control characters (U+0080 to U+009F), which are two bytes long in UTF-8. Instead of checking for control characters with iswcntrl(), this uses iswprint() to detect printable characters. This is much stricter. On Windows it's actually too strict as it rejects some characters that definitely are printable. Gnulib's quotearg would do a lot more but I hope this simpler method is good enough here. Thanks to Ryan Colyer for the discussion about the problems of the earlier single-byte-only method. Thanks to Christian Weisgerber for reporting a bug in an earlier version of this code. Thanks to Jeroen Roovers for a typo fix. Closes: https://github.com/tukaani-project/xz/pull/118
152 lines
3.6 KiB
C
152 lines
3.6 KiB
C
// SPDX-License-Identifier: 0BSD
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
/// \file tuklib_mbstr_nonprint.c
|
|
/// \brief Find and replace non-printable characters with question marks
|
|
//
|
|
// Author: Lasse Collin
|
|
//
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include "tuklib_mbstr_nonprint.h"
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#ifdef HAVE_MBRTOWC
|
|
# include <wchar.h>
|
|
# include <wctype.h>
|
|
#else
|
|
# include <ctype.h>
|
|
#endif
|
|
|
|
|
|
/// \brief      Check if the next character of a string is printable
///
/// \param      str       Pointer to the first byte of the character
/// \param      len       Number of bytes available at str, not counting
///                       the terminating '\0'. The callers in this file
///                       always pass len >= 1.
/// \param[out] next_len  Set to the number of bytes to skip to reach
///                       the next character. When len >= 1, the stored
///                       value is in the range [1, len].
///
/// \return     true if the character is printable; false if it is
///             non-printable or an invalid or incomplete multibyte sequence.
static bool
is_next_printable(const char *str, size_t len, size_t *next_len)
{
#ifdef HAVE_MBRTOWC
	// This assumes that character sets with locking shift states aren't
	// used, and thus mbsinit() is never needed.
	mbstate_t ps;
	memset(&ps, 0, sizeof(ps));

	wchar_t wc;
	*next_len = mbrtowc(&wc, str, len, &ps);

	if (*next_len == (size_t)-2) {
		// Incomplete multibyte sequence: Treat the whole sequence
		// as a single non-printable multibyte character that ends
		// the string.
		*next_len = len;
		return false;
	}

	// Check more broadly than just *next_len == (size_t)-1 to be safe
	// in case mbrtowc() returns something weird. This check
	// covers (size_t)-1 (that is, SIZE_MAX) too because len is from
	// strlen() and the terminating '\0' isn't part of the length.
	if (*next_len < 1 || *next_len > len) {
		// Invalid multibyte sequence: Treat the first byte as
		// a non-printable single-byte character. Decoding will
		// be restarted from the next byte on the next call to
		// this function.
		*next_len = 1;
		return false;
	}

#	if defined(_WIN32) && !defined(__CYGWIN__)
	// On Windows, wchar_t stores UTF-16 code units, thus characters
	// outside the Basic Multilingual Plane (BMP) don't fit into
	// a single wchar_t. In an UTF-8 locale, UCRT's mbrtowc() returns
	// successfully when the input is a non-BMP character but the
	// output is the replacement character U+FFFD.
	//
	// iswprint() returns 0 for U+FFFD on Windows for some reason. Treat
	// U+FFFD as printable and thus also all non-BMP chars as printable.
	if (wc == 0xFFFD)
		return true;
#	endif

	return iswprint((wint_t)wc) != 0;
#else
	// Without mbrtowc() every byte is treated as one character.
	(void)len;
	*next_len = 1;

	// The cast avoids passing a negative value to isprint()
	// on platforms where char is signed.
	return isprint((unsigned char)str[0]) != 0;
#endif
}
|
|
|
|
|
|
/// \brief      Check if a string contains any non-printable characters
///
/// \param      str     Pointer to the first byte to check
/// \param      len     Number of bytes to check at str
///
/// \return     true as soon as is_next_printable() rejects a character;
///             false if every character is accepted or if len is zero.
static bool
has_nonprint(const char *str, size_t len)
{
	// The loop advances by whatever length is_next_printable() reports
	// for the current character, so there is no increment in the
	// for-statement itself.
	for (size_t i = 0; i < len; ) {
		size_t next_len;
		if (!is_next_printable(str + i, len - i, &next_len))
			return true;

		i += next_len;
	}

	return false;
}
|
|
|
|
|
|
/// \brief      Check if a null-terminated string has non-printable characters
///
/// \param      str     Null-terminated string; its length is taken
///                     with strlen().
///
/// \return     true if has_nonprint() finds at least one non-printable
///             character, false otherwise.
extern bool
tuklib_has_nonprint(const char *str)
{
	return has_nonprint(str, strlen(str));
}
|
|
|
|
|
|
/// \brief      Replace non-printable characters with question marks
///
/// \param      str     Null-terminated string to mask
/// \param      mem     Address of a pointer that is either NULL or holds
///                     the buffer stored by an earlier call. On return it
///                     holds the new buffer, or NULL if no buffer is in
///                     use. This function never frees the buffer it
///                     returns, so the owner of *mem has to free it.
///
/// \return     str itself if it contains only printable characters,
///             otherwise the masked copy stored in *mem. If memory
///             allocation fails, "???" is returned and *mem is NULL.
///
/// The previous buffer is released only after str has been fully read.
/// This way passing the result of an earlier call as str (str == *mem)
/// doesn't read freed memory. A pointer into the middle of *mem is still
/// unsafe when nothing needs masking: the old buffer is freed and the
/// returned pointer dangles.
extern const char *
tuklib_mask_nonprint_r(const char *str, char **mem)
{
	const size_t len = strlen(str);

	// If the whole input string contains only printable characters,
	// return the input string.
	if (!has_nonprint(str, len)) {
		// Free the old string, if any, unless str is that very
		// buffer. Freeing it then would return a dangling pointer.
		if (str != *mem) {
			free(*mem);
			*mem = NULL;
		}

		return str;
	}

	// Allocate memory for the masked string. Since we use the single-byte
	// character '?' to mask non-printable characters, it's possible that
	// a few bytes less memory would be needed in reality if multibyte
	// characters are masked.
	//
	// If allocation fails, return "???" because it should be safer than
	// returning the unmasked string.
	char *masked = malloc(len + 1);
	if (masked == NULL) {
		free(*mem);
		*mem = NULL;
		return "???";
	}

	// Replace all non-printable characters with '?'.
	char *dest = masked;

	for (size_t i = 0; i < len; ) {
		size_t next_len;
		if (is_next_printable(str + i, len - i, &next_len)) {
			memcpy(dest, str + i, next_len);
			dest += next_len;
		} else {
			*dest++ = '?';
		}

		i += next_len;
	}

	*dest = '\0';

	// str isn't needed anymore, so the old buffer can be released
	// even if str pointed into it.
	free(*mem);
	*mem = masked;

	return masked;
}
|
|
|
|
|
|
/// \brief      Replace non-printable characters with question marks
///
/// This is a convenience wrapper around tuklib_mask_nonprint_r() that
/// keeps the allocated buffer in a function-local static pointer.
/// All calls share that one pointer and no locking is done here.
/// A returned masked copy is therefore valid only until the next call.
///
/// \param      str     Null-terminated string to mask
///
/// \return     Whatever tuklib_mask_nonprint_r() returns for str.
extern const char *
tuklib_mask_nonprint(const char *str)
{
	static char *mem = NULL;
	return tuklib_mask_nonprint_r(str, &mem);
}
|