From 40e573305535960574404d2eae848b248c95ea7e Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Wed, 18 Dec 2024 14:00:09 +0200 Subject: [PATCH] Add tuklib_mbstr_nonprint to mask non-printable characters Malicious filenames or other untrusted strings may affect the state of the terminal when such strings are printed as part of (error) messages. Add functions that mask such characters. It's not enough to handle only single-byte control characters. In multibyte locales, some control characters are multibyte too, for example, terminals interpret C1 control characters (U+0080 to U+009F) that are two bytes as UTF-8. Instead of checking for control characters with iswcntrl(), this uses iswprint() to detect printable characters. This is much stricter. On Windows it's actually too strict as it rejects some characters that definitely are printable. Gnulib's quotearg would do a lot more but I hope this simpler method is good enough here. Thanks to Ryan Colyer for the discussion about the problems of the earlier single-byte-only method. Thanks to Christian Weisgerber for reporting a bug in an earlier version of this code. Thanks to Jeroen Roovers for a typo fix. Closes: https://github.com/tukaani-project/xz/pull/118 --- src/Makefile.am | 2 + src/common/tuklib_mbstr_nonprint.c | 151 +++++++++++++++++++++++++++++ src/common/tuklib_mbstr_nonprint.h | 69 +++++++++++++ 3 files changed, 222 insertions(+) create mode 100644 src/common/tuklib_mbstr_nonprint.c create mode 100644 src/common/tuklib_mbstr_nonprint.h diff --git a/src/Makefile.am b/src/Makefile.am index cb5aed40..15eee834 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -29,6 +29,8 @@ EXTRA_DIST = \ common/tuklib_integer.h \ common/tuklib_mbstr.h \ common/tuklib_mbstr_fw.c \ + common/tuklib_mbstr_nonprint.c \ + common/tuklib_mbstr_nonprint.h \ common/tuklib_mbstr_width.c \ common/tuklib_mbstr_wrap.c \ common/tuklib_mbstr_wrap.h \ diff --git a/src/common/tuklib_mbstr_nonprint.c b/src/common/tuklib_mbstr_nonprint.c new file mode 100644 index 00000000..cac10bfe --- /dev/null +++ b/src/common/tuklib_mbstr_nonprint.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: 0BSD + +/////////////////////////////////////////////////////////////////////////////// +// +/// \file tuklib_mbstr_nonprint.c +/// \brief Find and replace non-printable characters with question marks +// +// Author: Lasse Collin +// +/////////////////////////////////////////////////////////////////////////////// + +#include "tuklib_mbstr_nonprint.h" +#include +#include + +#ifdef HAVE_MBRTOWC +# include +# include +#else +# include +#endif + + +static bool +is_next_printable(const char *str, size_t len, size_t *next_len) +{ +#ifdef HAVE_MBRTOWC + // This assumes that character sets with locking shift states aren't + // used, and thus mbsinit() is never needed. + mbstate_t ps; + memset(&ps, 0, sizeof(ps)); + + wchar_t wc; + *next_len = mbrtowc(&wc, str, len, &ps); + + if (*next_len == (size_t)-2) { + // Incomplete multibyte sequence: Treat the whole sequence + // as a single non-printable multibyte character that ends + // the string. + *next_len = len; + return false; + } + + // Check more broadly than just ret == (size_t)-1 to be safe + // in case mbrtowc() returns something weird. This check + // covers (size_t)-1 (that is, SIZE_MAX) too because len is from + // strlen() and the terminating '\0' isn't part of the length. + if (*next_len < 1 || *next_len > len) { + // Invalid multibyte sequence: Treat the first byte as + // a non-printable single-byte character. Decoding will + // be restarted from the next byte on the next call to + // this function. + *next_len = 1; + return false; + } + +# if defined(_WIN32) && !defined(__CYGWIN__) + // On Windows, wchar_t stores UTF-16 code units, thus characters + // outside the Basic Multilingual Plane (BMP) don't fit into + // a single wchar_t. In an UTF-8 locale, UCRT's mbrtowc() returns + // successfully when the input is a non-BMP character but the + // output is the replacement character U+FFFD. + // + // iswprint() returns 0 for U+FFFD on Windows for some reason. Treat + // U+FFFD as printable and thus also all non-BMP chars as printable. + if (wc == 0xFFFD) + return true; +# endif + + return iswprint((wint_t)wc) != 0; +#else + (void)len; + *next_len = 1; + return isprint((unsigned char)str[0]) != 0; +#endif +} + + +static bool +has_nonprint(const char *str, size_t len) +{ + for (size_t i = 0; i < len; ) { + size_t next_len; + if (!is_next_printable(str + i, len - i, &next_len)) + return true; + + i += next_len; + } + + return false; +} + + +extern bool +tuklib_has_nonprint(const char *str) +{ + return has_nonprint(str, strlen(str)); +} + + +extern const char * +tuklib_mask_nonprint_r(const char *str, char **mem) +{ + // Free the old string, if any. + free(*mem); + *mem = NULL; + + // If the whole input string contains only printable characters, + // return the input string. + const size_t len = strlen(str); + if (!has_nonprint(str, len)) + return str; + + // Allocate memory for the masked string. Since we use the single-byte + // character '?' to mask non-printable characters, it's possible that + // a few bytes less memory would be needed in reality if multibyte + // characters are masked. + // + // If allocation fails, return "???" because it should be safer than + // returning the unmasked string. + *mem = malloc(len + 1); + if (*mem == NULL) + return "???"; + + // Replace all non-printable characters with '?'. + char *dest = *mem; + + for (size_t i = 0; i < len; ) { + size_t next_len; + if (is_next_printable(str + i, len - i, &next_len)) { + memcpy(dest, str + i, next_len); + dest += next_len; + } else { + *dest++ = '?'; + } + + i += next_len; + } + + *dest = '\0'; + + return *mem; +} + + +extern const char * +tuklib_mask_nonprint(const char *str) +{ + static char *mem = NULL; + return tuklib_mask_nonprint_r(str, &mem); +} diff --git a/src/common/tuklib_mbstr_nonprint.h b/src/common/tuklib_mbstr_nonprint.h new file mode 100644 index 00000000..7c2bef15 --- /dev/null +++ b/src/common/tuklib_mbstr_nonprint.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: 0BSD + +/////////////////////////////////////////////////////////////////////////////// +// +/// \file tuklib_mbstr_nonprint.h +/// \brief Find and replace non-printable characters with question marks +/// +/// If mbrtowc(3) is available, it and iswprint(3) is used to check if all +/// characters are printable. Otherwise single-byte character set is assumed +/// and isprint(3) is used. +// +// Author: Lasse Collin +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef TUKLIB_MBSTR_NONPRINT_H +#define TUKLIB_MBSTR_NONPRINT_H + +#include "tuklib_common.h" +TUKLIB_DECLS_BEGIN + +#define tuklib_has_nonprint TUKLIB_SYMBOL(tuklib_has_nonprint) +extern bool tuklib_has_nonprint(const char *str); +///< +/// \brief Check if a string contains any non-printable characters +/// +/// \return false if str contains only valid multibyte characters and +/// iswprint(3) returns non-zero for all of them; true otherwise +/// +/// \note In case mbrtowc(3) isn't available, single-byte character set +/// is assumed and isprint(3) is used instead of iswprint(3). + +#define tuklib_mask_nonprint_r TUKLIB_SYMBOL(tuklib_mask_nonprint_r) +extern const char *tuklib_mask_nonprint_r(const char *str, char **mem); +///< +/// \brief Replace non-printable characters with question marks +/// +/// \param str Untrusted string, for example, a filename +/// \param mem This function always calls free(*mem) to free the old +/// allocation and then sets *mem = NULL. Before the first +/// call, *mem should be initialized to NULL. If this +/// function needs to allocate memory for a modified +/// string, a pointer to the allocated memory will be +/// stored to *mem. Otherwise *mem will remain NULL. +/// +/// \return If tuklib_has_nonprint(str) returns false, this function +/// returns str. Otherwise memory is allocated to hold a modified +/// string and a pointer to that is returned. The pointer to the +/// allocated memory is also stored to *mem. A modified string +/// has the problematic characters replaced by '?'. If memory +/// allocation fails, "???" is returned and *mem is NULL. + +#define tuklib_mask_nonprint TUKLIB_SYMBOL(tuklib_mask_nonprint) +extern const char *tuklib_mask_nonprint(const char *str); +///< +/// \brief Replace non-printable characters with question marks +/// +/// This is a convenience function for single-threaded use. This calls +/// tuklib_mask_nonprint_r() using an internal static variable to hold +/// the possible allocation. +/// +/// \param str Untrusted string, for example, a filename +/// +/// \return See tuklib_mask_nonprint_r(). +/// +/// \note This function is not thread safe! + +TUKLIB_DECLS_END +#endif