diff --git a/src/Makefile.am b/src/Makefile.am index cb5aed40..15eee834 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -29,6 +29,8 @@ EXTRA_DIST = \ common/tuklib_integer.h \ common/tuklib_mbstr.h \ common/tuklib_mbstr_fw.c \ + common/tuklib_mbstr_nonprint.c \ + common/tuklib_mbstr_nonprint.h \ common/tuklib_mbstr_width.c \ common/tuklib_mbstr_wrap.c \ common/tuklib_mbstr_wrap.h \ diff --git a/src/common/tuklib_mbstr_nonprint.c b/src/common/tuklib_mbstr_nonprint.c new file mode 100644 index 00000000..cac10bfe --- /dev/null +++ b/src/common/tuklib_mbstr_nonprint.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: 0BSD + +/////////////////////////////////////////////////////////////////////////////// +// +/// \file tuklib_mbstr_nonprint.c +/// \brief Find and replace non-printable characters with question marks +// +// Author: Lasse Collin +// +/////////////////////////////////////////////////////////////////////////////// + +#include "tuklib_mbstr_nonprint.h" +#include +#include + +#ifdef HAVE_MBRTOWC +# include +# include +#else +# include +#endif + + +static bool +is_next_printable(const char *str, size_t len, size_t *next_len) +{ +#ifdef HAVE_MBRTOWC + // This assumes that character sets with locking shift states aren't + // used, and thus mbsinit() is never needed. + mbstate_t ps; + memset(&ps, 0, sizeof(ps)); + + wchar_t wc; + *next_len = mbrtowc(&wc, str, len, &ps); + + if (*next_len == (size_t)-2) { + // Incomplete multibyte sequence: Treat the whole sequence + // as a single non-printable multibyte character that ends + // the string. + *next_len = len; + return false; + } + + // Check more broadly than just ret == (size_t)-1 to be safe + // in case mbrtowc() returns something weird. This check + // covers (size_t)-1 (that is, SIZE_MAX) too because len is from + // strlen() and the terminating '\0' isn't part of the length. + if (*next_len < 1 || *next_len > len) { + // Invalid multibyte sequence: Treat the first byte as + // a non-printable single-byte character. Decoding will + // be restarted from the next byte on the next call to + // this function. + *next_len = 1; + return false; + } + +# if defined(_WIN32) && !defined(__CYGWIN__) + // On Windows, wchar_t stores UTF-16 code units, thus characters + // outside the Basic Multilingual Plane (BMP) don't fit into + // a single wchar_t. In an UTF-8 locale, UCRT's mbrtowc() returns + // successfully when the input is a non-BMP character but the + // output is the replacement character U+FFFD. + // + // iswprint() returns 0 for U+FFFD on Windows for some reason. Treat + // U+FFFD as printable and thus also all non-BMP chars as printable. + if (wc == 0xFFFD) + return true; +# endif + + return iswprint((wint_t)wc) != 0; +#else + (void)len; + *next_len = 1; + return isprint((unsigned char)str[0]) != 0; +#endif +} + + +static bool +has_nonprint(const char *str, size_t len) +{ + for (size_t i = 0; i < len; ) { + size_t next_len; + if (!is_next_printable(str + i, len - i, &next_len)) + return true; + + i += next_len; + } + + return false; +} + + +extern bool +tuklib_has_nonprint(const char *str) +{ + return has_nonprint(str, strlen(str)); +} + + +extern const char * +tuklib_mask_nonprint_r(const char *str, char **mem) +{ + // Free the old string, if any. + free(*mem); + *mem = NULL; + + // If the whole input string contains only printable characters, + // return the input string. + const size_t len = strlen(str); + if (!has_nonprint(str, len)) + return str; + + // Allocate memory for the masked string. Since we use the single-byte + // character '?' to mask non-printable characters, it's possible that + // a few bytes less memory would be needed in reality if multibyte + // characters are masked. + // + // If allocation fails, return "???" because it should be safer than + // returning the unmasked string. + *mem = malloc(len + 1); + if (*mem == NULL) + return "???"; + + // Replace all non-printable characters with '?'. + char *dest = *mem; + + for (size_t i = 0; i < len; ) { + size_t next_len; + if (is_next_printable(str + i, len - i, &next_len)) { + memcpy(dest, str + i, next_len); + dest += next_len; + } else { + *dest++ = '?'; + } + + i += next_len; + } + + *dest = '\0'; + + return *mem; +} + + +extern const char * +tuklib_mask_nonprint(const char *str) +{ + static char *mem = NULL; + return tuklib_mask_nonprint_r(str, &mem); +} diff --git a/src/common/tuklib_mbstr_nonprint.h b/src/common/tuklib_mbstr_nonprint.h new file mode 100644 index 00000000..7c2bef15 --- /dev/null +++ b/src/common/tuklib_mbstr_nonprint.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: 0BSD + +/////////////////////////////////////////////////////////////////////////////// +// +/// \file tuklib_mbstr_nonprint.h +/// \brief Find and replace non-printable characters with question marks +/// +/// If mbrtowc(3) is available, it and iswprint(3) is used to check if all +/// characters are printable. Otherwise single-byte character set is assumed +/// and isprint(3) is used. +// +// Author: Lasse Collin +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef TUKLIB_MBSTR_NONPRINT_H +#define TUKLIB_MBSTR_NONPRINT_H + +#include "tuklib_common.h" +TUKLIB_DECLS_BEGIN + +#define tuklib_has_nonprint TUKLIB_SYMBOL(tuklib_has_nonprint) +extern bool tuklib_has_nonprint(const char *str); +///< +/// \brief Check if a string contains any non-printable characters +/// +/// \return false if str contains only valid multibyte characters and +/// iswprint(3) returns non-zero for all of them; true otherwise +/// +/// \note In case mbrtowc(3) isn't available, single-byte character set +/// is assumed and isprint(3) is used instead of iswprint(3). + +#define tuklib_mask_nonprint_r TUKLIB_SYMBOL(tuklib_mask_nonprint_r) +extern const char *tuklib_mask_nonprint_r(const char *str, char **mem); +///< +/// \brief Replace non-printable characters with question marks +/// +/// \param str Untrusted string, for example, a filename +/// \param mem This function always calls free(*mem) to free the old +/// allocation and then sets *mem = NULL. Before the first +/// call, *mem should be initialized to NULL. If this +/// function needs to allocate memory for a modified +/// string, a pointer to the allocated memory will be +/// stored to *mem. Otherwise *mem will remain NULL. +/// +/// \return If tuklib_has_nonprint(str) returns false, this function +/// returns str. Otherwise memory is allocated to hold a modified +/// string and a pointer to that is returned. The pointer to the +/// allocated memory is also stored to *mem. A modified string +/// has the problematic characters replaced by '?'. If memory +/// allocation fails, "???" is returned and *mem is NULL. + +#define tuklib_mask_nonprint TUKLIB_SYMBOL(tuklib_mask_nonprint) +extern const char *tuklib_mask_nonprint(const char *str); +///< +/// \brief Replace non-printable characters with question marks +/// +/// This is a convenience function for single-threaded use. This calls +/// tuklib_mask_nonprint_r() using an internal static variable to hold +/// the possible allocation. +/// +/// \param str Untrusted string, for example, a filename +/// +/// \return See tuklib_mask_nonprint_r(). +/// +/// \note This function is not thread safe! + +TUKLIB_DECLS_END +#endif