diff --git a/m4/tuklib_integer.m4 b/m4/tuklib_integer.m4
index 2ab72a2f..ae3c1f2a 100644
--- a/m4/tuklib_integer.m4
+++ b/m4/tuklib_integer.m4
@@ -45,6 +45,20 @@ main(void)
])dnl
fi
+AC_MSG_CHECKING([if __builtin_bswap16/32/64 are supported])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[]],
+        [[__builtin_bswap16(1);
+        __builtin_bswap32(1);
+        __builtin_bswap64(1);]])],
+    [
+        AC_DEFINE([HAVE___BUILTIN_BSWAPXX], [1],
+            [Define to 1 if the GNU C extensions
+            __builtin_bswap16/32/64 are supported.])
+        AC_MSG_RESULT([yes])
+    ], [
+        AC_MSG_RESULT([no])
+    ])
+
AC_MSG_CHECKING([if unaligned memory access should be used])
AC_ARG_ENABLE([unaligned-access], AS_HELP_STRING([--enable-unaligned-access],
        [Enable if the system supports *fast* unaligned memory access
@@ -71,4 +85,33 @@ if test "x$enable_unaligned_access" = xyes ; then
else
        AC_MSG_RESULT([no])
fi
+
+AC_MSG_CHECKING([if unsafe type punning should be used])
+AC_ARG_ENABLE([unsafe-type-punning],
+    AS_HELP_STRING([--enable-unsafe-type-punning],
+        [This introduces strict aliasing violations and may result
+        in broken code. However, this might improve performance in
+        some cases, especially with old compilers (e.g.
+        GCC 3 and early 4.x on x86, GCC < 6 on ARMv6 and ARMv7).]),
+    [], [enable_unsafe_type_punning=no])
+if test "x$enable_unsafe_type_punning" = xyes ; then
+    AC_DEFINE([TUKLIB_USE_UNSAFE_TYPE_PUNNING], [1], [Define to 1 to use
+        unsafe type punning, e.g. char *x = ...; *(int *)x = 123;
+        which violates strict aliasing rules and thus is
+        undefined behavior and might result in broken code.])
+    AC_MSG_RESULT([yes])
+else
+    AC_MSG_RESULT([no])
+fi
+
+AC_MSG_CHECKING([if __builtin_assume_aligned is supported])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[]], [[__builtin_assume_aligned("", 1);]])],
+    [
+        AC_DEFINE([HAVE___BUILTIN_ASSUME_ALIGNED], [1],
+            [Define to 1 if the GNU C extension
+            __builtin_assume_aligned is supported.])
+        AC_MSG_RESULT([yes])
+    ], [
+        AC_MSG_RESULT([no])
+    ])
])dnl
diff --git a/src/common/tuklib_integer.h b/src/common/tuklib_integer.h
index 52564481..699d5fe6 100644
--- a/src/common/tuklib_integer.h
+++ b/src/common/tuklib_integer.h
@@ -6,22 +6,26 @@
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
-/// Endianness related integer operations (XX = 16, 32, or 64; Y = b or l):
+/// Native endian inline functions (XX = 16, 32, or 64):
+///   - Unaligned native endian reads: unaligned_readXXne(ptr)
+///   - Unaligned native endian writes: unaligned_writeXXne(ptr, num)
+///   - Aligned native endian reads: readXXne(ptr)
+///   - Aligned native endian writes: writeXXne(ptr, num)
+///
+/// Endianness-converting integer operations (these can be macros!)
+/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: bswapXX(num)
-///   - Byte order conversions to/from native: convXXYe(num)
+///   - Byte order conversions to/from native (byteswaps if Y isn't
+///     the native endianness): convXXYe(num)
///   - Aligned reads: readXXYe(ptr)
///   - Aligned writes: writeXXYe(ptr, num)
///   - Unaligned reads (16/32-bit only): unaligned_readXXYe(ptr)
///   - Unaligned writes (16/32-bit only): unaligned_writeXXYe(ptr, num)
///
-/// Since they can macros, the arguments should have no side effects since
-/// they may be evaluated more than once.
+/// Since the above can be macros, the arguments should have no side effects
+/// because they may be evaluated more than once.
///
-/// \todo   PowerPC and possibly some other architectures support
-///         byte swapping load and store instructions. This file
-///         doesn't take advantage of those instructions.
-///
-/// Bit scan operations for non-zero 32-bit integers:
+/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
@@ -44,12 +48,24 @@
#include "tuklib_common.h"
#include <string.h>
+// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
+// and such functions.
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
+#   include <immintrin.h>
+#endif
-////////////////////////////////////////
-// Operating system specific features //
-////////////////////////////////////////
-#if defined(HAVE_BYTESWAP_H)
+///////////////////
+// Byte swapping //
+///////////////////
+
+#if defined(HAVE___BUILTIN_BSWAPXX)
+   // GCC >= 4.8 and Clang
+#  define bswap16(n) __builtin_bswap16(n)
+#  define bswap32(n) __builtin_bswap32(n)
+#  define bswap64(n) __builtin_bswap64(n)
+
+#elif defined(HAVE_BYTESWAP_H)
    // glibc, uClibc, dietlibc
#   include <byteswap.h>
#   ifdef HAVE_BSWAP_16
@@ -98,45 +114,33 @@
#   endif
#endif
-
-////////////////////////////////
-// Compiler-specific features //
-////////////////////////////////
-
-// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
-// and such functions.
-#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
-#   include <immintrin.h>
-#endif
-
-
-///////////////////
-// Byte swapping //
-///////////////////
-
#ifndef bswap16
-#  define bswap16(num) \
-      (((uint16_t)(num) << 8) | ((uint16_t)(num) >> 8))
+#  define bswap16(n) (uint16_t)( \
+        (((n) & 0x00FFU) << 8) \
+      | (((n) & 0xFF00U) >> 8) \
+   )
#endif

#ifndef bswap32
-#  define bswap32(num) \
-      ( (((uint32_t)(num) << 24) ) \
-      | (((uint32_t)(num) << 8) & UINT32_C(0x00FF0000)) \
-      | (((uint32_t)(num) >> 8) & UINT32_C(0x0000FF00)) \
-      | (((uint32_t)(num) >> 24) ) )
+#  define bswap32(n) (uint32_t)( \
+        (((n) & UINT32_C(0x000000FF)) << 24) \
+      | (((n) & UINT32_C(0x0000FF00)) << 8) \
+      | (((n) & UINT32_C(0x00FF0000)) >> 8) \
+      | (((n) & UINT32_C(0xFF000000)) >> 24) \
+   )
#endif

#ifndef bswap64
-#  define bswap64(num) \
-      ( (((uint64_t)(num) << 56) ) \
-      | (((uint64_t)(num) << 40) & UINT64_C(0x00FF000000000000)) \
-      | (((uint64_t)(num) << 24) & UINT64_C(0x0000FF0000000000)) \
-      | (((uint64_t)(num) << 8) & UINT64_C(0x000000FF00000000)) \
-      | (((uint64_t)(num) >> 8) & UINT64_C(0x00000000FF000000)) \
-      | (((uint64_t)(num) >> 24) & UINT64_C(0x0000000000FF0000)) \
-      | (((uint64_t)(num) >> 40) & UINT64_C(0x000000000000FF00)) \
-      | (((uint64_t)(num) >> 56) ) )
+#  define bswap64(n) (uint64_t)( \
+        (((n) & UINT64_C(0x00000000000000FF)) << 56) \
+      | (((n) & UINT64_C(0x000000000000FF00)) << 40) \
+      | (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
+      | (((n) & UINT64_C(0x00000000FF000000)) << 8) \
+      | (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
+      | (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
+      | (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
+      | (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
+   )
#endif

// Define conversion macros using the basic byte swapping macros.
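[Editorial aside, not part of the patch: the small standalone C sketch below mirrors the portable bswap32 fallback from the hunk above so its mask-and-shift behavior can be checked without the configure machinery. When the new HAVE___BUILTIN_BSWAPXX check succeeds, the patch uses __builtin_bswap32 instead of this macro.]

    #include <stdint.h>
    #include <stdio.h>

    /* Copy of the portable fallback; only used when the compiler does not
     * provide __builtin_bswap32. */
    #define bswap32(n) (uint32_t)( \
              (((n) & UINT32_C(0x000000FF)) << 24) \
            | (((n) & UINT32_C(0x0000FF00)) << 8) \
            | (((n) & UINT32_C(0x00FF0000)) >> 8) \
            | (((n) & UINT32_C(0xFF000000)) >> 24) \
    )

    int main(void)
    {
        /* Byte swapping does not depend on host endianness:
         * 0x11223344 always becomes 0x44332211. */
        printf("%08X\n", (unsigned)bswap32(UINT32_C(0x11223344)));
        return 0;
    }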
@@ -181,148 +185,31 @@ #endif -////////////////////////////// -// Aligned reads and writes // -////////////////////////////// - -static inline uint16_t -read16be(const uint8_t *buf) -{ - uint16_t num = *(const uint16_t *)buf; - return conv16be(num); -} - - -static inline uint16_t -read16le(const uint8_t *buf) -{ - uint16_t num = *(const uint16_t *)buf; - return conv16le(num); -} - - -static inline uint32_t -read32be(const uint8_t *buf) -{ - uint32_t num = *(const uint32_t *)buf; - return conv32be(num); -} - - -static inline uint32_t -read32le(const uint8_t *buf) -{ - uint32_t num = *(const uint32_t *)buf; - return conv32le(num); -} - - -static inline uint64_t -read64be(const uint8_t *buf) -{ - uint64_t num = *(const uint64_t *)buf; - return conv64be(num); -} - - -static inline uint64_t -read64le(const uint8_t *buf) -{ - uint64_t num = *(const uint64_t *)buf; - return conv64le(num); -} - - -// NOTE: Possible byte swapping must be done in a macro to allow GCC -// to optimize byte swapping of constants when using glibc's or *BSD's -// byte swapping macros. The actual write is done in an inline function -// to make type checking of the buf pointer possible similarly to readXXYe() -// functions. - -#define write16be(buf, num) write16ne((buf), conv16be(num)) -#define write16le(buf, num) write16ne((buf), conv16le(num)) -#define write32be(buf, num) write32ne((buf), conv32be(num)) -#define write32le(buf, num) write32ne((buf), conv32le(num)) -#define write64be(buf, num) write64ne((buf), conv64be(num)) -#define write64le(buf, num) write64ne((buf), conv64le(num)) - - -static inline void -write16ne(uint8_t *buf, uint16_t num) -{ - *(uint16_t *)buf = num; - return; -} - - -static inline void -write32ne(uint8_t *buf, uint32_t num) -{ - *(uint32_t *)buf = num; - return; -} - - -static inline void -write64ne(uint8_t *buf, uint64_t num) -{ - *(uint64_t *)buf = num; - return; -} - - //////////////////////////////// // Unaligned reads and writes // //////////////////////////////// // The traditional way of casting e.g. *(const uint16_t *)uint8_pointer -// is bad (at least) because compilers can emit vector instructions that -// require aligned pointers even if non-vector instructions work with -// unaligned pointers. +// is bad even if the uint8_pointer is properly aligned because this kind +// of casts break strict aliasing rules and result in undefined behavior. +// With unaligned pointers it's even worse: compilers may emit vector +// instructions that require aligned pointers even if non-vector +// instructions work with unaligned pointers. // // Using memcpy() is the standard compliant way to do unaligned access. // Many modern compilers inline it so there is no function call overhead. -// -// However, it seems that some compilers generate better code with a cast -// to a packed struct than with memcpy(): -// - Old GCC versions (early 4.x and older) on x86 -// - GCC <= 8.2 (and possibly newer) on ARMv5 (but ARMv5 is old and maybe -// doesn't matter so much) -// - GCC <= 5.x on ARMv7 (on 4.x neither is great but packed is less bad) -// - Intel C Compiler <= 19 (and possibly newer) -// -// GCC on ARMv6 is weird: -// - GCC >= 6.x is better with memcpy() than with a packed struct. -// - On GCC < 6 neither method is good, but packed seems less bad. -// -// https://gcc.godbolt.org/ was useful for seeing what kind of code is -// generated by different compilers on different archs. Note that one -// may need to try a little less trivial code than than these functions -// alone to spot differences. 
For example this is better with packed method -// on Intel C Compiler 19: -// -// int foo(const uint8_t *a, const uint8_t *b) -// { -// return unaligned_read16ne(a) == unaligned_read16ne(b); -// } -// -// Based on the above information, prefer the memcpy() method in -// general (including all Clang versions), but use the packed struct -// with GCC 5.x and older and with the Intel C Compiler. This isn't -// optimal but at least it covers some known special cases. -#if defined(__GNUC__) && !defined(__clang__) \ - && (__GNUC__ < 6 || defined(__INTEL_COMPILER)) -# define TUKLIB_UNALIGNED_WITH_PACKED 1 -#endif - +// For those compilers that don't handle the memcpy() method well, the +// old casting method (that violates strict aliasing) can be requested at +// build time. A third method, casting to a packed struct, would also be +// an option but isn't provided to keep things simpler (it's already a mess). +// Hopefully this is flexible enough in practice. static inline uint16_t unaligned_read16ne(const uint8_t *buf) { -#ifdef TUKLIB_UNALIGNED_WITH_PACKED - struct __attribute__((__packed__)) s { uint16_t v; }; - const struct s *p = (const struct s *)buf; - return p->v; +#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ + && defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) + return *(const uint16_t *)buf; #else uint16_t num; memcpy(&num, buf, sizeof(num)); @@ -334,10 +221,9 @@ unaligned_read16ne(const uint8_t *buf) static inline uint32_t unaligned_read32ne(const uint8_t *buf) { -#ifdef TUKLIB_UNALIGNED_WITH_PACKED - struct __attribute__((__packed__)) s { uint32_t v; }; - const struct s *p = (const struct s *)buf; - return p->v; +#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ + && defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) + return *(const uint32_t *)buf; #else uint32_t num; memcpy(&num, buf, sizeof(num)); @@ -349,10 +235,9 @@ unaligned_read32ne(const uint8_t *buf) static inline uint64_t unaligned_read64ne(const uint8_t *buf) { -#ifdef TUKLIB_UNALIGNED_WITH_PACKED - struct __attribute__((__packed__)) s { uint64_t v; }; - const struct s *p = (const struct s *)buf; - return p->v; +#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ + && defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) + return *(const uint64_t *)buf; #else uint64_t num; memcpy(&num, buf, sizeof(num)); @@ -364,10 +249,9 @@ unaligned_read64ne(const uint8_t *buf) static inline void unaligned_write16ne(uint8_t *buf, uint16_t num) { -#ifdef TUKLIB_UNALIGNED_WITH_PACKED - struct __attribute__((__packed__)) s { uint16_t v; }; - struct s *p = (struct s *)buf; - p->v = num; +#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ + && defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) + *(uint16_t *)buf = num; #else memcpy(buf, &num, sizeof(num)); #endif @@ -378,10 +262,9 @@ unaligned_write16ne(uint8_t *buf, uint16_t num) static inline void unaligned_write32ne(uint8_t *buf, uint32_t num) { -#ifdef TUKLIB_UNALIGNED_WITH_PACKED - struct __attribute__((__packed__)) s { uint32_t v; }; - struct s *p = (struct s *)buf; - p->v = num; +#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ + && defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) + *(uint32_t *)buf = num; #else memcpy(buf, &num, sizeof(num)); #endif @@ -392,10 +275,9 @@ unaligned_write32ne(uint8_t *buf, uint32_t num) static inline void unaligned_write64ne(uint8_t *buf, uint64_t num) { -#ifdef TUKLIB_UNALIGNED_WITH_PACKED - struct __attribute__((__packed__)) s { uint64_t v; }; - struct s *p = (struct s *)buf; - p->v = num; +#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ + && defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) + *(uint64_t *)buf = num; #else memcpy(buf, &num, 
sizeof(num));
#endif
@@ -403,12 +285,6 @@ unaligned_write64ne(uint8_t *buf, uint64_t num)
}
-// NOTE: TUKLIB_FAST_UNALIGNED_ACCESS indicates only support for 16-bit and
-// 32-bit unaligned integer loads and stores. It's possible that 64-bit
-// unaligned access doesn't work or is slower than byte-by-byte access.
-// Since unaligned 64-bit is probably not needed as often as 16-bit or
-// 32-bit, we simply don't support 64-bit unaligned access for now.
-
static inline uint16_t
unaligned_read16be(const uint8_t *buf)
{
@@ -467,8 +343,11 @@ unaligned_read32le(const uint8_t *buf)
}
+// NOTE: Possible byte swapping must be done in a macro to allow the compiler
+// to optimize byte swapping of constants when using glibc's or *BSD's
+// byte swapping macros. The actual write is done in an inline function
+// to make type checking of the buf pointer possible.
#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
-// Like in the aligned case, these need to be macros.
#  define unaligned_write16be(buf, num) \
      unaligned_write16ne(buf, conv16be(num))
#  define unaligned_write32be(buf, num) \
      unaligned_write32ne(buf, conv32be(num))
@@ -531,6 +410,181 @@ unaligned_write32le(uint8_t *buf, uint32_t num)
#endif
+
+//////////////////////////////
+// Aligned reads and writes //
+//////////////////////////////
+
+// Separate functions for aligned reads and writes are provided since on
+// strict-align archs aligned access is much faster than unaligned access.
+//
+// Just like in the unaligned case, memcpy() is needed to avoid
+// strict aliasing violations. However, on archs that don't support
+// unaligned access the compiler cannot know that the pointers given
+// to memcpy() are aligned which results in slow code. As of C11 there is
+// no standard way to tell the compiler that we know that the address is
+// aligned but some compilers have language extensions to do that. With
+// such language extensions the memcpy() method gives excellent results.
+//
+// What to do on a strict-align system when no known language extensions
+// are available? Falling back to byte-by-byte access would be safe but ruin
+// optimizations that have been made specifically with aligned access in mind.
+// As a compromise, aligned reads will fall back to non-compliant type punning
+// but aligned writes will be byte-by-byte, that is, fast reads are preferred
+// over fast writes. This obviously isn't great but hopefully it's a working
+// compromise for now.
+//
+// __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
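[Editorial aside before the patch continues with tuklib_memcpy_aligned below: this sketch shows the pattern just described, memcpy() for the access itself plus an alignment hint for the compiler. It assumes a GCC- or Clang-compatible compiler where __builtin_assume_aligned exists; the example_read32_aligned name is invented for the example and is not part of the patch.]

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static inline uint32_t
    example_read32_aligned(const uint8_t *buf)
    {
        uint32_t num;
    #if defined(__GNUC__) || defined(__clang__)
        /* The hint lets a strict-align target emit a single aligned load
         * instead of a byte-by-byte copy. */
        memcpy(&num, __builtin_assume_aligned(buf, sizeof(num)), sizeof(num));
    #else
        memcpy(&num, buf, sizeof(num));
    #endif
        return num;
    }

    int main(void)
    {
        /* _Alignas (C11) guarantees the demo buffer really is 4-byte aligned. */
        _Alignas(uint32_t) uint8_t buf[4] = { 0x01, 0x02, 0x03, 0x04 };
        /* Native endian read: prints 04030201 on a little endian machine. */
        printf("%08X\n", (unsigned)example_read32_aligned(buf));
        return 0;
    }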
+#ifdef HAVE___BUILTIN_ASSUME_ALIGNED +# define tuklib_memcpy_aligned(dest, src, size) \ + memcpy(dest, __builtin_assume_aligned(src, size), size) +#else +# define tuklib_memcpy_aligned(dest, src, size) \ + memcpy(dest, src, size) +# ifndef TUKLIB_FAST_UNALIGNED_ACCESS +# define TUKLIB_USE_UNSAFE_ALIGNED_READS 1 +# endif +#endif + + +static inline uint16_t +read16ne(const uint8_t *buf) +{ +#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \ + || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS) + return *(const uint16_t *)buf; +#else + uint16_t num; + tuklib_memcpy_aligned(&num, buf, sizeof(num)); + return num; +#endif +} + + +static inline uint32_t +read32ne(const uint8_t *buf) +{ +#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \ + || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS) + return *(const uint32_t *)buf; +#else + uint32_t num; + tuklib_memcpy_aligned(&num, buf, sizeof(num)); + return num; +#endif +} + + +static inline uint64_t +read64ne(const uint8_t *buf) +{ +#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \ + || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS) + return *(const uint64_t *)buf; +#else + uint64_t num; + tuklib_memcpy_aligned(&num, buf, sizeof(num)); + return num; +#endif +} + + +static inline void +write16ne(uint8_t *buf, uint16_t num) +{ +#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING + *(uint16_t *)buf = num; +#else + tuklib_memcpy_aligned(buf, &num, sizeof(num)); +#endif + return; +} + + +static inline void +write32ne(uint8_t *buf, uint32_t num) +{ +#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING + *(uint32_t *)buf = num; +#else + tuklib_memcpy_aligned(buf, &num, sizeof(num)); +#endif + return; +} + + +static inline void +write64ne(uint8_t *buf, uint64_t num) +{ +#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING + *(uint64_t *)buf = num; +#else + tuklib_memcpy_aligned(buf, &num, sizeof(num)); +#endif + return; +} + + +static inline uint16_t +read16be(const uint8_t *buf) +{ + uint16_t num = read16ne(buf); + return conv16be(num); +} + + +static inline uint16_t +read16le(const uint8_t *buf) +{ + uint16_t num = read16ne(buf); + return conv16le(num); +} + + +static inline uint32_t +read32be(const uint8_t *buf) +{ + uint32_t num = read32ne(buf); + return conv32be(num); +} + + +static inline uint32_t +read32le(const uint8_t *buf) +{ + uint32_t num = read32ne(buf); + return conv32le(num); +} + + +static inline uint64_t +read64be(const uint8_t *buf) +{ + uint64_t num = read64ne(buf); + return conv64be(num); +} + + +static inline uint64_t +read64le(const uint8_t *buf) +{ + uint64_t num = read64ne(buf); + return conv64le(num); +} + + +// These need to be macros like in the unaligned case. 
+#define write16be(buf, num) write16ne((buf), conv16be(num)) +#define write16le(buf, num) write16ne((buf), conv16le(num)) +#define write32be(buf, num) write32ne((buf), conv32be(num)) +#define write32le(buf, num) write32ne((buf), conv32le(num)) +#define write64be(buf, num) write64ne((buf), conv64be(num)) +#define write64le(buf, num) write64ne((buf), conv64le(num)) + + +//////////////////// +// Bit operations // +//////////////////// + static inline uint32_t bsr32(uint32_t n) { @@ -558,27 +612,27 @@ bsr32(uint32_t n) #else uint32_t i = 31; - if ((n & UINT32_C(0xFFFF0000)) == 0) { + if ((n & 0xFFFF0000) == 0) { n <<= 16; i = 15; } - if ((n & UINT32_C(0xFF000000)) == 0) { + if ((n & 0xFF000000) == 0) { n <<= 8; i -= 8; } - if ((n & UINT32_C(0xF0000000)) == 0) { + if ((n & 0xF0000000) == 0) { n <<= 4; i -= 4; } - if ((n & UINT32_C(0xC0000000)) == 0) { + if ((n & 0xC0000000) == 0) { n <<= 2; i -= 2; } - if ((n & UINT32_C(0x80000000)) == 0) + if ((n & 0x80000000) == 0) --i; return i; @@ -610,27 +664,27 @@ clz32(uint32_t n) #else uint32_t i = 0; - if ((n & UINT32_C(0xFFFF0000)) == 0) { + if ((n & 0xFFFF0000) == 0) { n <<= 16; i = 16; } - if ((n & UINT32_C(0xFF000000)) == 0) { + if ((n & 0xFF000000) == 0) { n <<= 8; i += 8; } - if ((n & UINT32_C(0xF0000000)) == 0) { + if ((n & 0xF0000000) == 0) { n <<= 4; i += 4; } - if ((n & UINT32_C(0xC0000000)) == 0) { + if ((n & 0xC0000000) == 0) { n <<= 2; i += 2; } - if ((n & UINT32_C(0x80000000)) == 0) + if ((n & 0x80000000) == 0) ++i; return i; @@ -660,27 +714,27 @@ ctz32(uint32_t n) #else uint32_t i = 0; - if ((n & UINT32_C(0x0000FFFF)) == 0) { + if ((n & 0x0000FFFF) == 0) { n >>= 16; i = 16; } - if ((n & UINT32_C(0x000000FF)) == 0) { + if ((n & 0x000000FF) == 0) { n >>= 8; i += 8; } - if ((n & UINT32_C(0x0000000F)) == 0) { + if ((n & 0x0000000F) == 0) { n >>= 4; i += 4; } - if ((n & UINT32_C(0x00000003)) == 0) { + if ((n & 0x00000003) == 0) { n >>= 2; i += 2; } - if ((n & UINT32_C(0x00000001)) == 0) + if ((n & 0x00000001) == 0) ++i; return i;
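[Editorial aside, not from the patch: a standalone reference sketch of what the bit scan helpers touched in the last hunks compute. The ref_* names and plain loops are invented here as a readable stand-in for the compiler intrinsics and shift-based fallbacks shown above; for any non-zero 32-bit n, bsr32(n) == 31 - clz32(n).]

    #include <stdint.h>
    #include <stdio.h>

    /* Index of the highest set bit (bit scan reverse). */
    static uint32_t ref_bsr32(uint32_t n)
    {
        uint32_t i = 0;
        while (n >>= 1)
            ++i;
        return i;
    }

    /* Number of trailing zero bits; n must be non-zero. */
    static uint32_t ref_ctz32(uint32_t n)
    {
        uint32_t i = 0;
        while ((n & 1) == 0) {
            n >>= 1;
            ++i;
        }
        return i;
    }

    int main(void)
    {
        /* 0x00010000 has only bit 16 set, so bsr32 = 16, clz32 = 15, ctz32 = 16. */
        uint32_t n = UINT32_C(0x00010000);
        printf("bsr32 = %u, clz32 = %u, ctz32 = %u\n",
                (unsigned)ref_bsr32(n),
                (unsigned)(31 - ref_bsr32(n)),
                (unsigned)ref_ctz32(n));
        return 0;
    }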