From 29afef03486d461c23f57150ac5436684bff7811 Mon Sep 17 00:00:00 2001
From: Lasse Collin <lasse.collin@tukaani.org>
Date: Sat, 1 Jun 2019 18:41:16 +0300
Subject: [PATCH] tuklib_integer: Improve unaligned memory access.

Now memcpy() or GNU C packed structs are used for unaligned access
instead of type punning. See the comment in this commit for details.

Avoiding type punning with unaligned access is needed to silence
gcc -fsanitize=undefined.

New functions: unaligned_readXXne and unaligned_writeXXne where
XX is 16, 32, or 64.
---
 src/common/tuklib_integer.h | 180 +++++++++++++++++++++++++++++++++---
 1 file changed, 168 insertions(+), 12 deletions(-)

diff --git a/src/common/tuklib_integer.h b/src/common/tuklib_integer.h
index b1e84d5c..e2c7b7c8 100644
--- a/src/common/tuklib_integer.h
+++ b/src/common/tuklib_integer.h
@@ -42,6 +42,7 @@
 #define TUKLIB_INTEGER_H
 
 #include "tuklib_common.h"
+#include <string.h>
 
 
 ////////////////////////////////////////
@@ -274,61 +275,211 @@ write64ne(uint8_t *buf, uint64_t num)
 // Unaligned reads and writes //
 ////////////////////////////////
 
+// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
+// is bad (at least) because compilers can emit vector instructions that
+// require aligned pointers even if non-vector instructions work with
+// unaligned pointers.
+//
+// Using memcpy() is the standards-compliant way to do unaligned access.
+// Many modern compilers inline it so there is no function call overhead.
+//
+// However, it seems that some compilers generate better code with a cast
+// to a packed struct than with memcpy():
+// - Old GCC versions (early 4.x and older) on x86
+// - GCC <= 8.2 (and possibly newer) on ARMv5 (but ARMv5 is old and maybe
+//   doesn't matter so much)
+// - GCC <= 5.x on ARMv7 (on 4.x neither is great but packed is less bad)
+// - Intel C Compiler <= 19 (and possibly newer)
+//
+// GCC on ARMv6 is weird:
+// - GCC >= 6.x is better with memcpy() than with a packed struct.
+// - On GCC < 6 neither method is good, but packed seems less bad.
+//
+// https://gcc.godbolt.org/ was useful for seeing what kind of code is
+// generated by different compilers on different archs. Note that one may
+// need to try a little less trivial code than these functions alone to
+// spot differences. For example, this is better with the packed struct
+// method on Intel C Compiler 19:
+//
+//	int foo(const uint8_t *a, const uint8_t *b)
+//	{
+//		return unaligned_read16ne(a) == unaligned_read16ne(b);
+//	}
+//
+// Based on the above information, prefer the memcpy() method in
+// general (including all Clang versions), but use the packed struct
+// with GCC 5.x and older and with the Intel C Compiler. This isn't
+// optimal but at least it covers some known special cases.
+#if defined(__GNUC__) && !defined(__clang__) \
+		&& (__GNUC__ < 6 || defined(__INTEL_COMPILER))
+#	define TUKLIB_UNALIGNED_WITH_PACKED 1
+#endif
+
+
+static inline uint16_t
+unaligned_read16ne(const uint8_t *buf)
+{
+#ifdef TUKLIB_UNALIGNED_WITH_PACKED
+	struct __attribute__((__packed__)) s { uint16_t v; };
+	const struct s *p = (const struct s *)buf;
+	return p->v;
+#else
+	uint16_t num;
+	memcpy(&num, buf, sizeof(num));
+	return num;
+#endif
+}
+
+
+static inline uint32_t
+unaligned_read32ne(const uint8_t *buf)
+{
+#ifdef TUKLIB_UNALIGNED_WITH_PACKED
+	struct __attribute__((__packed__)) s { uint32_t v; };
+	const struct s *p = (const struct s *)buf;
+	return p->v;
+#else
+	uint32_t num;
+	memcpy(&num, buf, sizeof(num));
+	return num;
+#endif
+}
+
+
+static inline uint64_t
+unaligned_read64ne(const uint8_t *buf)
+{
+#ifdef TUKLIB_UNALIGNED_WITH_PACKED
+	struct __attribute__((__packed__)) s { uint64_t v; };
+	const struct s *p = (const struct s *)buf;
+	return p->v;
+#else
+	uint64_t num;
+	memcpy(&num, buf, sizeof(num));
+	return num;
+#endif
+}
+
+
+static inline void
+unaligned_write16ne(uint8_t *buf, uint16_t num)
+{
+#ifdef TUKLIB_UNALIGNED_WITH_PACKED
+	struct __attribute__((__packed__)) s { uint16_t v; };
+	struct s *p = (struct s *)buf;
+	p->v = num;
+#else
+	memcpy(buf, &num, sizeof(num));
+#endif
+	return;
+}
+
+
+static inline void
+unaligned_write32ne(uint8_t *buf, uint32_t num)
+{
+#ifdef TUKLIB_UNALIGNED_WITH_PACKED
+	struct __attribute__((__packed__)) s { uint32_t v; };
+	struct s *p = (struct s *)buf;
+	p->v = num;
+#else
+	memcpy(buf, &num, sizeof(num));
+#endif
+	return;
+}
+
+
+static inline void
+unaligned_write64ne(uint8_t *buf, uint64_t num)
+{
+#ifdef TUKLIB_UNALIGNED_WITH_PACKED
+	struct __attribute__((__packed__)) s { uint64_t v; };
+	struct s *p = (struct s *)buf;
+	p->v = num;
+#else
+	memcpy(buf, &num, sizeof(num));
+#endif
+	return;
+}
+
+
 // NOTE: TUKLIB_FAST_UNALIGNED_ACCESS indicates only support for 16-bit and
 // 32-bit unaligned integer loads and stores. It's possible that 64-bit
 // unaligned access doesn't work or is slower than byte-by-byte access.
 // Since unaligned 64-bit is probably not needed as often as 16-bit or
 // 32-bit, we simply don't support 64-bit unaligned access for now.
-#ifdef TUKLIB_FAST_UNALIGNED_ACCESS
-#	define unaligned_read16be read16be
-#	define unaligned_read16le read16le
-#	define unaligned_read32be read32be
-#	define unaligned_read32le read32le
-#	define unaligned_write16be write16be
-#	define unaligned_write16le write16le
-#	define unaligned_write32be write32be
-#	define unaligned_write32le write32le
-
-#else
 
 static inline uint16_t
 unaligned_read16be(const uint8_t *buf)
 {
+#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+	return conv16be(unaligned_read16ne(buf));
+#else
 	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
 	return num;
+#endif
 }
 
 
 static inline uint16_t
 unaligned_read16le(const uint8_t *buf)
 {
+#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+	return conv16le(unaligned_read16ne(buf));
+#else
 	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
 	return num;
+#endif
 }
 
 
 static inline uint32_t
 unaligned_read32be(const uint8_t *buf)
 {
+#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+	return conv32be(unaligned_read32ne(buf));
+#else
 	uint32_t num = (uint32_t)buf[0] << 24;
 	num |= (uint32_t)buf[1] << 16;
 	num |= (uint32_t)buf[2] << 8;
 	num |= (uint32_t)buf[3];
 	return num;
+#endif
 }
 
 
 static inline uint32_t
 unaligned_read32le(const uint8_t *buf)
 {
+#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+	return conv32le(unaligned_read32ne(buf));
+#else
 	uint32_t num = (uint32_t)buf[0];
 	num |= (uint32_t)buf[1] << 8;
 	num |= (uint32_t)buf[2] << 16;
 	num |= (uint32_t)buf[3] << 24;
 	return num;
+#endif
 }
 
 
+#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+// Like in the aligned case, these need to be macros.
+#	define unaligned_write16be(buf, num) \
+			unaligned_write16ne(buf, conv16be(num))
+#	define unaligned_write32be(buf, num) \
+			unaligned_write32ne(buf, conv32be(num))
+#endif
+
+#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+#	define unaligned_write16le(buf, num) \
+			unaligned_write16ne(buf, conv16le(num))
+#	define unaligned_write32le(buf, num) \
+			unaligned_write32ne(buf, conv32le(num))
+#endif
+
+
+#ifndef unaligned_write16be
 static inline void
 unaligned_write16be(uint8_t *buf, uint16_t num)
 {
@@ -336,8 +487,10 @@ unaligned_write16be(uint8_t *buf, uint16_t num)
 	buf[1] = (uint8_t)num;
 	return;
 }
+#endif
 
 
+#ifndef unaligned_write16le
 static inline void
 unaligned_write16le(uint8_t *buf, uint16_t num)
 {
@@ -345,8 +498,10 @@ unaligned_write16le(uint8_t *buf, uint16_t num)
 	buf[1] = (uint8_t)(num >> 8);
 	return;
 }
+#endif
 
 
+#ifndef unaligned_write32be
 static inline void
 unaligned_write32be(uint8_t *buf, uint32_t num)
 {
@@ -356,8 +511,10 @@ unaligned_write32be(uint8_t *buf, uint32_t num)
 	buf[3] = (uint8_t)num;
 	return;
 }
+#endif
 
 
+#ifndef unaligned_write32le
 static inline void
 unaligned_write32le(uint8_t *buf, uint32_t num)
 {
@@ -367,7 +524,6 @@ unaligned_write32le(uint8_t *buf, uint32_t num)
 	buf[1] = (uint8_t)(num >> 8);
 	buf[2] = (uint8_t)(num >> 16);
 	buf[3] = (uint8_t)(num >> 24);
 	return;
 }
-
 #endif
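
For reference, below is a minimal standalone sketch (not part of the
patch; the *_sketch names are made up for illustration) of the memcpy()
method that the new functions use by default. The commented-out cast is
the kind of access that gcc -fsanitize=undefined flags and that this
commit removes, while the memcpy() version is well-defined:

#include <assert.h>
#include <stdint.h>
#include <string.h>

// Native-endian unaligned read: copy the bytes into a properly aligned
// local variable instead of dereferencing a misaligned uint32_t pointer.
// Compilers typically inline this to a single load on targets with fast
// unaligned access.
static uint32_t
read32ne_sketch(const uint8_t *buf)
{
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
}

// Native-endian unaligned write, the mirror image of the read.
static void
write32ne_sketch(uint8_t *buf, uint32_t num)
{
	memcpy(buf, &num, sizeof(num));
}

int
main(void)
{
	uint8_t buf[8] = { 0 };

	// buf + 3 is misaligned for uint32_t, so the traditional
	// *(uint32_t *)(buf + 3) = 0xDEADBEEF; would be undefined
	// behavior here.
	write32ne_sketch(buf + 3, 0xDEADBEEF);
	assert(read32ne_sketch(buf + 3) == 0xDEADBEEF);
	return 0;
}

In tuklib the same round trip is wrapped in the unaligned_*ne()
functions above, with the conv16/conv32 endianness conversions layered
on top for the be/le variants.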