Revise tuklib_integer.h and .m4.

Add a configure option --enable-unsafe-type-punning to get the old non-conforming memory access methods. It can be useful with old compilers or in some other less typical situations but shouldn't normally be used. Omit the packed struct trick for unaligned access. While it's best in some cases, this is simpler. If the memcpy trick doesn't work, one can request unsafe type punning from configure. Because CRC32/CRC64 code needs fast aligned reads, if no very safe way to do it is found, type punning is used as a fallback. This sucks but since it currently works in practice, it seems to be the least bad option. It's never needed with GCC >= 4.7 or Clang >= 3.6 since these support __builtin_assume_aligned and thus fast aligned access can be done with the memcpy trick. Other things: - Support GCC/Clang __builtin_bswapXX - Cleaner bswap fallback macros - Minor cleanups
2026-02-04 10:58:07 +00:00 · 2019-12-31 00:18:24 +02:00 · 2019-12-31 00:18:24 +02:00 · 77bc5bc6dd
commit 77bc5bc6dd
parent 8b72950a6b
2 changed files with 316 additions and 218 deletions
--- a/m4/tuklib_integer.m4
+++ b/m4/tuklib_integer.m4
@ -45,11 +45,26 @@ main(void)
 	])dnl
 fi

+AC_MSG_CHECKING([if __builtin_bswap16/32/64 are supported])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[]],
+			[[__builtin_bswap16(1);
+			__builtin_bswap32(1);
+			__builtin_bswap64(1);]])],
+	[
+		AC_DEFINE([HAVE___BUILTIN_BSWAPXX], [1],
+			[Define to 1 if the GNU C extensions
+			__builtin_bswap16/32/64 are supported.])
+		AC_MSG_RESULT([yes])
+	], [
+		AC_MSG_RESULT([no])
+	])
+
 AC_MSG_CHECKING([if unaligned memory access should be used])
 AC_ARG_ENABLE([unaligned-access], AS_HELP_STRING([--enable-unaligned-access],
 		[Enable if the system supports *fast* unaligned memory access
 		with 16-bit and 32-bit integers. By default, this is enabled
-		only on x86, x86_64, and big endian PowerPC.]),
+		only on x86, x86_64, big endian PowerPC,
+		and some ARM systems.]),
 	[], [enable_unaligned_access=auto])
 if test "x$enable_unaligned_access" = xauto ; then
 	# TODO: There may be other architectures, on which unaligned access
@ -82,4 +97,33 @@ if test "x$enable_unaligned_access" = xyes ; then
 else
 	AC_MSG_RESULT([no])
 fi
+
+AC_MSG_CHECKING([if unsafe type punning should be used])
+AC_ARG_ENABLE([unsafe-type-punning],
+	AS_HELP_STRING([--enable-unsafe-type-punning],
+		[This introduces strict aliasing violations and may result
+		in broken code. However, this might improve performance in
+		some cases, especially with old compilers (e.g.
+		GCC 3 and early 4.x on x86, GCC < 6 on ARMv6 and ARMv7).]),
+	[], [enable_unsafe_type_punning=no])
+if test "x$enable_unsafe_type_punning" = xyes ; then
+	AC_DEFINE([TUKLIB_USE_UNSAFE_TYPE_PUNNING], [1], [Define to 1 to use
+		unsafe type punning, e.g. char *x = ...; *(int *)x = 123;
+		which violates strict aliasing rules and thus is
+		undefined behavior and might result in broken code.])
+	AC_MSG_RESULT([yes])
+else
+	AC_MSG_RESULT([no])
+fi
+
+AC_MSG_CHECKING([if __builtin_assume_aligned is supported])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[]], [[__builtin_assume_aligned("", 1);]])],
+	[
+		AC_DEFINE([HAVE___BUILTIN_ASSUME_ALIGNED], [1],
+			[Define to 1 if the GNU C extension
+			__builtin_assume_aligned is supported.])
+		AC_MSG_RESULT([yes])
+	], [
+		AC_MSG_RESULT([no])
+	])
 ])dnl
--- a/src/common/tuklib_integer.h
+++ b/src/common/tuklib_integer.h
@ -6,22 +6,26 @@
 /// This file provides macros or functions to do some basic integer and bit
 /// operations.
 ///
-/// Endianness related integer operations (XX = 16, 32, or 64; Y = b or l):
+/// Native endian inline functions (XX = 16, 32, or 64):
+///   - Unaligned native endian reads: unaligned_readXXne(ptr)
+///   - Unaligned native endian writes: unaligned_writeXXne(ptr, num)
+///   - Aligned native endian reads: readXXne(ptr)
+///   - Aligned native endian writes: writeXXne(ptr, num)
+///
+/// Endianness-converting integer operations (these can be macros!)
+/// (XX = 16, 32, or 64; Y = b or l):
 ///   - Byte swapping: bswapXX(num)
-///   - Byte order conversions to/from native: convXXYe(num)
+///   - Byte order conversions to/from native (byteswaps if Y isn't
+///     the native endianness): convXXYe(num)
 ///   - Aligned reads: readXXYe(ptr)
 ///   - Aligned writes: writeXXYe(ptr, num)
 ///   - Unaligned reads (16/32-bit only): unaligned_readXXYe(ptr)
 ///   - Unaligned writes (16/32-bit only): unaligned_writeXXYe(ptr, num)
 ///
-/// Since they can macros, the arguments should have no side effects since
-/// they may be evaluated more than once.
+/// Since the above can macros, the arguments should have no side effects
+/// because they may be evaluated more than once.
 ///
-/// \todo       PowerPC and possibly some other architectures support
-///             byte swapping load and store instructions. This file
-///             doesn't take advantage of those instructions.
-///
-/// Bit scan operations for non-zero 32-bit integers:
+/// Bit scan operations for non-zero 32-bit integers (inline functions):
 ///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
 ///   - Count leading zeros: clz32(num)
 ///   - Count trailing zeros: ctz32(num)
@ -44,12 +48,24 @@
 #include "tuklib_common.h"
 #include <string.h>

+// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
+// and such functions.
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
+#	include <immintrin.h>
+#endif

-////////////////////////////////////////
-// Operating system specific features //
-////////////////////////////////////////

-#if defined(HAVE_BYTESWAP_H)
+///////////////////
+// Byte swapping //
+///////////////////
+
+#if defined(HAVE___BUILTIN_BSWAPXX)
+	// GCC >= 4.8 and Clang
+#	define bswap16(n) __builtin_bswap16(n)
+#	define bswap32(n) __builtin_bswap32(n)
+#	define bswap64(n) __builtin_bswap64(n)
+
+#elif defined(HAVE_BYTESWAP_H)
 	// glibc, uClibc, dietlibc
 #	include <byteswap.h>
 #	ifdef HAVE_BSWAP_16
@ -98,45 +114,33 @@
 #	endif
 #endif

-
-////////////////////////////////
-// Compiler-specific features //
-////////////////////////////////
-
-// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
-// and such functions.
-#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
-#	include <immintrin.h>
-#endif
-
-
-///////////////////
-// Byte swapping //
-///////////////////
-
 #ifndef bswap16
-#	define bswap16(num) \
-		(((uint16_t)(num) << 8) | ((uint16_t)(num) >> 8))
+#	define bswap16(n) (uint16_t)( \
+		  (((n) & 0x00FFU) << 8) \
+		| (((n) & 0xFF00U) >> 8) \
+	)
 #endif

 #ifndef bswap32
-#	define bswap32(num) \
-		( (((uint32_t)(num) << 24)                       ) \
-		| (((uint32_t)(num) <<  8) & UINT32_C(0x00FF0000)) \
-		| (((uint32_t)(num) >>  8) & UINT32_C(0x0000FF00)) \
-		| (((uint32_t)(num) >> 24)                       ) )
+#	define bswap32(n) (uint32_t)( \
+		  (((n) & UINT32_C(0x000000FF)) << 24) \
+		| (((n) & UINT32_C(0x0000FF00)) << 8) \
+		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
+		| (((n) & UINT32_C(0xFF000000)) >> 24) \
+	)
 #endif

 #ifndef bswap64
-#	define bswap64(num) \
-		( (((uint64_t)(num) << 56)                               ) \
-		| (((uint64_t)(num) << 40) & UINT64_C(0x00FF000000000000)) \
-		| (((uint64_t)(num) << 24) & UINT64_C(0x0000FF0000000000)) \
-		| (((uint64_t)(num) <<  8) & UINT64_C(0x000000FF00000000)) \
-		| (((uint64_t)(num) >>  8) & UINT64_C(0x00000000FF000000)) \
-		| (((uint64_t)(num) >> 24) & UINT64_C(0x0000000000FF0000)) \
-		| (((uint64_t)(num) >> 40) & UINT64_C(0x000000000000FF00)) \
-		| (((uint64_t)(num) >> 56)                               ) )
+#	define bswap64(n) (uint64_t)( \
+		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
+		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
+		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
+		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
+		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
+		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
+		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
+		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
+	)
 #endif

 // Define conversion macros using the basic byte swapping macros.
@ -181,148 +185,31 @@
 #endif


-//////////////////////////////
-// Aligned reads and writes //
-//////////////////////////////
-
-static inline uint16_t
-read16be(const uint8_t *buf)
-{
-	uint16_t num = *(const uint16_t *)buf;
-	return conv16be(num);
-}
-
-
-static inline uint16_t
-read16le(const uint8_t *buf)
-{
-	uint16_t num = *(const uint16_t *)buf;
-	return conv16le(num);
-}
-
-
-static inline uint32_t
-read32be(const uint8_t *buf)
-{
-	uint32_t num = *(const uint32_t *)buf;
-	return conv32be(num);
-}
-
-
-static inline uint32_t
-read32le(const uint8_t *buf)
-{
-	uint32_t num = *(const uint32_t *)buf;
-	return conv32le(num);
-}
-
-
-static inline uint64_t
-read64be(const uint8_t *buf)
-{
-	uint64_t num = *(const uint64_t *)buf;
-	return conv64be(num);
-}
-
-
-static inline uint64_t
-read64le(const uint8_t *buf)
-{
-	uint64_t num = *(const uint64_t *)buf;
-	return conv64le(num);
-}
-
-
-// NOTE: Possible byte swapping must be done in a macro to allow GCC
-// to optimize byte swapping of constants when using glibc's or *BSD's
-// byte swapping macros. The actual write is done in an inline function
-// to make type checking of the buf pointer possible similarly to readXXYe()
-// functions.
-
-#define write16be(buf, num) write16ne((buf), conv16be(num))
-#define write16le(buf, num) write16ne((buf), conv16le(num))
-#define write32be(buf, num) write32ne((buf), conv32be(num))
-#define write32le(buf, num) write32ne((buf), conv32le(num))
-#define write64be(buf, num) write64ne((buf), conv64be(num))
-#define write64le(buf, num) write64ne((buf), conv64le(num))
-
-
-static inline void
-write16ne(uint8_t *buf, uint16_t num)
-{
-	*(uint16_t *)buf = num;
-	return;
-}
-
-
-static inline void
-write32ne(uint8_t *buf, uint32_t num)
-{
-	*(uint32_t *)buf = num;
-	return;
-}
-
-
-static inline void
-write64ne(uint8_t *buf, uint64_t num)
-{
-	*(uint64_t *)buf = num;
-	return;
-}
-
-
 ////////////////////////////////
 // Unaligned reads and writes //
 ////////////////////////////////

 // The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
-// is bad (at least) because compilers can emit vector instructions that
-// require aligned pointers even if non-vector instructions work with
-// unaligned pointers.
+// is bad even if the uint8_pointer is properly aligned because this kind
+// of casts break strict aliasing rules and result in undefined behavior.
+// With unaligned pointers it's even worse: compilers may emit vector
+// instructions that require aligned pointers even if non-vector
+// instructions work with unaligned pointers.
 //
 // Using memcpy() is the standard compliant way to do unaligned access.
 // Many modern compilers inline it so there is no function call overhead.
-//
-// However, it seems that some compilers generate better code with a cast
-// to a packed struct than with memcpy():
-//   - Old GCC versions (early 4.x and older) on x86
-//   - GCC <= 8.2 (and possibly newer) on ARMv5 (but ARMv5 is old and maybe
-//     doesn't matter so much)
-//   - GCC <= 5.x on ARMv7 (on 4.x neither is great but packed is less bad)
-//   - Intel C Compiler <= 19 (and possibly newer)
-//
-// GCC on ARMv6 is weird:
-//   - GCC >= 6.x is better with memcpy() than with a packed struct.
-//   - On GCC < 6 neither method is good, but packed seems less bad.
-//
-// https://gcc.godbolt.org/ was useful for seeing what kind of code is
-// generated by different compilers on different archs. Note that one
-// may need to try a little less trivial code than than these functions
-// alone to spot differences. For example this is better with packed method
-// on Intel C Compiler 19:
-//
-//     int foo(const uint8_t *a, const uint8_t *b)
-//     {
-//         return unaligned_read16ne(a) == unaligned_read16ne(b);
-//     }
-//
-// Based on the above information, prefer the memcpy() method in
-// general (including all Clang versions), but use the packed struct
-// with GCC 5.x and older and with the Intel C Compiler. This isn't
-// optimal but at least it covers some known special cases.
-#if defined(__GNUC__) && !defined(__clang__) \
-		&& (__GNUC__ < 6 || defined(__INTEL_COMPILER))
-#	define TUKLIB_UNALIGNED_WITH_PACKED 1
-#endif
-
+// For those compilers that don't handle the memcpy() method well, the
+// old casting method (that violates strict aliasing) can be requested at
+// build time. A third method, casting to a packed struct, would also be
+// an option but isn't provided to keep things simpler (it's already a mess).
+// Hopefully this is flexible enough in practice.

 static inline uint16_t
 unaligned_read16ne(const uint8_t *buf)
 {
-#ifdef TUKLIB_UNALIGNED_WITH_PACKED
-	struct __attribute__((__packed__)) s { uint16_t v; };
-	const struct s *p = (const struct s *)buf;
-	return p->v;
+#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
+	return *(const uint16_t *)buf;
 #else
 	uint16_t num;
 	memcpy(&num, buf, sizeof(num));
@ -334,10 +221,9 @@ unaligned_read16ne(const uint8_t *buf)
 static inline uint32_t
 unaligned_read32ne(const uint8_t *buf)
 {
-#ifdef TUKLIB_UNALIGNED_WITH_PACKED
-	struct __attribute__((__packed__)) s { uint32_t v; };
-	const struct s *p = (const struct s *)buf;
-	return p->v;
+#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
+	return *(const uint32_t *)buf;
 #else
 	uint32_t num;
 	memcpy(&num, buf, sizeof(num));
@ -349,10 +235,9 @@ unaligned_read32ne(const uint8_t *buf)
 static inline uint64_t
 unaligned_read64ne(const uint8_t *buf)
 {
-#ifdef TUKLIB_UNALIGNED_WITH_PACKED
-	struct __attribute__((__packed__)) s { uint64_t v; };
-	const struct s *p = (const struct s *)buf;
-	return p->v;
+#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
+	return *(const uint64_t *)buf;
 #else
 	uint64_t num;
 	memcpy(&num, buf, sizeof(num));
@ -364,10 +249,9 @@ unaligned_read64ne(const uint8_t *buf)
 static inline void
 unaligned_write16ne(uint8_t *buf, uint16_t num)
 {
-#ifdef TUKLIB_UNALIGNED_WITH_PACKED
-	struct __attribute__((__packed__)) s { uint16_t v; };
-	struct s *p = (struct s *)buf;
-	p->v = num;
+#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
+	*(uint16_t *)buf = num;
 #else
 	memcpy(buf, &num, sizeof(num));
 #endif
@ -378,10 +262,9 @@ unaligned_write16ne(uint8_t *buf, uint16_t num)
 static inline void
 unaligned_write32ne(uint8_t *buf, uint32_t num)
 {
-#ifdef TUKLIB_UNALIGNED_WITH_PACKED
-	struct __attribute__((__packed__)) s { uint32_t v; };
-	struct s *p = (struct s *)buf;
-	p->v = num;
+#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
+	*(uint32_t *)buf = num;
 #else
 	memcpy(buf, &num, sizeof(num));
 #endif
@ -392,10 +275,9 @@ unaligned_write32ne(uint8_t *buf, uint32_t num)
 static inline void
 unaligned_write64ne(uint8_t *buf, uint64_t num)
 {
-#ifdef TUKLIB_UNALIGNED_WITH_PACKED
-	struct __attribute__((__packed__)) s { uint64_t v; };
-	struct s *p = (struct s *)buf;
-	p->v = num;
+#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
+		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
+	*(uint64_t *)buf = num;
 #else
 	memcpy(buf, &num, sizeof(num));
 #endif
@ -403,12 +285,6 @@ unaligned_write64ne(uint8_t *buf, uint64_t num)
 }


-// NOTE: TUKLIB_FAST_UNALIGNED_ACCESS indicates only support for 16-bit and
-// 32-bit unaligned integer loads and stores. It's possible that 64-bit
-// unaligned access doesn't work or is slower than byte-by-byte access.
-// Since unaligned 64-bit is probably not needed as often as 16-bit or
-// 32-bit, we simply don't support 64-bit unaligned access for now.
-
 static inline uint16_t
 unaligned_read16be(const uint8_t *buf)
 {
@ -467,8 +343,11 @@ unaligned_read32le(const uint8_t *buf)
 }


+// NOTE: Possible byte swapping must be done in a macro to allow the compiler
+// to optimize byte swapping of constants when using glibc's or *BSD's
+// byte swapping macros. The actual write is done in an inline function
+// to make type checking of the buf pointer possible.
 #if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
-// Like in the aligned case, these need to be macros.
 #	define unaligned_write16be(buf, num) \
 			unaligned_write16ne(buf, conv16be(num))
 #	define unaligned_write32be(buf, num) \
@ -531,6 +410,181 @@ unaligned_write32le(uint8_t *buf, uint32_t num)
 #endif


+//////////////////////////////
+// Aligned reads and writes //
+//////////////////////////////
+
+// Separate functions for aligned reads and writes are provided since on
+// strict-align archs aligned access is much faster than unaligned access.
+//
+// Just like in the unaligned case, memcpy() is needed to avoid
+// strict aliasing violations. However, on archs that don't support
+// unaligned access the compiler cannot know that the pointers given
+// to memcpy() are aligned which results in slow code. As of C11 there is
+// no standard way to tell the compiler that we know that the address is
+// aligned but some compilers have language extensions to do that. With
+// such language extensions the memcpy() method gives excellent results.
+//
+// What to do on a strict-align system when no known language extentensions
+// are available? Falling back to byte-by-byte access would be safe but ruin
+// optimizations that have been made specifically with aligned access in mind.
+// As a compromise, aligned reads will fall back to non-compliant type punning
+// but aligned writes will be byte-by-byte, that is, fast reads are preferred
+// over fast writes. This obviously isn't great but hopefully it's a working
+// compromise for now.
+//
+// __builtin_assume_aligned is support by GCC >= 4.7 and clang >= 3.6.
+#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
+#	define tuklib_memcpy_aligned(dest, src, size) \
+		memcpy(dest, __builtin_assume_aligned(src, size), size)
+#else
+#	define tuklib_memcpy_aligned(dest, src, size) \
+		memcpy(dest, src, size)
+#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
+#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
+#	endif
+#endif
+
+
+static inline uint16_t
+read16ne(const uint8_t *buf)
+{
+#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
+		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
+	return *(const uint16_t *)buf;
+#else
+	uint16_t num;
+	tuklib_memcpy_aligned(&num, buf, sizeof(num));
+	return num;
+#endif
+}
+
+
+static inline uint32_t
+read32ne(const uint8_t *buf)
+{
+#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
+		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
+	return *(const uint32_t *)buf;
+#else
+	uint32_t num;
+	tuklib_memcpy_aligned(&num, buf, sizeof(num));
+	return num;
+#endif
+}
+
+
+static inline uint64_t
+read64ne(const uint8_t *buf)
+{
+#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
+		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
+	return *(const uint64_t *)buf;
+#else
+	uint64_t num;
+	tuklib_memcpy_aligned(&num, buf, sizeof(num));
+	return num;
+#endif
+}
+
+
+static inline void
+write16ne(uint8_t *buf, uint16_t num)
+{
+#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
+	*(uint16_t *)buf = num;
+#else
+	tuklib_memcpy_aligned(buf, &num, sizeof(num));
+#endif
+	return;
+}
+
+
+static inline void
+write32ne(uint8_t *buf, uint32_t num)
+{
+#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
+	*(uint32_t *)buf = num;
+#else
+	tuklib_memcpy_aligned(buf, &num, sizeof(num));
+#endif
+	return;
+}
+
+
+static inline void
+write64ne(uint8_t *buf, uint64_t num)
+{
+#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
+	*(uint64_t *)buf = num;
+#else
+	tuklib_memcpy_aligned(buf, &num, sizeof(num));
+#endif
+	return;
+}
+
+
+static inline uint16_t
+read16be(const uint8_t *buf)
+{
+	uint16_t num = read16ne(buf);
+	return conv16be(num);
+}
+
+
+static inline uint16_t
+read16le(const uint8_t *buf)
+{
+	uint16_t num = read16ne(buf);
+	return conv16le(num);
+}
+
+
+static inline uint32_t
+read32be(const uint8_t *buf)
+{
+	uint32_t num = read32ne(buf);
+	return conv32be(num);
+}
+
+
+static inline uint32_t
+read32le(const uint8_t *buf)
+{
+	uint32_t num = read32ne(buf);
+	return conv32le(num);
+}
+
+
+static inline uint64_t
+read64be(const uint8_t *buf)
+{
+	uint64_t num = read64ne(buf);
+	return conv64be(num);
+}
+
+
+static inline uint64_t
+read64le(const uint8_t *buf)
+{
+	uint64_t num = read64ne(buf);
+	return conv64le(num);
+}
+
+
+// These need to be macros like in the unaligned case.
+#define write16be(buf, num) write16ne((buf), conv16be(num))
+#define write16le(buf, num) write16ne((buf), conv16le(num))
+#define write32be(buf, num) write32ne((buf), conv32be(num))
+#define write32le(buf, num) write32ne((buf), conv32le(num))
+#define write64be(buf, num) write64ne((buf), conv64be(num))
+#define write64le(buf, num) write64ne((buf), conv64le(num))
+
+
+////////////////////
+// Bit operations //
+////////////////////
+
 static inline uint32_t
 bsr32(uint32_t n)
 {
@ -558,27 +612,27 @@ bsr32(uint32_t n)
 #else
 	uint32_t i = 31;

-	if ((n & UINT32_C(0xFFFF0000)) == 0) {
+	if ((n & 0xFFFF0000) == 0) {
 		n <<= 16;
 		i = 15;
 	}

-	if ((n & UINT32_C(0xFF000000)) == 0) {
+	if ((n & 0xFF000000) == 0) {
 		n <<= 8;
 		i -= 8;
 	}

-	if ((n & UINT32_C(0xF0000000)) == 0) {
+	if ((n & 0xF0000000) == 0) {
 		n <<= 4;
 		i -= 4;
 	}

-	if ((n & UINT32_C(0xC0000000)) == 0) {
+	if ((n & 0xC0000000) == 0) {
 		n <<= 2;
 		i -= 2;
 	}

-	if ((n & UINT32_C(0x80000000)) == 0)
+	if ((n & 0x80000000) == 0)
 		--i;

 	return i;
@ -610,27 +664,27 @@ clz32(uint32_t n)
 #else
 	uint32_t i = 0;

-	if ((n & UINT32_C(0xFFFF0000)) == 0) {
+	if ((n & 0xFFFF0000) == 0) {
 		n <<= 16;
 		i = 16;
 	}

-	if ((n & UINT32_C(0xFF000000)) == 0) {
+	if ((n & 0xFF000000) == 0) {
 		n <<= 8;
 		i += 8;
 	}

-	if ((n & UINT32_C(0xF0000000)) == 0) {
+	if ((n & 0xF0000000) == 0) {
 		n <<= 4;
 		i += 4;
 	}

-	if ((n & UINT32_C(0xC0000000)) == 0) {
+	if ((n & 0xC0000000) == 0) {
 		n <<= 2;
 		i += 2;
 	}

-	if ((n & UINT32_C(0x80000000)) == 0)
+	if ((n & 0x80000000) == 0)
 		++i;

 	return i;
@ -660,27 +714,27 @@ ctz32(uint32_t n)
 #else
 	uint32_t i = 0;

-	if ((n & UINT32_C(0x0000FFFF)) == 0) {
+	if ((n & 0x0000FFFF) == 0) {
 		n >>= 16;
 		i = 16;
 	}

-	if ((n & UINT32_C(0x000000FF)) == 0) {
+	if ((n & 0x000000FF) == 0) {
 		n >>= 8;
 		i += 8;
 	}

-	if ((n & UINT32_C(0x0000000F)) == 0) {
+	if ((n & 0x0000000F) == 0) {
 		n >>= 4;
 		i += 4;
 	}

-	if ((n & UINT32_C(0x00000003)) == 0) {
+	if ((n & 0x00000003) == 0) {
 		n >>= 2;
 		i += 2;
 	}

-	if ((n & UINT32_C(0x00000001)) == 0)
+	if ((n & 0x00000001) == 0)
 		++i;

 	return i;