diff --git a/src/common/tuklib_integer.h b/src/common/tuklib_integer.h
index b1e84d5c..e2c7b7c8 100644
--- a/src/common/tuklib_integer.h
+++ b/src/common/tuklib_integer.h
@@ -42,6 +42,7 @@
 #define TUKLIB_INTEGER_H
 
 #include "tuklib_common.h"
+#include <string.h>
 
 
 ////////////////////////////////////////
@@ -274,61 +275,211 @@ write64ne(uint8_t *buf, uint64_t num)
 // Unaligned reads and writes //
 ////////////////////////////////
 
+// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
+// is bad (at least) because compilers can emit vector instructions that
+// require aligned pointers even if non-vector instructions work with
+// unaligned pointers.
+//
+// Using memcpy() is the standards-compliant way to do unaligned access.
+// Many modern compilers inline it so there is no function call overhead.
+//
+// However, it seems that some compilers generate better code with a cast
+// to a packed struct than with memcpy():
+//   - Old GCC versions (early 4.x and older) on x86
+//   - GCC <= 8.2 (and possibly newer) on ARMv5 (but ARMv5 is old and
+//     maybe doesn't matter so much)
+//   - GCC <= 5.x on ARMv7 (on 4.x neither is great but packed is less bad)
+//   - Intel C Compiler <= 19 (and possibly newer)
+//
+// GCC on ARMv6 is weird:
+//   - GCC >= 6.x is better with memcpy() than with a packed struct.
+//   - On GCC < 6 neither method is good, but packed seems less bad.
+//
+// https://gcc.godbolt.org/ was useful for seeing what kind of code is
+// generated by different compilers on different archs. Note that one
+// may need to try a little less trivial code than these functions alone
+// to spot differences. For example, this is better with the packed
+// method on Intel C Compiler 19:
+//
+//     int foo(const uint8_t *a, const uint8_t *b)
+//     {
+//         return unaligned_read16ne(a) == unaligned_read16ne(b);
+//     }
+//
+// Based on the above information, prefer the memcpy() method in
+// general (including all Clang versions), but use the packed struct
+// with GCC 5.x and older and with the Intel C Compiler. This isn't
+// optimal but at least it covers some known special cases.
+#if defined(__GNUC__) && !defined(__clang__) \
+		&& (__GNUC__ < 6 || defined(__INTEL_COMPILER))
+#	define TUKLIB_UNALIGNED_WITH_PACKED 1
+#endif
+
+
+static inline uint16_t
+unaligned_read16ne(const uint8_t *buf)
+{
+#ifdef TUKLIB_UNALIGNED_WITH_PACKED
+	struct __attribute__((__packed__)) s { uint16_t v; };
+	const struct s *p = (const struct s *)buf;
+	return p->v;
+#else
+	uint16_t num;
+	memcpy(&num, buf, sizeof(num));
+	return num;
+#endif
+}
+
+
+static inline uint32_t
+unaligned_read32ne(const uint8_t *buf)
+{
+#ifdef TUKLIB_UNALIGNED_WITH_PACKED
+	struct __attribute__((__packed__)) s { uint32_t v; };
+	const struct s *p = (const struct s *)buf;
+	return p->v;
+#else
+	uint32_t num;
+	memcpy(&num, buf, sizeof(num));
+	return num;
+#endif
+}
+
+
+static inline uint64_t
+unaligned_read64ne(const uint8_t *buf)
+{
+#ifdef TUKLIB_UNALIGNED_WITH_PACKED
+	struct __attribute__((__packed__)) s { uint64_t v; };
+	const struct s *p = (const struct s *)buf;
+	return p->v;
+#else
+	uint64_t num;
+	memcpy(&num, buf, sizeof(num));
+	return num;
+#endif
+}
+
+
+static inline void
+unaligned_write16ne(uint8_t *buf, uint16_t num)
+{
+#ifdef TUKLIB_UNALIGNED_WITH_PACKED
+	struct __attribute__((__packed__)) s { uint16_t v; };
+	struct s *p = (struct s *)buf;
+	p->v = num;
+#else
+	memcpy(buf, &num, sizeof(num));
+#endif
+	return;
+}
+
+
+static inline void
+unaligned_write32ne(uint8_t *buf, uint32_t num)
+{
+#ifdef TUKLIB_UNALIGNED_WITH_PACKED
+	struct __attribute__((__packed__)) s { uint32_t v; };
+	struct s *p = (struct s *)buf;
+	p->v = num;
+#else
+	memcpy(buf, &num, sizeof(num));
+#endif
+	return;
+}
+
+
+static inline void
+unaligned_write64ne(uint8_t *buf, uint64_t num)
+{
+#ifdef TUKLIB_UNALIGNED_WITH_PACKED
+	struct __attribute__((__packed__)) s { uint64_t v; };
+	struct s *p = (struct s *)buf;
+	p->v = num;
+#else
+	memcpy(buf, &num, sizeof(num));
+#endif
+	return;
+}
+
+
 // NOTE: TUKLIB_FAST_UNALIGNED_ACCESS indicates only support for 16-bit and
 // 32-bit unaligned integer loads and stores. It's possible that 64-bit
 // unaligned access doesn't work or is slower than byte-by-byte access.
 // Since unaligned 64-bit is probably not needed as often as 16-bit or
 // 32-bit, we simply don't support 64-bit unaligned access for now.
-#ifdef TUKLIB_FAST_UNALIGNED_ACCESS
-#	define unaligned_read16be read16be
-#	define unaligned_read16le read16le
-#	define unaligned_read32be read32be
-#	define unaligned_read32le read32le
-#	define unaligned_write16be write16be
-#	define unaligned_write16le write16le
-#	define unaligned_write32be write32be
-#	define unaligned_write32le write32le
-
-#else
 static inline uint16_t
 unaligned_read16be(const uint8_t *buf)
 {
+#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+	return conv16be(unaligned_read16ne(buf));
+#else
 	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
 	return num;
+#endif
 }
 
 
 static inline uint16_t
 unaligned_read16le(const uint8_t *buf)
 {
+#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+	return conv16le(unaligned_read16ne(buf));
+#else
 	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
 	return num;
+#endif
 }
 
 
 static inline uint32_t
 unaligned_read32be(const uint8_t *buf)
 {
+#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+	return conv32be(unaligned_read32ne(buf));
+#else
 	uint32_t num = (uint32_t)buf[0] << 24;
 	num |= (uint32_t)buf[1] << 16;
 	num |= (uint32_t)buf[2] << 8;
 	num |= (uint32_t)buf[3];
 	return num;
+#endif
 }
 
 
 static inline uint32_t
 unaligned_read32le(const uint8_t *buf)
 {
+#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+	return conv32le(unaligned_read32ne(buf));
+#else
 	uint32_t num = (uint32_t)buf[0];
 	num |= (uint32_t)buf[1] << 8;
 	num |= (uint32_t)buf[2] << 16;
 	num |= (uint32_t)buf[3] << 24;
 	return num;
+#endif
 }
 
 
+#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+// Like in the aligned case, these need to be macros.
+#	define unaligned_write16be(buf, num) \
+			unaligned_write16ne(buf, conv16be(num))
+#	define unaligned_write32be(buf, num) \
+			unaligned_write32ne(buf, conv32be(num))
+#endif
+
+#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
+#	define unaligned_write16le(buf, num) \
+			unaligned_write16ne(buf, conv16le(num))
+#	define unaligned_write32le(buf, num) \
+			unaligned_write32ne(buf, conv32le(num))
+#endif
+
+
+#ifndef unaligned_write16be
 static inline void
 unaligned_write16be(uint8_t *buf, uint16_t num)
 {
@@ -336,8 +487,10 @@ unaligned_write16be(uint8_t *buf, uint16_t num)
 	buf[1] = (uint8_t)num;
 	return;
 }
+#endif
 
 
+#ifndef unaligned_write16le
 static inline void
 unaligned_write16le(uint8_t *buf, uint16_t num)
 {
@@ -345,8 +498,10 @@ unaligned_write16le(uint8_t *buf, uint16_t num)
 	buf[1] = (uint8_t)(num >> 8);
 	return;
 }
+#endif
 
 
+#ifndef unaligned_write32be
 static inline void
 unaligned_write32be(uint8_t *buf, uint32_t num)
 {
@@ -356,8 +511,10 @@ unaligned_write32be(uint8_t *buf, uint32_t num)
 	buf[3] = (uint8_t)num;
 	return;
 }
+#endif
 
 
+#ifndef unaligned_write32le
 static inline void
 unaligned_write32le(uint8_t *buf, uint32_t num)
 {
@@ -367,7 +524,6 @@ unaligned_write32le(uint8_t *buf, uint32_t num)
 	buf[3] = (uint8_t)(num >> 24);
 	return;
 }
-
 #endif
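
Reviewer note, not part of the patch: a minimal standalone sketch of the two
access methods the new unaligned_*ne() functions choose between. All demo_*
names are hypothetical (not from tuklib); the assert checks that the
packed-struct and memcpy() methods read the same value from a deliberately
misaligned pointer. The packed variant relies on a GCC/ICC/Clang extension,
just as the TUKLIB_UNALIGNED_WITH_PACKED branch above does.

#include <assert.h>
#include <stdint.h>
#include <string.h>

// memcpy() method: standards-compliant; modern compilers inline it to a
// single load where the target allows unaligned access.
static inline uint32_t
demo_read32ne_memcpy(const uint8_t *buf)
{
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
}

// Packed-struct method: the attribute tells the compiler the struct (and
// thus the load) may be unaligned, so no aligned vector loads are emitted.
static inline uint32_t
demo_read32ne_packed(const uint8_t *buf)
{
	struct __attribute__((__packed__)) s { uint32_t v; };
	return ((const struct s *)buf)->v;
}

int
main(void)
{
	uint8_t buf[8] = { 0, 0x11, 0x22, 0x33, 0x44, 0, 0, 0 };
	const uint8_t *p = buf + 1; // deliberately misaligned

	// Both methods must agree bit for bit on every host.
	assert(demo_read32ne_memcpy(p) == demo_read32ne_packed(p));
	return 0;
}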
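
Likewise illustrative only: the shape of the fast path that the endian-aware
readers above take, i.e. one native-endian unaligned load plus a byte order
conversion, which compilers typically fold into a single load (plus one bswap
when the stored order differs from the host). demo_conv32be is a stand-in for
tuklib's conv32be and uses GCC/Clang predefined macros and builtins.

#include <assert.h>
#include <stdint.h>
#include <string.h>

static inline uint32_t
demo_read32ne(const uint8_t *buf)
{
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
}

// Stand-in for conv32be: identity on big endian hosts, byte swap otherwise.
static inline uint32_t
demo_conv32be(uint32_t num)
{
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	return num;
#else
	return __builtin_bswap32(num); // GCC/Clang builtin
#endif
}

// Fast-path shape of unaligned_read32be(): load native, then convert.
static inline uint32_t
demo_read32be(const uint8_t *buf)
{
	return demo_conv32be(demo_read32ne(buf));
}

int
main(void)
{
	uint8_t buf[8] = { 0, 0x12, 0x34, 0x56, 0x78, 0, 0, 0 };
	const uint8_t *p = buf + 1; // misaligned on purpose

	// Must match the byte-by-byte big-endian interpretation on any host.
	assert(demo_read32be(p) == UINT32_C(0x12345678));
	return 0;
}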