xz/src/liblzma/simple/arm64.c

///////////////////////////////////////////////////////////////////////////////
//
/// \file       arm64.c
/// \brief      Filter for ARM64 binaries
///
/// This converts ARM64 relative addresses in the BL and ADRP immediates
/// to absolute values to increase redundancy of ARM64 code.
///
/// Unlike the older BCJ filters, this handles zeros specially. This way
/// the filter won't be counterproductive on Linux kernel modules, object
/// files, and static libraries where the immediates are all zeros (to be
/// filled later by a linker). Usually this has no downsides but with bad
/// luck it can reduce the effectiveness of the filter and trying a different
/// start offset can mitigate the problem.
///
/// Converting B or ADR instructions was also tested but it's not useful.
/// A majority of the jumps for the B instruction are very small (+/- 0xFF).
/// These are typical for loops and if-statements. Encoding them to their
/// absolute address reduces redundancy since many of the small relative
/// jump values are repeated, but very few of the absolute addresses are.
//
//  Authors:    Lasse Collin
//              Jia Tan
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#include "simple_private.h"


/// \brief      Convert one immediate between relative and absolute form
///
/// \param      src         Immediate extracted from the instruction
/// \param      pc          Position of the instruction, already scaled to
///                         the same unit as \a src
/// \param      mask        Bitmask covering the bits of the immediate field
/// \param      is_encoder  true adds \a pc, false subtracts it
///
/// \return     src +/- pc, except that if the masked result would be zero,
///             the (possibly negated) pc is returned instead. The caller
///             is expected to apply \a mask to the returned value.
static uint32_t
arm64_conv(uint32_t src, uint32_t pc, uint32_t mask, bool is_encoder)
{
	// Decoding is the same operation as encoding but with the position
	// negated (modulo 2^32), so a single addition serves both directions.
	const uint32_t offset = is_encoder ? pc : 0U - pc;
	const uint32_t sum = src + offset;

	// A masked result of zero is never produced; the offset itself
	// is returned in that case.
	return (sum & mask) != 0 ? sum : offset;
}


/// \brief      Convert BL and ADRP immediates in a buffer of ARM64 code
///
/// The buffer is processed in place, one 32-bit little-endian word at
/// a time. Words recognized as BL or ADRP get their immediate converted
/// with arm64_conv(); every other word, and any BL/ADRP whose immediate
/// is zero, is left untouched.
///
/// \param      simple      Unused
/// \param      now_pos     Position of buffer[0]; now_pos + i (truncated
///                         to 32 bits) is used as the program counter of
///                         the word at offset i
/// \param      is_encoder  Direction flag passed on to arm64_conv()
/// \param      buffer      Data to convert in place
/// \param      size        Number of bytes available in buffer
///
/// \return     Number of bytes processed. This is the largest multiple
///             of four that is not greater than size; the remaining
///             zero to three bytes are not touched.
static size_t
arm64_code(void *simple lzma_attribute((__unused__)),
		uint32_t now_pos, bool is_encoder,
		uint8_t *buffer, size_t size)
{
	// Declared outside the loop because the final value is returned.
	size_t i;

	// Clang 14.0.6 on x86-64 makes this four times bigger and 60 % slower
	// with auto-vectorization that is enabled by default with -O2.
	// Even -Os, which doesn't use vectorization, produces faster code.
	// Disabling vectorization with -O2 gives good speed (faster than -Os)
	// and reasonable code size.
	//
	// Such vectorization bloat happens with -O2 when targeting ARM64 too
	// but performance hasn't been tested.
	//
	// Clang 14 and 15 won't auto-vectorize this loop if the condition
	// for ADRP is replaced with the commented-out version. However,
	// at least Clang 14.0.6 doesn't generate as fast code with that
	// condition. The commented-out code is also bigger.
	//
	// GCC 12.2 on x86-64 with -O2 produces good code with both versions
	// of the ADRP if-statement although the single-branch version is
	// slightly faster and smaller than the commented-out version.
	// Speed is similar to non-vectorized clang -O2.
#ifdef __clang__
#	pragma clang loop vectorize(disable)
#endif
	for (i = 0; i + 4 <= size; i += 4) {
		const uint32_t pc = (uint32_t)(now_pos + i);
		uint32_t instr = read32le(buffer + i);

		// The top six bits select BL (0x25 << 26 == 0x94000000).
		if ((instr >> 26) == 0x25) {
			// BL instruction:
			// The full 26-bit immediate is converted.
			// The range is +/-128 MiB.
			//
			// Using the full range helps quite a lot with
			// big executables. Smaller range would reduce false
			// positives in non-code sections of the input though
			// so this is a compromise that slightly favors big
			// files. With the full range only six bits of the 32
			// need to match to trigger a conversion.
			const uint32_t mask26 = 0x03FFFFFF;
			const uint32_t src = instr & mask26;

			// Keep only the opcode bits; the converted immediate
			// is OR'ed in below.
			instr = 0x94000000;

			// Zero immediates are left as is (nothing is written).
			if (src == 0)
				continue;

			// The immediate is converted against pc >> 2, that is,
			// the position counted in 4-byte units.
			instr |= arm64_conv(src, pc >> 2, mask26, is_encoder)
					& mask26;
			write32le(buffer + i, instr);

/*
		// This is a more readable version of the one below but this
		// has two branches. It results in bigger and slower code.
		} else if ((instr & 0x9FF00000) == 0x90000000
				|| (instr & 0x9FF00000) == 0x90F00000) {
*/
		// This is only a rotation, addition, and testing that
		// none of the bits covered by the bitmask are set.
		} else if (((((instr << 8) | (instr >> 24))
				+ (0x10000000 - 0x90)) & 0xE000009F) == 0) {
			// ADRP instruction:
			// Only values in the range +/-512 MiB are converted.
			//
			// Using less than the full +/-4 GiB range reduces
			// false positives on non-code sections of the input
			// while being excellent for executables up to 512 MiB.
			// The positive effect of ADRP conversion is smaller
			// than that of BL but it also doesn't hurt so much in
			// non-code sections of input because, with +/-512 MiB
			// range, nine bits of 32 need to match to trigger a
			// conversion (two 10-bit match choices = 9 bits).
			//
			// Gather an 18-bit value: instruction bits 29-30
			// become bits 0-1 and instruction bits 5-20 become
			// bits 2-17.
			const uint32_t src = ((instr >> 29) & 3)
					| ((instr >> 3) & 0x0003FFFC);

			// Keep bits 31 and 28 and the low five bits; all the
			// immediate bits are cleared and rebuilt below.
			instr &= 0x9000001F;

			// Zero immediates are left as is (nothing is written).
			if (src == 0)
				continue;

			// The immediate is converted against pc >> 12, that is,
			// the position counted in 4096-byte units.
			const uint32_t dest = arm64_conv(
					src, pc >> 12, 0x3FFFF, is_encoder);

			// Scatter the 18-bit result back to the positions
			// it was gathered from.
			instr |= (dest & 3) << 29;
			instr |= (dest & 0x0003FFFC) << 3;

			// Replicate bit 17 of dest into instruction bits 21-23
			// so that the result is one of the two bit patterns
			// accepted by the ADRP condition above.
			instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
			write32le(buffer + i, instr);
		}
	}

	return i;
}


/// \brief      Shared initialization for the ARM64 encoder and decoder
///
/// Hands arm64_code() and the direction flag to lzma_simple_coder_init()
/// and returns whatever that function returns.
///
/// NOTE(review): the literal arguments 0, 4, 4 are passed through as is;
/// their meaning is defined by lzma_simple_coder_init() and should be
/// confirmed against its declaration.
static lzma_ret
arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
		const lzma_filter_info *filters, bool is_encoder)
{
	const lzma_ret ret = lzma_simple_coder_init(
			next, allocator, filters,
			arm64_code, 0, 4, 4, is_encoder);

	return ret;
}


#ifdef HAVE_ENCODER_ARM64
/// \brief      Initialize \a next as an ARM64 filter in the encoding direction
///
/// Only compiled when HAVE_ENCODER_ARM64 is defined. Returns the result
/// of arm64_coder_init() unchanged.
extern lzma_ret
lzma_simple_arm64_encoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	const bool is_encoder = true;
	return arm64_coder_init(next, allocator, filters, is_encoder);
}
#endif


#ifdef HAVE_DECODER_ARM64
/// \brief      Initialize \a next as an ARM64 filter in the decoding direction
///
/// Only compiled when HAVE_DECODER_ARM64 is defined. Returns the result
/// of arm64_coder_init() unchanged.
extern lzma_ret
lzma_simple_arm64_decoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	const bool is_encoder = false;
	return arm64_coder_init(next, allocator, filters, is_encoder);
}
#endif
liblzma: Add experimental ARM64 BCJ filter with a temporary Filter ID. That is, the Filter ID will be changed once the design is final. The current version will be removed. So files created with the tempoary Filter ID won't be supported in the future. 2022-09-19 19:34:56 +03:00			`///////////////////////////////////////////////////////////////////////////////`
			`//`
			`/// \file arm64.c`
			`/// \brief Filter for ARM64 binaries`
			`///`
Replace the experimental ARM64 filter with a new experimental version. This is incompatible with the previous version. This has space/tab fixes in filter_*.c and bcj.h too. 2022-11-14 23:14:41 +02:00			`/// This converts ARM64 relative addresses in the BL and ADRP immediates`
			`/// to absolute values to increase redundancy of ARM64 code.`
			`///`
			`/// Unlike the older BCJ filters, this handles zeros specially. This way`
			`/// the filter won't be counterproductive on Linux kernel modules, object`
			`/// files, and static libraries where the immediates are all zeros (to be`
			`/// filled later by a linker). Usually this has no downsides but with bad`
			`/// luck it can reduce the effectiveness of the filter and trying a different`
			`/// start offset can mitigate the problem.`
			`///`
			`/// Converting B or ADR instructions was also tested but it's not useful.`
			`/// A majority of the jumps for the B instruction are very small (+/- 0xFF).`
			`/// These are typical for loops and if-statements. Encoding them to their`
			`/// absolute address reduces redundancy since many of the small relative`
			`/// jump values are repeated, but very few of the absolute addresses are.`
			`//`
liblzma: Add experimental ARM64 BCJ filter with a temporary Filter ID. That is, the Filter ID will be changed once the design is final. The current version will be removed. So files created with the tempoary Filter ID won't be supported in the future. 2022-09-19 19:34:56 +03:00			`// Authors: Lasse Collin`
			`// Jia Tan`
			`//`
			`// This file has been put into the public domain.`
			`// You can do whatever you want with this file.`
			`//`
			`///////////////////////////////////////////////////////////////////////////////`

			`#include "simple_private.h"`


Replace the experimental ARM64 filter with a new experimental version. This is incompatible with the previous version. This has space/tab fixes in filter_*.c and bcj.h too. 2022-11-14 23:14:41 +02:00			`static uint32_t`
			`arm64_conv(uint32_t src, uint32_t pc, uint32_t mask, bool is_encoder)`
			`{`
			`if (!is_encoder)`
			`pc = 0U - pc;`
liblzma: Add experimental ARM64 BCJ filter with a temporary Filter ID. That is, the Filter ID will be changed once the design is final. The current version will be removed. So files created with the tempoary Filter ID won't be supported in the future. 2022-09-19 19:34:56 +03:00
Replace the experimental ARM64 filter with a new experimental version. This is incompatible with the previous version. This has space/tab fixes in filter_*.c and bcj.h too. 2022-11-14 23:14:41 +02:00			`uint32_t dest = src + pc;`
			`if ((dest & mask) == 0)`
			`dest = pc;`
liblzma: Add experimental ARM64 BCJ filter with a temporary Filter ID. That is, the Filter ID will be changed once the design is final. The current version will be removed. So files created with the tempoary Filter ID won't be supported in the future. 2022-09-19 19:34:56 +03:00
Replace the experimental ARM64 filter with a new experimental version. This is incompatible with the previous version. This has space/tab fixes in filter_*.c and bcj.h too. 2022-11-14 23:14:41 +02:00			`return dest;`
			`}`
liblzma: Add experimental ARM64 BCJ filter with a temporary Filter ID. That is, the Filter ID will be changed once the design is final. The current version will be removed. So files created with the tempoary Filter ID won't be supported in the future. 2022-09-19 19:34:56 +03:00

			`static size_t`
Replace the experimental ARM64 filter with a new experimental version. This is incompatible with the previous version. This has space/tab fixes in filter_*.c and bcj.h too. 2022-11-14 23:14:41 +02:00			`arm64_code(void *simple lzma_attribute((__unused__)),`
			`uint32_t now_pos, bool is_encoder,`
liblzma: Add experimental ARM64 BCJ filter with a temporary Filter ID. That is, the Filter ID will be changed once the design is final. The current version will be removed. So files created with the tempoary Filter ID won't be supported in the future. 2022-09-19 19:34:56 +03:00			`uint8_t *buffer, size_t size)`
			`{`
			`size_t i;`
Replace the experimental ARM64 filter with a new experimental version. This is incompatible with the previous version. This has space/tab fixes in filter_*.c and bcj.h too. 2022-11-14 23:14:41 +02:00
			`// Clang 14.0.6 on x86-64 makes this four times bigger and 60 % slower`
			`// with auto-vectorization that is enabled by default with -O2.`
			`// Even -Os, which doesn't use vectorization, produces faster code.`
			`// Disabling vectorization with -O2 gives good speed (faster than -Os)`
			`// and reasonable code size.`
			`//`
			`// Such vectorization bloat happens with -O2 when targeting ARM64 too`
			`// but performance hasn't been tested.`
			`//`
			`// Clang 14 and 15 won't auto-vectorize this loop if the condition`
			`// for ADRP is replaced with the commented-out version. However,`
			`// at least Clang 14.0.6 doesn't generate as fast code with that`
			`// condition. The commented-out code is also bigger.`
			`//`
			`// GCC 12.2 on x86-64 with -O2 produces good code with both versions`
			`// of the ADRP if-statement although the single-branch version is`
			`// slightly faster and smaller than the commented-out version.`
			`// Speed is similar to non-vectorized clang -O2.`
			`#ifdef __clang__`
			`# pragma clang loop vectorize(disable)`
			`#endif`
liblzma: Add experimental ARM64 BCJ filter with a temporary Filter ID. That is, the Filter ID will be changed once the design is final. The current version will be removed. So files created with the tempoary Filter ID won't be supported in the future. 2022-09-19 19:34:56 +03:00			`for (i = 0; i + 4 <= size; i += 4) {`
Replace the experimental ARM64 filter with a new experimental version. This is incompatible with the previous version. This has space/tab fixes in filter_*.c and bcj.h too. 2022-11-14 23:14:41 +02:00			`const uint32_t pc = (uint32_t)(now_pos + i);`
			`uint32_t instr = read32le(buffer + i);`

			`if ((instr >> 26) == 0x25) {`
			`// BL instruction:`
			`// The full 26-bit immediate is converted.`
			`// The range is +/-128 MiB.`
			`//`
			`// Using the full range helps quite a lot with`
			`// big executables. Smaller range would reduce false`
			`// positives in non-code sections of the input though`
			`// so this is a compromise that slightly favors big`
			`// files. With the full range only six bits of the 32`
			`// need to match to trigger a conversion.`
			`const uint32_t mask26 = 0x03FFFFFF;`
			`const uint32_t src = instr & mask26;`
			`instr = 0x94000000;`

			`if (src == 0)`
liblzma: Add experimental ARM64 BCJ filter with a temporary Filter ID. That is, the Filter ID will be changed once the design is final. The current version will be removed. So files created with the tempoary Filter ID won't be supported in the future. 2022-09-19 19:34:56 +03:00			`continue;`

Replace the experimental ARM64 filter with a new experimental version. This is incompatible with the previous version. This has space/tab fixes in filter_*.c and bcj.h too. 2022-11-14 23:14:41 +02:00			`instr \|= arm64_conv(src, pc >> 2, mask26, is_encoder)`
			`& mask26;`
			`write32le(buffer + i, instr);`

			`/*`
			`// This is a more readable version of the one below but this`
			`// has two branches. It results in bigger and slower code.`
			`} else if ((instr & 0x9FF00000) == 0x90000000`
			`\|\| (instr & 0x9FF00000) == 0x90F00000) {`
			`*/`
			`// This is only a rotation, addition, and testing that`
			`// none of the bits covered by the bitmask are set.`
			`} else if (((((instr << 8) \| (instr >> 24))`
			`+ (0x10000000 - 0x90)) & 0xE000009F) == 0) {`
			`// ADRP instruction:`
			`// Only values in the range +/-512 MiB are converted.`
			`//`
			`// Using less than the full +/-4 GiB range reduces`
			`// false positives on non-code sections of the input`
			`// while being excellent for executables up to 512 MiB.`
			`// The positive effect of ADRP conversion is smaller`
			`// than that of BL but it also doesn't hurt so much in`
			`// non-code sections of input because, with +/-512 MiB`
			`// range, nine bits of 32 need to match to trigger a`
			`// conversion (two 10-bit match choices = 9 bits).`
			`const uint32_t src = ((instr >> 29) & 3)`
			`\| ((instr >> 3) & 0x0003FFFC);`
			`instr &= 0x9000001F;`

liblzma: Add experimental ARM64 BCJ filter with a temporary Filter ID. That is, the Filter ID will be changed once the design is final. The current version will be removed. So files created with the tempoary Filter ID won't be supported in the future. 2022-09-19 19:34:56 +03:00			`if (src == 0)`
			`continue;`

Replace the experimental ARM64 filter with a new experimental version. This is incompatible with the previous version. This has space/tab fixes in filter_*.c and bcj.h too. 2022-11-14 23:14:41 +02:00			`const uint32_t dest = arm64_conv(`
			`src, pc >> 12, 0x3FFFF, is_encoder);`

			`instr \|= (dest & 3) << 29;`
			`instr \|= (dest & 0x0003FFFC) << 3;`
			`instr \|= (0U - (dest & 0x00020000)) & 0x00E00000;`
			`write32le(buffer + i, instr);`
liblzma: Add experimental ARM64 BCJ filter with a temporary Filter ID. That is, the Filter ID will be changed once the design is final. The current version will be removed. So files created with the tempoary Filter ID won't be supported in the future. 2022-09-19 19:34:56 +03:00			`}`
			`}`

			`return i;`
			`}`


			`static lzma_ret`
			`arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,`
			`const lzma_filter_info *filters, bool is_encoder)`
			`{`
Replace the experimental ARM64 filter with a new experimental version. This is incompatible with the previous version. This has space/tab fixes in filter_*.c and bcj.h too. 2022-11-14 23:14:41 +02:00			`return lzma_simple_coder_init(next, allocator, filters,`
Revert "liblzma: Simple/BCJ filters: Allow disabling generic BCJ options." This reverts commit 177bdc922cb17bd0fd831ab8139dfae912a5c2b8 and also does equivalent change to arm64.c. Now that ARM64 filter will use lzma_options_bcj, this change is not needed anymore. 2022-11-14 23:19:57 +02:00			`&arm64_code, 0, 4, 4, is_encoder);`
liblzma: Add experimental ARM64 BCJ filter with a temporary Filter ID. That is, the Filter ID will be changed once the design is final. The current version will be removed. So files created with the tempoary Filter ID won't be supported in the future. 2022-09-19 19:34:56 +03:00			`}`


			`#ifdef HAVE_ENCODER_ARM64`
			`extern lzma_ret`
			`lzma_simple_arm64_encoder_init(lzma_next_coder *next,`
			`const lzma_allocator *allocator,`
			`const lzma_filter_info *filters)`
			`{`
			`return arm64_coder_init(next, allocator, filters, true);`
			`}`
			`#endif`


			`#ifdef HAVE_DECODER_ARM64`
			`extern lzma_ret`
			`lzma_simple_arm64_decoder_init(lzma_next_coder *next,`
			`const lzma_allocator *allocator,`
			`const lzma_filter_info *filters)`
			`{`
			`return arm64_coder_init(next, allocator, filters, false);`
			`}`
			`#endif`