///////////////////////////////////////////////////////////////////////////////
//
/// \file arm64.c
/// \brief Filter for ARM64 binaries
///
/// This converts ARM64 relative addresses in the BL and ADRP immediates
/// to absolute values to increase redundancy of ARM64 code.
///
/// Unlike the older BCJ filters, this handles zeros specially. This way
/// the filter won't be counterproductive on Linux kernel modules, object
/// files, and static libraries where the immediates are all zeros (to be
/// filled later by a linker). Usually this has no downsides but with bad
/// luck it can reduce the effectiveness of the filter and trying a different
/// start offset can mitigate the problem.
///
/// Converting B or ADR instructions was also tested but it's not useful.
/// A majority of the jumps for the B instruction are very small (+/- 0xFF).
/// These are typical for loops and if-statements. Encoding them to their
/// absolute address reduces redundancy since many of the small relative
/// jump values are repeated, but very few of the absolute addresses are.
//
// Authors: Lasse Collin
//          Jia Tan
//
// This file has been put into the public domain.
// You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
#include "simple_private.h"
|
|
|
|
|
|
|
|
|
2022-11-14 23:14:41 +02:00
|
|
|
/// \brief      Convert one immediate between relative and absolute form
///
/// The encoder adds pc to src; the decoder subtracts it (implemented as
/// adding the two's complement of pc). All arithmetic is unsigned 32-bit,
/// so it wraps instead of overflowing.
///
/// If the converted value is zero in the bits selected by mask, pc (or
/// its negation when decoding) is returned instead. Callers skip
/// immediates that are zero, so a non-zero input must never turn into
/// a zero output; this substitution guarantees that and is undone by
/// the same rule when the opposite direction is applied.
///
/// \param      src         Immediate taken from the instruction; the
///                         callers in this file never pass zero
/// \param      pc          Position of the instruction, already scaled
///                         by the caller to the unit of the immediate
/// \param      mask        Bits of the result that are significant
/// \param      is_encoder  true to encode, false to decode
///
/// \return     Converted immediate. Bits outside mask may be set;
///             the caller is expected to mask the result.
static uint32_t
arm64_conv(uint32_t src, uint32_t pc, uint32_t mask, bool is_encoder)
{
	// Decoding is encoding with a negated position.
	if (!is_encoder)
		pc = 0U - pc;

	uint32_t dest = src + pc;

	// Never produce an all-zero immediate from a non-zero one.
	if ((dest & mask) == 0)
		dest = pc;

	return dest;
}
|
2022-09-19 19:34:56 +03:00
|
|
|
|
|
|
|
|
|
|
|
static size_t
|
2022-11-14 23:14:41 +02:00
|
|
|
arm64_code(void *simple lzma_attribute((__unused__)),
|
|
|
|
uint32_t now_pos, bool is_encoder,
|
2022-09-19 19:34:56 +03:00
|
|
|
uint8_t *buffer, size_t size)
|
|
|
|
{
|
|
|
|
size_t i;
|
2022-11-14 23:14:41 +02:00
|
|
|
|
|
|
|
// Clang 14.0.6 on x86-64 makes this four times bigger and 60 % slower
|
|
|
|
// with auto-vectorization that is enabled by default with -O2.
|
|
|
|
// Even -Os, which doesn't use vectorization, produces faster code.
|
|
|
|
// Disabling vectorization with -O2 gives good speed (faster than -Os)
|
|
|
|
// and reasonable code size.
|
|
|
|
//
|
|
|
|
// Such vectorization bloat happens with -O2 when targeting ARM64 too
|
|
|
|
// but performance hasn't been tested.
|
|
|
|
//
|
|
|
|
// Clang 14 and 15 won't auto-vectorize this loop if the condition
|
|
|
|
// for ADRP is replaced with the commented-out version. However,
|
|
|
|
// at least Clang 14.0.6 doesn't generate as fast code with that
|
|
|
|
// condition. The commented-out code is also bigger.
|
|
|
|
//
|
|
|
|
// GCC 12.2 on x86-64 with -O2 produces good code with both versions
|
|
|
|
// of the ADRP if-statement although the single-branch version is
|
|
|
|
// slightly faster and smaller than the commented-out version.
|
|
|
|
// Speed is similar to non-vectorized clang -O2.
|
|
|
|
#ifdef __clang__
|
|
|
|
# pragma clang loop vectorize(disable)
|
|
|
|
#endif
|
2022-09-19 19:34:56 +03:00
|
|
|
for (i = 0; i + 4 <= size; i += 4) {
|
2022-11-14 23:14:41 +02:00
|
|
|
const uint32_t pc = (uint32_t)(now_pos + i);
|
|
|
|
uint32_t instr = read32le(buffer + i);
|
|
|
|
|
|
|
|
if ((instr >> 26) == 0x25) {
|
|
|
|
// BL instruction:
|
|
|
|
// The full 26-bit immediate is converted.
|
|
|
|
// The range is +/-128 MiB.
|
|
|
|
//
|
|
|
|
// Using the full range is helps quite a lot with
|
|
|
|
// big executables. Smaller range would reduce false
|
|
|
|
// positives in non-code sections of the input though
|
|
|
|
// so this is a compromise that slightly favors big
|
|
|
|
// files. With the full range only six bits of the 32
|
|
|
|
// need to match to trigger a conversion.
|
|
|
|
const uint32_t mask26 = 0x03FFFFFF;
|
|
|
|
const uint32_t src = instr & mask26;
|
|
|
|
instr = 0x94000000;
|
|
|
|
|
|
|
|
if (src == 0)
|
2022-09-19 19:34:56 +03:00
|
|
|
continue;
|
|
|
|
|
2022-11-14 23:14:41 +02:00
|
|
|
instr |= arm64_conv(src, pc >> 2, mask26, is_encoder)
|
|
|
|
& mask26;
|
|
|
|
write32le(buffer + i, instr);
|
|
|
|
|
|
|
|
/*
|
|
|
|
// This is a more readable version of the one below but this
|
|
|
|
// has two branches. It results in bigger and slower code.
|
|
|
|
} else if ((instr & 0x9FF00000) == 0x90000000
|
|
|
|
|| (instr & 0x9FF00000) == 0x90F00000) {
|
|
|
|
*/
|
|
|
|
// This is only a rotation, addition, and testing that
|
|
|
|
// none of the bits covered by the bitmask are set.
|
|
|
|
} else if (((((instr << 8) | (instr >> 24))
|
|
|
|
+ (0x10000000 - 0x90)) & 0xE000009F) == 0) {
|
|
|
|
// ADRP instruction:
|
|
|
|
// Only values in the range +/-512 MiB are converted.
|
|
|
|
//
|
|
|
|
// Using less than the full +/-4 GiB range reduces
|
|
|
|
// false positives on non-code sections of the input
|
|
|
|
// while being excellent for executables up to 512 MiB.
|
|
|
|
// The positive effect of ADRP conversion is smaller
|
|
|
|
// than that of BL but it also doesn't hurt so much in
|
|
|
|
// non-code sections of input because, with +/-512 MiB
|
|
|
|
// range, nine bits of 32 need to match to trigger a
|
|
|
|
// conversion (two 10-bit match choices = 9 bits).
|
|
|
|
const uint32_t src = ((instr >> 29) & 3)
|
|
|
|
| ((instr >> 3) & 0x0003FFFC);
|
|
|
|
instr &= 0x9000001F;
|
|
|
|
|
2022-09-19 19:34:56 +03:00
|
|
|
if (src == 0)
|
|
|
|
continue;
|
|
|
|
|
2022-11-14 23:14:41 +02:00
|
|
|
const uint32_t dest = arm64_conv(
|
|
|
|
src, pc >> 12, 0x3FFFF, is_encoder);
|
|
|
|
|
|
|
|
instr |= (dest & 3) << 29;
|
|
|
|
instr |= (dest & 0x0003FFFC) << 3;
|
|
|
|
instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
|
|
|
|
write32le(buffer + i, instr);
|
2022-09-19 19:34:56 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static lzma_ret
|
|
|
|
arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
|
|
|
|
const lzma_filter_info *filters, bool is_encoder)
|
|
|
|
{
|
2022-11-14 23:14:41 +02:00
|
|
|
return lzma_simple_coder_init(next, allocator, filters,
|
2022-11-14 23:19:57 +02:00
|
|
|
&arm64_code, 0, 4, 4, is_encoder);
|
2022-09-19 19:34:56 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef HAVE_ENCODER_ARM64
/// \brief      Initialize the ARM64 filter encoder
///
/// Only compiled when HAVE_ENCODER_ARM64 is defined.
///
/// \param      next        Coder to initialize
/// \param      allocator   Passed through to arm64_coder_init()
/// \param      filters     Passed through to arm64_coder_init()
///
/// \return     Whatever arm64_coder_init() returns.
extern lzma_ret
lzma_simple_arm64_encoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	return arm64_coder_init(next, allocator, filters, true);
}
#endif
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef HAVE_DECODER_ARM64
/// \brief      Initialize the ARM64 filter decoder
///
/// Only compiled when HAVE_DECODER_ARM64 is defined.
///
/// \param      next        Coder to initialize
/// \param      allocator   Passed through to arm64_coder_init()
/// \param      filters     Passed through to arm64_coder_init()
///
/// \return     Whatever arm64_coder_init() returns.
extern lzma_ret
lzma_simple_arm64_decoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	return arm64_coder_init(next, allocator, filters, false);
}
#endif
|