xz/src/liblzma/check/crc32_arm64.h

118 lines
3.6 KiB
C

///////////////////////////////////////////////////////////////////////////////
//
/// \file crc32_arm64.h
/// \brief CRC32 calculation with ARM64 optimization
//
// Authors: Chenxi Mao
// Jia Tan
// Hans Jansen
//
///////////////////////////////////////////////////////////////////////////////
#ifndef LZMA_CRC32_ARM64_H
#define LZMA_CRC32_ARM64_H
// MSVC always has the CRC intrinsics available when building for ARM64
// there is no need to include any header files.
#ifndef _MSC_VER
# include <arm_acle.h>
#endif
#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
# if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
# include <sys/auxv.h>
# elif defined(_WIN32)
# include <processthreadsapi.h>
# elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME)
# include <sys/sysctl.h>
# endif
#endif
// Some EDG-based compilers support ARM64 and define __GNUC__
// (such as Nvidia's nvcc), but do not support function attributes.
//
// NOTE: Build systems check for this too, keep them in sync with this.
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
# define crc_attr_target \
__attribute__((__target__("+crc")))
#else
# define crc_attr_target
#endif
crc_attr_target
static uint32_t
crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
{
crc = ~crc;
// Align the input buffer because this was shown to be
// significantly faster than unaligned accesses.
const size_t align_amount = my_min(size, (8 - (uintptr_t)buf) & 7);
for (const uint8_t *limit = buf + align_amount; buf < limit; ++buf)
crc = __crc32b(crc, *buf);
size -= align_amount;
// Process 8 bytes at a time. The end point is determined by
// ignoring the least significant three bits of size to ensure
// we do not process past the bounds of the buffer. This guarantees
// that limit is a multiple of 8 and is strictly less than size.
for (const uint8_t *limit = buf + (size & ~((size_t)7));
buf < limit; buf += 8)
crc = __crc32d(crc, aligned_read64le(buf));
// Process the remaining bytes that are not 8 byte aligned.
for (const uint8_t *limit = buf + (size & 7); buf < limit; ++buf)
crc = __crc32b(crc, *buf);
return ~crc;
}
#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
static inline bool
is_arch_extension_supported(void)
{
#if defined(HAVE_GETAUXVAL)
return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
#elif defined(HAVE_ELF_AUX_INFO)
unsigned long feature_flags;
elf_aux_info(AT_HWCAP, &feature_flags, sizeof(feature_flags));
return feature_flags & HWCAP_CRC32 != 0;
#elif defined(_WIN32)
return IsProcessorFeaturePresent(
PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
#elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME)
int has_crc32 = 0;
size_t size = sizeof(has_crc32);
// The sysctlbyname() function requires a string identifier for the
// CPU feature it tests. The Apple documentation lists the string
// "hw.optional.armv8_crc32", which can be found here:
// (https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics#3915619)
int err = sysctlbyname("hw.optional.armv8_crc32", &has_crc32,
&size, NULL, 0);
return !err && has_crc32;
#else
// If a runtime detection method cannot be found, then this must
// be a compile time error. The checks in crc_common.h should ensure
// a runtime detection method is always found if this function is
// built. It would be possible to just return false here, but this
// is inefficient for binary size and runtime since only the generic
// method could ever be used.
# error Runtime detection method unavailable.
#endif
}
#endif
#endif // LZMA_CRC32_ARM64_H