mirror of https://git.tukaani.org/xz.git
Speed up CRC32 calculation on ARM64
The CRC32 instructions in ARM64 can calculate the CRC32 result for 8 bytes in a single operation, making the use of ARM64 instructions much faster compared to the general CRC32 algorithm. Optimized CRC32 will be enabled if ARM64 has CRC extension running on Linux. Signed-off-by: Chenxi Mao <chenxi.mao2013@gmail.com>
This commit is contained in:
parent
b43c3e48bf
commit
849d0f282a
|
@ -230,6 +230,7 @@ add_library(liblzma
|
||||||
src/liblzma/check/check.h
|
src/liblzma/check/check.h
|
||||||
src/liblzma/check/crc_common.h
|
src/liblzma/check/crc_common.h
|
||||||
src/liblzma/check/crc_x86_clmul.h
|
src/liblzma/check/crc_x86_clmul.h
|
||||||
|
src/liblzma/check/crc32_aarch64.h
|
||||||
src/liblzma/common/block_util.c
|
src/liblzma/common/block_util.c
|
||||||
src/liblzma/common/common.c
|
src/liblzma/common/common.c
|
||||||
src/liblzma/common/common.h
|
src/liblzma/common/common.h
|
||||||
|
|
|
@ -15,7 +15,8 @@ liblzma_la_SOURCES += \
|
||||||
check/check.c \
|
check/check.c \
|
||||||
check/check.h \
|
check/check.h \
|
||||||
check/crc_common.h \
|
check/crc_common.h \
|
||||||
check/crc_x86_clmul.h
|
check/crc_x86_clmul.h \
|
||||||
|
check/crc32_aarch64.h
|
||||||
|
|
||||||
if COND_SMALL
|
if COND_SMALL
|
||||||
liblzma_la_SOURCES += check/crc32_small.c
|
liblzma_la_SOURCES += check/crc32_small.c
|
||||||
|
|
|
@ -0,0 +1,109 @@
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
//
|
||||||
|
/// \file crc32_aarch64.c
|
||||||
|
/// \brief CRC32 calculation with aarch64 optimization
|
||||||
|
//
|
||||||
|
// Authors: Chenxi Mao
|
||||||
|
//
|
||||||
|
// This file has been put into the public domain.
|
||||||
|
// You can do whatever you want with this file.
|
||||||
|
//
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
#ifdef LZMA_CRC_CRC32_AARCH64_H
|
||||||
|
# error crc_arm64_clmul.h was included twice.
|
||||||
|
#endif
|
||||||
|
#define LZMA_CRC_CRC32_AARCH64_H
|
||||||
|
#include <sys/auxv.h>
|
||||||
|
// EDG-based compilers (Intel's classic compiler and compiler for E2K) can
|
||||||
|
// define __GNUC__ but the attribute must not be used with them.
|
||||||
|
// The new Clang-based ICX needs the attribute.
|
||||||
|
//
|
||||||
|
// NOTE: Build systems check for this too, keep them in sync with this.
|
||||||
|
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
|
||||||
|
# define crc_attr_target \
|
||||||
|
__attribute__((__target__("+crc")))
|
||||||
|
#else
|
||||||
|
# define crc_attr_target
|
||||||
|
#endif
|
||||||
|
#ifdef BUILDING_CRC32_AARCH64
|
||||||
|
crc_attr_target
|
||||||
|
crc_attr_no_sanitize_address
|
||||||
|
static uint32_t
|
||||||
|
crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
|
||||||
|
{
|
||||||
|
crc = ~crc;
|
||||||
|
while ((uintptr_t)(buf) & 7) {
|
||||||
|
crc = __builtin_aarch64_crc32b(crc, *buf);
|
||||||
|
buf++;
|
||||||
|
size--;
|
||||||
|
}
|
||||||
|
for (;size>=8;size-=8,buf+=8) {
|
||||||
|
crc = __builtin_aarch64_crc32x(crc, aligned_read64le(buf));
|
||||||
|
}
|
||||||
|
for (;size>0;size--,buf++)
|
||||||
|
crc = __builtin_aarch64_crc32b(crc, *buf);
|
||||||
|
return ~crc;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#ifdef BUILDING_CRC64_AARCH64
|
||||||
|
//FIXME: there is no crc64_arch_optimized implementation,
|
||||||
|
// to make compiler happy, add crc64_generic here.
|
||||||
|
#ifdef WORDS_BIGENDIAN
|
||||||
|
# define A1(x) ((x) >> 56)
|
||||||
|
#else
|
||||||
|
# define A1 A
|
||||||
|
#endif
|
||||||
|
crc_attr_target
|
||||||
|
crc_attr_no_sanitize_address
|
||||||
|
static uint64_t
|
||||||
|
crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc)
|
||||||
|
{
|
||||||
|
crc = ~crc;
|
||||||
|
|
||||||
|
#ifdef WORDS_BIGENDIAN
|
||||||
|
crc = bswap64(crc);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (size > 4) {
|
||||||
|
while ((uintptr_t)(buf) & 3) {
|
||||||
|
crc = lzma_crc64_table[0][*buf++ ^ A1(crc)] ^ S8(crc);
|
||||||
|
--size;
|
||||||
|
}
|
||||||
|
|
||||||
|
const uint8_t *const limit = buf + (size & ~(size_t)(3));
|
||||||
|
size &= (size_t)(3);
|
||||||
|
|
||||||
|
while (buf < limit) {
|
||||||
|
#ifdef WORDS_BIGENDIAN
|
||||||
|
const uint32_t tmp = (uint32_t)(crc >> 32)
|
||||||
|
^ aligned_read32ne(buf);
|
||||||
|
#else
|
||||||
|
const uint32_t tmp = (uint32_t)crc
|
||||||
|
^ aligned_read32ne(buf);
|
||||||
|
#endif
|
||||||
|
buf += 4;
|
||||||
|
|
||||||
|
crc = lzma_crc64_table[3][A(tmp)]
|
||||||
|
^ lzma_crc64_table[2][B(tmp)]
|
||||||
|
^ S32(crc)
|
||||||
|
^ lzma_crc64_table[1][C(tmp)]
|
||||||
|
^ lzma_crc64_table[0][D(tmp)];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
while (size-- != 0)
|
||||||
|
crc = lzma_crc64_table[0][*buf++ ^ A1(crc)] ^ S8(crc);
|
||||||
|
|
||||||
|
#ifdef WORDS_BIGENDIAN
|
||||||
|
crc = bswap64(crc);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return ~crc;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
static inline bool
|
||||||
|
is_arch_extension_supported(void)
|
||||||
|
{
|
||||||
|
return (getauxval(AT_HWCAP) & HWCAP_CRC32)!=0;
|
||||||
|
}
|
||||||
|
|
|
@ -15,9 +15,12 @@
|
||||||
#include "check.h"
|
#include "check.h"
|
||||||
#include "crc_common.h"
|
#include "crc_common.h"
|
||||||
|
|
||||||
#ifdef CRC_X86_CLMUL
|
#if defined(CRC_X86_CLMUL)
|
||||||
# define BUILDING_CRC32_CLMUL
|
# define BUILDING_CRC32_CLMUL
|
||||||
# include "crc_x86_clmul.h"
|
# include "crc_x86_clmul.h"
|
||||||
|
#elif defined(CRC32_ARM64)
|
||||||
|
# define BUILDING_CRC32_AARCH64
|
||||||
|
# include "crc32_aarch64.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -14,9 +14,12 @@
|
||||||
#include "check.h"
|
#include "check.h"
|
||||||
#include "crc_common.h"
|
#include "crc_common.h"
|
||||||
|
|
||||||
#ifdef CRC_X86_CLMUL
|
#if defined(CRC_X86_CLMUL)
|
||||||
# define BUILDING_CRC64_CLMUL
|
# define BUILDING_CRC64_CLMUL
|
||||||
# include "crc_x86_clmul.h"
|
# include "crc_x86_clmul.h"
|
||||||
|
#elif defined(CRC32_ARM64)
|
||||||
|
# define BUILDING_CRC64_AARCH64
|
||||||
|
# include "crc32_aarch64.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -52,29 +52,33 @@
|
||||||
#undef CRC_GENERIC
|
#undef CRC_GENERIC
|
||||||
#undef CRC_ARCH_OPTIMIZED
|
#undef CRC_ARCH_OPTIMIZED
|
||||||
#undef CRC_X86_CLMUL
|
#undef CRC_X86_CLMUL
|
||||||
|
#undef CRC32_ARM64
|
||||||
#undef CRC_USE_IFUNC
|
#undef CRC_USE_IFUNC
|
||||||
#undef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
#undef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
||||||
|
|
||||||
// If CLMUL cannot be used then only the generic slice-by-eight (CRC32)
|
|
||||||
// or slice-by-four (CRC64) is built.
|
|
||||||
#if !defined(HAVE_USABLE_CLMUL)
|
|
||||||
# define CRC_GENERIC 1
|
|
||||||
|
|
||||||
// If CLMUL is allowed unconditionally in the compiler options then the
|
// If CLMUL is allowed unconditionally in the compiler options then the
|
||||||
// generic version can be omitted. Note that this doesn't work with MSVC
|
// generic version can be omitted. Note that this doesn't work with MSVC
|
||||||
// as I don't know how to detect the features here.
|
// as I don't know how to detect the features here.
|
||||||
//
|
//
|
||||||
// NOTE: Keep this this in sync with crc32_table.c.
|
// NOTE: Keep this this in sync with crc32_table.c.
|
||||||
#elif (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \
|
#if (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \
|
||||||
|| (defined(__e2k__) && __iset__ >= 6)
|
|| (defined(__e2k__) && __iset__ >= 6)
|
||||||
# define CRC_ARCH_OPTIMIZED 1
|
# define CRC_ARCH_OPTIMIZED 1
|
||||||
# define CRC_X86_CLMUL 1
|
# define CRC_X86_CLMUL 1
|
||||||
|
|
||||||
|
#elif (defined(__aarch64__))
|
||||||
|
# define CRC_ARCH_OPTIMIZED 1
|
||||||
|
# define CRC32_ARM64 1
|
||||||
|
// If CLMUL cannot be used then only the generic slice-by-eight (CRC32)
|
||||||
|
// or slice-by-four (CRC64) is built.
|
||||||
|
#elif !defined(HAVE_USABLE_CLMUL)
|
||||||
|
# define CRC_GENERIC 1
|
||||||
// Otherwise build both and detect at runtime which version to use.
|
// Otherwise build both and detect at runtime which version to use.
|
||||||
#else
|
#else
|
||||||
# define CRC_GENERIC 1
|
# define CRC_GENERIC 1
|
||||||
# define CRC_ARCH_OPTIMIZED 1
|
# define CRC_ARCH_OPTIMIZED 1
|
||||||
# define CRC_X86_CLMUL 1
|
# define CRC_X86_CLMUL 1
|
||||||
|
# define CRC32_ARM64 1
|
||||||
|
|
||||||
# ifdef HAVE_FUNC_ATTRIBUTE_IFUNC
|
# ifdef HAVE_FUNC_ATTRIBUTE_IFUNC
|
||||||
# define CRC_USE_IFUNC 1
|
# define CRC_USE_IFUNC 1
|
||||||
|
|
Loading…
Reference in New Issue