mirror of https://git.tukaani.org/xz.git
liblzma: Speed up CRC32 calculation on 64-bit LoongArch
The crc.w.{b/h/w/d}.w instructions in LoongArch can calculate the CRC32 result for 1/2/4/8 bytes in a single operation. Using these is much faster compared to the generic method. Optimized CRC32 is enabled unconditionally on 64-bit LoongArch because the LoongArch specification says that CRC32 instructions shall be implemented for 64-bit processors. Optimized CRC32 isn't enabled for 32-bit LoongArch processors because not enough information is available about them. Co-authored-by: Lasse Collin <lasse.collin@tukaani.org> Closes: https://github.com/tukaani-project/xz/pull/86
This commit is contained in:
parent
0ed8936685
commit
7baf6835cf
|
@ -548,6 +548,7 @@ add_library(liblzma
|
|||
src/liblzma/check/crc_common.h
|
||||
src/liblzma/check/crc_x86_clmul.h
|
||||
src/liblzma/check/crc32_arm64.h
|
||||
src/liblzma/check/crc32_loongarch.h
|
||||
src/liblzma/common/block_util.c
|
||||
src/liblzma/common/common.c
|
||||
src/liblzma/common/common.h
|
||||
|
@ -1341,6 +1342,30 @@ if(XZ_ARM64_CRC32)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
option(XZ_LOONGARCH_CRC32
|
||||
"Use LoongArch CRC32 instructions if supported by the compiler" ON)
|
||||
|
||||
if(XZ_LOONGARCH_CRC32)
|
||||
# LoongArch CRC32 intrinsics are in larchintrin.h.
|
||||
# These are supported by at least GCC and Clang.
|
||||
#
|
||||
# Only 64-bit LoongArch is currently supported.
|
||||
# It doesn't need runtime detection.
|
||||
check_c_source_compiles("
|
||||
#if !(defined(__loongarch__) && __loongarch_grlen >= 64)
|
||||
# error
|
||||
#endif
|
||||
|
||||
#include <larchintrin.h>
|
||||
int main(void)
|
||||
{
|
||||
return __crc_w_w_w(1, 2);
|
||||
}
|
||||
"
|
||||
HAVE_LOONGARCH_CRC32)
|
||||
tuklib_add_definition_if(liblzma HAVE_LOONGARCH_CRC32)
|
||||
endif()
|
||||
|
||||
|
||||
# Symbol visibility support:
|
||||
#
|
||||
|
|
40
configure.ac
40
configure.ac
|
@ -394,6 +394,16 @@ AC_ARG_ENABLE([arm64-crc32], AS_HELP_STRING([--disable-arm64-crc32],
|
|||
[], [enable_arm64_crc32=yes])
|
||||
|
||||
|
||||
################################
|
||||
# LoongArch CRC32 instructions #
|
||||
################################
|
||||
|
||||
AC_ARG_ENABLE([loongarch-crc32], AS_HELP_STRING([--disable-loongarch-crc32],
|
||||
[Do not use LoongArch CRC32 instructions even if support for
|
||||
them is detected.]),
|
||||
[], [enable_loongarch_crc32=yes])
|
||||
|
||||
|
||||
#####################
|
||||
# Size optimization #
|
||||
#####################
|
||||
|
@ -1106,6 +1116,36 @@ AS_IF([test "x$enable_arm64_crc32" = xyes], [
|
|||
])
|
||||
|
||||
|
||||
# LoongArch CRC32 intrinsics are in larchintrin.h.
|
||||
# These are supported by at least GCC and Clang.
|
||||
#
|
||||
# Only 64-bit LoongArch is currently supported.
|
||||
# It doesn't need runtime detection.
|
||||
AC_MSG_CHECKING([if LoongArch CRC32 instructions are usable])
|
||||
AS_IF([test "x$enable_loongarch_crc32" = xno], [
|
||||
AC_MSG_RESULT([no, --disable-loongarch-crc32 was used])
|
||||
], [
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([[
|
||||
#if !(defined(__loongarch__) && __loongarch_grlen >= 64)
|
||||
# error
|
||||
#endif
|
||||
|
||||
#include <larchintrin.h>
|
||||
int main(void)
|
||||
{
|
||||
return __crc_w_w_w(1, 2);
|
||||
}
|
||||
]])], [
|
||||
AC_DEFINE([HAVE_LOONGARCH_CRC32], [1], [Define to 1 if
|
||||
64-bit LoongArch CRC32 instructions are supported.])
|
||||
enable_loongarch_crc32=yes
|
||||
], [
|
||||
enable_loongarch_crc32=no
|
||||
])
|
||||
AC_MSG_RESULT([$enable_loongarch_crc32])
|
||||
])
|
||||
|
||||
|
||||
# Check for sandbox support. If one is found, set enable_sandbox=found.
|
||||
#
|
||||
# About -fsanitize: Of our three sandbox methods, only Landlock is
|
||||
|
|
|
@ -14,7 +14,8 @@ liblzma_la_SOURCES += \
|
|||
check/check.h \
|
||||
check/crc_common.h \
|
||||
check/crc_x86_clmul.h \
|
||||
check/crc32_arm64.h
|
||||
check/crc32_arm64.h \
|
||||
check/crc32_loongarch.h
|
||||
|
||||
if COND_SMALL
|
||||
liblzma_la_SOURCES += check/crc32_small.c
|
||||
|
|
|
@ -19,6 +19,8 @@
|
|||
# include "crc_x86_clmul.h"
|
||||
#elif defined(CRC32_ARM64)
|
||||
# include "crc32_arm64.h"
|
||||
#elif defined(CRC32_LOONGARCH)
|
||||
# include "crc32_loongarch.h"
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
// SPDX-License-Identifier: 0BSD
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
/// \file crc32_loongarch.h
|
||||
/// \brief CRC32 calculation with LoongArch optimization
|
||||
//
|
||||
// Authors: Xi Ruoyao
|
||||
// Lasse Collin
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef LZMA_CRC32_LOONGARCH_H
|
||||
#define LZMA_CRC32_LOONGARCH_H
|
||||
|
||||
#include <larchintrin.h>
|
||||
|
||||
|
||||
static uint32_t
|
||||
crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc_unsigned)
|
||||
{
|
||||
int32_t crc = (int32_t)~crc_unsigned;
|
||||
|
||||
if (size >= 8) {
|
||||
const size_t align = (0 - (uintptr_t)buf) & 7;
|
||||
|
||||
if (align & 1)
|
||||
crc = __crc_w_b_w((int8_t)*buf++, crc);
|
||||
|
||||
if (align & 2) {
|
||||
crc = __crc_w_h_w((int16_t)aligned_read16le(buf), crc);
|
||||
buf += 2;
|
||||
}
|
||||
|
||||
if (align & 4) {
|
||||
crc = __crc_w_w_w((int32_t)aligned_read32le(buf), crc);
|
||||
buf += 4;
|
||||
}
|
||||
|
||||
size -= align;
|
||||
|
||||
for (const uint8_t *limit = buf + (size & ~(size_t)7);
|
||||
buf < limit; buf += 8)
|
||||
crc = __crc_w_d_w((int64_t)aligned_read64le(buf), crc);
|
||||
|
||||
size &= 7;
|
||||
}
|
||||
|
||||
if (size & 4) {
|
||||
crc = __crc_w_w_w((int32_t)aligned_read32le(buf), crc);
|
||||
buf += 4;
|
||||
}
|
||||
|
||||
if (size & 2) {
|
||||
crc = __crc_w_h_w((int16_t)aligned_read16le(buf), crc);
|
||||
buf += 2;
|
||||
}
|
||||
|
||||
if (size & 1)
|
||||
crc = __crc_w_b_w((int8_t)*buf, crc);
|
||||
|
||||
return (uint32_t)~crc;
|
||||
}
|
||||
|
||||
#endif // LZMA_CRC32_LOONGARCH_H
|
|
@ -83,6 +83,9 @@ extern const uint64_t lzma_crc64_table[4][256];
|
|||
// CRC64 could be done with CLMUL but it's not implemented yet.
|
||||
#undef CRC32_ARM64
|
||||
|
||||
// 64-bit LoongArch has CRC32 instructions.
|
||||
#undef CRC32_LOONGARCH
|
||||
|
||||
|
||||
// ARM64
|
||||
//
|
||||
|
@ -112,6 +115,18 @@ extern const uint64_t lzma_crc64_table[4][256];
|
|||
#endif
|
||||
|
||||
|
||||
// LoongArch
|
||||
//
|
||||
// Only 64-bit LoongArch is supported for now. No runtime detection
|
||||
// is needed because the LoongArch specification says that the CRC32
|
||||
// instructions are a part of the Basic Integer Instructions and
|
||||
// they shall be implemented by 64-bit LoongArch implementations.
|
||||
#ifdef HAVE_LOONGARCH_CRC32
|
||||
# define CRC32_ARCH_OPTIMIZED 1
|
||||
# define CRC32_LOONGARCH 1
|
||||
#endif
|
||||
|
||||
|
||||
// x86 and E2K
|
||||
#if defined(HAVE_USABLE_CLMUL)
|
||||
// If CLMUL is allowed unconditionally in the compiler options then
|
||||
|
|
Loading…
Reference in New Issue