liblzma: Speed up CRC32 calculation on 64-bit LoongArch

The crc.w.{b/h/w/d}.w instructions in LoongArch can calculate the CRC32
result for 1/2/4/8 bytes in a single operation. Using these is much
faster compared to the generic method.

Optimized CRC32 is enabled unconditionally on 64-bit LoongArch because
the LoongArch specification says that CRC32 instructions shall be
implemented for 64-bit processors. Optimized CRC32 isn't enabled for
32-bit LoongArch processors because not enough information is available
about them.

Co-authored-by: Lasse Collin <lasse.collin@tukaani.org>

Closes: https://github.com/tukaani-project/xz/pull/86
This commit is contained in:
Xi Ruoyao 2024-06-28 13:36:43 +03:00 committed by Lasse Collin
parent 0ed8936685
commit 7baf6835cf
6 changed files with 149 additions and 1 deletions

View File

@ -548,6 +548,7 @@ add_library(liblzma
src/liblzma/check/crc_common.h
src/liblzma/check/crc_x86_clmul.h
src/liblzma/check/crc32_arm64.h
src/liblzma/check/crc32_loongarch.h
src/liblzma/common/block_util.c
src/liblzma/common/common.c
src/liblzma/common/common.h
@ -1341,6 +1342,30 @@ if(XZ_ARM64_CRC32)
endif()
endif()
option(XZ_LOONGARCH_CRC32
"Use LoongArch CRC32 instructions if supported by the compiler" ON)
if(XZ_LOONGARCH_CRC32)
# LoongArch CRC32 intrinsics are in larchintrin.h.
# These are supported by at least GCC and Clang.
#
# Only 64-bit LoongArch is currently supported.
# It doesn't need runtime detection.
check_c_source_compiles("
#if !(defined(__loongarch__) && __loongarch_grlen >= 64)
# error
#endif
#include <larchintrin.h>
int main(void)
{
return __crc_w_w_w(1, 2);
}
"
HAVE_LOONGARCH_CRC32)
tuklib_add_definition_if(liblzma HAVE_LOONGARCH_CRC32)
endif()
# Symbol visibility support:
#

View File

@ -394,6 +394,16 @@ AC_ARG_ENABLE([arm64-crc32], AS_HELP_STRING([--disable-arm64-crc32],
[], [enable_arm64_crc32=yes])
################################
# LoongArch CRC32 instructions #
################################
AC_ARG_ENABLE([loongarch-crc32], AS_HELP_STRING([--disable-loongarch-crc32],
[Do not use LoongArch CRC32 instructions even if support for
them is detected.]),
[], [enable_loongarch_crc32=yes])
#####################
# Size optimization #
#####################
@ -1106,6 +1116,36 @@ AS_IF([test "x$enable_arm64_crc32" = xyes], [
])
# LoongArch CRC32 intrinsics are in larchintrin.h.
# These are supported by at least GCC and Clang.
#
# Only 64-bit LoongArch is currently supported.
# It doesn't need runtime detection.
AC_MSG_CHECKING([if LoongArch CRC32 instructions are usable])
AS_IF([test "x$enable_loongarch_crc32" = xno], [
AC_MSG_RESULT([no, --disable-loongarch-crc32 was used])
], [
AC_LINK_IFELSE([AC_LANG_SOURCE([[
#if !(defined(__loongarch__) && __loongarch_grlen >= 64)
# error
#endif
#include <larchintrin.h>
int main(void)
{
return __crc_w_w_w(1, 2);
}
]])], [
AC_DEFINE([HAVE_LOONGARCH_CRC32], [1], [Define to 1 if
64-bit LoongArch CRC32 instructions are supported.])
enable_loongarch_crc32=yes
], [
enable_loongarch_crc32=no
])
AC_MSG_RESULT([$enable_loongarch_crc32])
])
# Check for sandbox support. If one is found, set enable_sandbox=found.
#
# About -fsanitize: Of our three sandbox methods, only Landlock is

View File

@ -14,7 +14,8 @@ liblzma_la_SOURCES += \
check/check.h \
check/crc_common.h \
check/crc_x86_clmul.h \
check/crc32_arm64.h
check/crc32_arm64.h \
check/crc32_loongarch.h
if COND_SMALL
liblzma_la_SOURCES += check/crc32_small.c

View File

@ -19,6 +19,8 @@
# include "crc_x86_clmul.h"
#elif defined(CRC32_ARM64)
# include "crc32_arm64.h"
#elif defined(CRC32_LOONGARCH)
# include "crc32_loongarch.h"
#endif

View File

@ -0,0 +1,65 @@
// SPDX-License-Identifier: 0BSD
///////////////////////////////////////////////////////////////////////////////
//
/// \file crc32_loongarch.h
/// \brief CRC32 calculation with LoongArch optimization
//
// Authors: Xi Ruoyao
// Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////
#ifndef LZMA_CRC32_LOONGARCH_H
#define LZMA_CRC32_LOONGARCH_H
#include <larchintrin.h>
static uint32_t
crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc_unsigned)
{
int32_t crc = (int32_t)~crc_unsigned;
if (size >= 8) {
const size_t align = (0 - (uintptr_t)buf) & 7;
if (align & 1)
crc = __crc_w_b_w((int8_t)*buf++, crc);
if (align & 2) {
crc = __crc_w_h_w((int16_t)aligned_read16le(buf), crc);
buf += 2;
}
if (align & 4) {
crc = __crc_w_w_w((int32_t)aligned_read32le(buf), crc);
buf += 4;
}
size -= align;
for (const uint8_t *limit = buf + (size & ~(size_t)7);
buf < limit; buf += 8)
crc = __crc_w_d_w((int64_t)aligned_read64le(buf), crc);
size &= 7;
}
if (size & 4) {
crc = __crc_w_w_w((int32_t)aligned_read32le(buf), crc);
buf += 4;
}
if (size & 2) {
crc = __crc_w_h_w((int16_t)aligned_read16le(buf), crc);
buf += 2;
}
if (size & 1)
crc = __crc_w_b_w((int8_t)*buf, crc);
return (uint32_t)~crc;
}
#endif // LZMA_CRC32_LOONGARCH_H

View File

@ -83,6 +83,9 @@ extern const uint64_t lzma_crc64_table[4][256];
// CRC64 could be done with CLMUL but it's not implemented yet.
#undef CRC32_ARM64
// 64-bit LoongArch has CRC32 instructions.
#undef CRC32_LOONGARCH
// ARM64
//
@ -112,6 +115,18 @@ extern const uint64_t lzma_crc64_table[4][256];
#endif
// LoongArch
//
// Only 64-bit LoongArch is supported for now. No runtime detection
// is needed because the LoongArch specification says that the CRC32
// instructions are a part of the Basic Integer Instructions and
// they shall be implemented by 64-bit LoongArch implementations.
#ifdef HAVE_LOONGARCH_CRC32
# define CRC32_ARCH_OPTIMIZED 1
# define CRC32_LOONGARCH 1
#endif
// x86 and E2K
#if defined(HAVE_USABLE_CLMUL)
// If CLMUL is allowed unconditionally in the compiler options then