mirror of https://git.tukaani.org/xz.git
liblzma: Avoid extern lzma_crc32_clmul() and lzma_crc64_clmul().
A CLMUL-only build will have the crcxx_clmul() inlined into lzma_crcxx(). Previously a jump to the extern lzma_crcxx_clmul() was needed. Notes about shared liblzma on ELF platforms: - On platforms that support ifunc and -fvisibility=hidden, this was silly because CLMUL-only build would have that single extra jump instruction of extra overhead. - On platforms that support neither -fvisibility=hidden nor linker version script (liblzma*.map), jumping to lzma_crcxx_clmul() would go via PLT so a few more instructions of overhead (still not a big issue but silly nevertheless). There was a downside with static liblzma too: if an application only needs lzma_crc64(), static linking would make the linker include the CLMUL code for both CRC32 and CRC64 from crc_x86_clmul.o even though the CRC32 code wouldn't be needed, thus increasing code size of the executable (assuming that -ffunction-sections isn't used). Also, now compilers are likely to inline crc_simd_body() even if they don't support the always_inline attribute (or MSVC's __forceinline). Quite possibly all compilers that build the code do support such an attribute. But now it likely isn't a problem even if the attribute wasn't supported. Now all x86-specific stuff is in crc_x86_clmul.h. If other archs The other archs can then have their own headers with their own is_clmul_supported() and crcxx_clmul(). Another bonus is that the build system doesn't need to care if crc_clmul.c is needed. is_clmul_supported() stays as inline function as it's not needed when doing a CLMUL-only build (avoids a warning about unused function).
This commit is contained in:
parent
e3833e297d
commit
419f55f9df
|
@ -229,6 +229,7 @@ add_library(liblzma
|
||||||
src/liblzma/check/check.c
|
src/liblzma/check/check.c
|
||||||
src/liblzma/check/check.h
|
src/liblzma/check/check.h
|
||||||
src/liblzma/check/crc_common.h
|
src/liblzma/check/crc_common.h
|
||||||
|
src/liblzma/check/crc_x86_clmul.h
|
||||||
src/liblzma/common/block_util.c
|
src/liblzma/common/block_util.c
|
||||||
src/liblzma/common/common.c
|
src/liblzma/common/common.c
|
||||||
src/liblzma/common/common.h
|
src/liblzma/common/common.h
|
||||||
|
@ -1000,11 +1001,7 @@ calculation if supported by the system" ON)
|
||||||
int main(void) { return 0; }
|
int main(void) { return 0; }
|
||||||
"
|
"
|
||||||
HAVE_USABLE_CLMUL)
|
HAVE_USABLE_CLMUL)
|
||||||
|
tuklib_add_definition_if(liblzma HAVE_USABLE_CLMUL)
|
||||||
if(HAVE_USABLE_CLMUL)
|
|
||||||
target_sources(liblzma PRIVATE src/liblzma/check/crc_clmul.c)
|
|
||||||
target_compile_definitions(liblzma PRIVATE HAVE_USABLE_CLMUL)
|
|
||||||
endif()
|
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
|
@ -1086,7 +1086,6 @@ __m128i my_clmul(__m128i a)
|
||||||
])
|
])
|
||||||
AC_MSG_RESULT([$enable_clmul_crc])
|
AC_MSG_RESULT([$enable_clmul_crc])
|
||||||
])
|
])
|
||||||
AM_CONDITIONAL([COND_CRC_CLMUL], [test "x$enable_clmul_crc" = xyes])
|
|
||||||
|
|
||||||
# Check for sandbox support. If one is found, set enable_sandbox=found.
|
# Check for sandbox support. If one is found, set enable_sandbox=found.
|
||||||
#
|
#
|
||||||
|
|
|
@ -14,7 +14,8 @@ EXTRA_DIST += \
|
||||||
liblzma_la_SOURCES += \
|
liblzma_la_SOURCES += \
|
||||||
check/check.c \
|
check/check.c \
|
||||||
check/check.h \
|
check/check.h \
|
||||||
check/crc_common.h
|
check/crc_common.h \
|
||||||
|
check/crc_x86_clmul.h
|
||||||
|
|
||||||
if COND_SMALL
|
if COND_SMALL
|
||||||
liblzma_la_SOURCES += check/crc32_small.c
|
liblzma_la_SOURCES += check/crc32_small.c
|
||||||
|
@ -27,9 +28,6 @@ if COND_ASM_X86
|
||||||
liblzma_la_SOURCES += check/crc32_x86.S
|
liblzma_la_SOURCES += check/crc32_x86.S
|
||||||
else
|
else
|
||||||
liblzma_la_SOURCES += check/crc32_fast.c
|
liblzma_la_SOURCES += check/crc32_fast.c
|
||||||
if COND_CRC_CLMUL
|
|
||||||
liblzma_la_SOURCES += check/crc_clmul.c
|
|
||||||
endif
|
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
|
@ -15,6 +15,11 @@
|
||||||
#include "check.h"
|
#include "check.h"
|
||||||
#include "crc_common.h"
|
#include "crc_common.h"
|
||||||
|
|
||||||
|
#ifdef CRC_CLMUL
|
||||||
|
# define BUILDING_CRC32_CLMUL
|
||||||
|
# include "crc_x86_clmul.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef CRC_GENERIC
|
#ifdef CRC_GENERIC
|
||||||
|
|
||||||
|
@ -132,7 +137,7 @@ typedef uint32_t (*crc32_func_type)(
|
||||||
static crc32_func_type
|
static crc32_func_type
|
||||||
crc32_resolve(void)
|
crc32_resolve(void)
|
||||||
{
|
{
|
||||||
return is_clmul_supported() ? &lzma_crc32_clmul : &crc32_generic;
|
return is_clmul_supported() ? &crc32_clmul : &crc32_generic;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(HAVE_FUNC_ATTRIBUTE_IFUNC) && defined(__clang__)
|
#if defined(HAVE_FUNC_ATTRIBUTE_IFUNC) && defined(__clang__)
|
||||||
|
@ -221,7 +226,7 @@ lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc)
|
||||||
return crc32_func(buf, size, crc);
|
return crc32_func(buf, size, crc);
|
||||||
|
|
||||||
#elif defined(CRC_CLMUL)
|
#elif defined(CRC_CLMUL)
|
||||||
return lzma_crc32_clmul(buf, size, crc);
|
return crc32_clmul(buf, size, crc);
|
||||||
|
|
||||||
#else
|
#else
|
||||||
return crc32_generic(buf, size, crc);
|
return crc32_generic(buf, size, crc);
|
||||||
|
|
|
@ -14,6 +14,11 @@
|
||||||
#include "check.h"
|
#include "check.h"
|
||||||
#include "crc_common.h"
|
#include "crc_common.h"
|
||||||
|
|
||||||
|
#ifdef CRC_CLMUL
|
||||||
|
# define BUILDING_CRC64_CLMUL
|
||||||
|
# include "crc_x86_clmul.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef CRC_GENERIC
|
#ifdef CRC_GENERIC
|
||||||
|
|
||||||
|
@ -97,7 +102,7 @@ typedef uint64_t (*crc64_func_type)(
|
||||||
static crc64_func_type
|
static crc64_func_type
|
||||||
crc64_resolve(void)
|
crc64_resolve(void)
|
||||||
{
|
{
|
||||||
return is_clmul_supported() ? &lzma_crc64_clmul : &crc64_generic;
|
return is_clmul_supported() ? &crc64_clmul : &crc64_generic;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(HAVE_FUNC_ATTRIBUTE_IFUNC) && defined(__clang__)
|
#if defined(HAVE_FUNC_ATTRIBUTE_IFUNC) && defined(__clang__)
|
||||||
|
@ -160,7 +165,7 @@ lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
|
||||||
//
|
//
|
||||||
// FIXME: Lookup table isn't currently omitted on 32-bit x86,
|
// FIXME: Lookup table isn't currently omitted on 32-bit x86,
|
||||||
// see crc64_table.c.
|
// see crc64_table.c.
|
||||||
return lzma_crc64_clmul(buf, size, crc);
|
return crc64_clmul(buf, size, crc);
|
||||||
|
|
||||||
#else
|
#else
|
||||||
return crc64_generic(buf, size, crc);
|
return crc64_generic(buf, size, crc);
|
||||||
|
|
|
@ -108,70 +108,6 @@
|
||||||
# define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1
|
# define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1
|
||||||
# endif
|
# endif
|
||||||
*/
|
*/
|
||||||
|
|
||||||
# if defined(_MSC_VER)
|
|
||||||
# include <intrin.h>
|
|
||||||
# elif defined(HAVE_CPUID_H)
|
|
||||||
# include <cpuid.h>
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// is_clmul_supported() must be inlined in this header file because the
|
|
||||||
// ifunc resolver function may not support calling a function in another
|
|
||||||
// translation unit. Depending on compiler-toolchain and flags, a call to
|
|
||||||
// a function defined in another translation unit could result in a
|
|
||||||
// reference to the PLT, which is unsafe to do in an ifunc resolver. The
|
|
||||||
// ifunc resolver runs very early when loading a shared library, so the PLT
|
|
||||||
// entries may not be setup at that time. Inlining this function duplicates
|
|
||||||
// the function body in crc32_resolve() and crc64_resolve(), but this is
|
|
||||||
// acceptable because the function results in very few instructions.
|
|
||||||
static inline bool
|
|
||||||
is_clmul_supported(void)
|
|
||||||
{
|
|
||||||
int success = 1;
|
|
||||||
uint32_t r[4]; // eax, ebx, ecx, edx
|
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
|
||||||
// This needs <intrin.h> with MSVC. ICC has it as a built-in
|
|
||||||
// on all platforms.
|
|
||||||
__cpuid(r, 1);
|
|
||||||
#elif defined(HAVE_CPUID_H)
|
|
||||||
// Compared to just using __asm__ to run CPUID, this also checks
|
|
||||||
// that CPUID is supported and saves and restores ebx as that is
|
|
||||||
// needed with GCC < 5 with position-independent code (PIC).
|
|
||||||
success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
|
|
||||||
#else
|
|
||||||
// Just a fallback that shouldn't be needed.
|
|
||||||
__asm__("cpuid\n\t"
|
|
||||||
: "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
|
|
||||||
: "a"(1), "c"(0));
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Returns true if these are supported:
|
|
||||||
// CLMUL (bit 1 in ecx)
|
|
||||||
// SSSE3 (bit 9 in ecx)
|
|
||||||
// SSE4.1 (bit 19 in ecx)
|
|
||||||
const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
|
|
||||||
return success && (r[2] & ecx_mask) == ecx_mask;
|
|
||||||
|
|
||||||
// Alternative methods that weren't used:
|
|
||||||
// - ICC's _may_i_use_cpu_feature: the other methods should work too.
|
|
||||||
// - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
|
|
||||||
//
|
|
||||||
// CPUID decding is needed with MSVC anyway and older GCC. This keeps
|
|
||||||
// the feature checks in the build system simpler too. The nice thing
|
|
||||||
// about __builtin_cpu_supports would be that it generates very short
|
|
||||||
// code as is it only reads a variable set at startup but a few bytes
|
|
||||||
// doesn't matter here.
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/// CRC32 implemented with the x86 CLMUL instruction.
|
|
||||||
extern uint32_t lzma_crc32_clmul(const uint8_t *buf, size_t size,
|
|
||||||
uint32_t crc);
|
|
||||||
|
|
||||||
/// CRC64 implemented with the x86 CLMUL instruction.
|
|
||||||
extern uint64_t lzma_crc64_clmul(const uint8_t *buf, size_t size,
|
|
||||||
uint64_t crc);
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -1,11 +1,10 @@
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
//
|
//
|
||||||
/// \file crc_clmul.c
|
/// \file crc_x86_clmul.h
|
||||||
/// \brief CRC32 and CRC64 implementations using CLMUL instructions.
|
/// \brief CRC32 and CRC64 implementations using CLMUL instructions.
|
||||||
///
|
///
|
||||||
/// lzma_crc32_clmul() and lzma_crc64_clmul() use 32/64-bit x86
|
/// crc32_clmul() and crc64_clmul() use 32/64-bit x86 SSSE3, SSE4.1, and
|
||||||
/// SSSE3, SSE4.1, and CLMUL instructions. This is compatible with
|
/// CLMUL instructions. This is compatible with Elbrus 2000 (E2K) too.
|
||||||
/// Elbrus 2000 (E2K) too.
|
|
||||||
///
|
///
|
||||||
/// They were derived from
|
/// They were derived from
|
||||||
/// https://www.researchgate.net/publication/263424619_Fast_CRC_computation
|
/// https://www.researchgate.net/publication/263424619_Fast_CRC_computation
|
||||||
|
@ -27,9 +26,20 @@
|
||||||
//
|
//
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
#include "crc_common.h"
|
// This file must not be included more than once.
|
||||||
|
#ifdef LZMA_CRC_X86_CLMUL_H
|
||||||
|
# error crc_x86_clmul.h was included twice.
|
||||||
|
#endif
|
||||||
|
#define LZMA_CRC_X86_CLMUL_H
|
||||||
|
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
# include <intrin.h>
|
||||||
|
#elif defined(HAVE_CPUID_H)
|
||||||
|
# include <cpuid.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
// EDG-based compilers (Intel's classic compiler and compiler for E2K) can
|
// EDG-based compilers (Intel's classic compiler and compiler for E2K) can
|
||||||
// define __GNUC__ but the attribute must not be used with them.
|
// define __GNUC__ but the attribute must not be used with them.
|
||||||
|
@ -225,12 +235,12 @@ calc_hi(uint64_t p, uint64_t a, int n)
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifdef HAVE_CHECK_CRC32
|
#ifdef BUILDING_CRC32_CLMUL
|
||||||
|
|
||||||
crc_attr_target
|
crc_attr_target
|
||||||
crc_attr_no_sanitize_address
|
crc_attr_no_sanitize_address
|
||||||
extern uint32_t
|
static uint32_t
|
||||||
lzma_crc32_clmul(const uint8_t *buf, size_t size, uint32_t crc)
|
crc32_clmul(const uint8_t *buf, size_t size, uint32_t crc)
|
||||||
{
|
{
|
||||||
#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
||||||
// The code assumes that there is at least one byte of input.
|
// The code assumes that there is at least one byte of input.
|
||||||
|
@ -265,7 +275,7 @@ lzma_crc32_clmul(const uint8_t *buf, size_t size, uint32_t crc)
|
||||||
v0 = _mm_xor_si128(v0, v2); // [2]
|
v0 = _mm_xor_si128(v0, v2); // [2]
|
||||||
return ~(uint32_t)_mm_extract_epi32(v0, 2);
|
return ~(uint32_t)_mm_extract_epi32(v0, 2);
|
||||||
}
|
}
|
||||||
#endif // HAVE_CHECK_CRC32
|
#endif // BUILDING_CRC32_CLMUL
|
||||||
|
|
||||||
|
|
||||||
/////////////////////
|
/////////////////////
|
||||||
|
@ -299,7 +309,7 @@ calc_hi(uint64_t poly, uint64_t a)
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifdef HAVE_CHECK_CRC64
|
#ifdef BUILDING_CRC64_CLMUL
|
||||||
|
|
||||||
// MSVC (VS2015 - VS2022) produces bad 32-bit x86 code from the CLMUL CRC
|
// MSVC (VS2015 - VS2022) produces bad 32-bit x86 code from the CLMUL CRC
|
||||||
// code when optimizations are enabled (release build). According to the bug
|
// code when optimizations are enabled (release build). According to the bug
|
||||||
|
@ -318,8 +328,8 @@ calc_hi(uint64_t poly, uint64_t a)
|
||||||
|
|
||||||
crc_attr_target
|
crc_attr_target
|
||||||
crc_attr_no_sanitize_address
|
crc_attr_no_sanitize_address
|
||||||
extern uint64_t
|
static uint64_t
|
||||||
lzma_crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc)
|
crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc)
|
||||||
{
|
{
|
||||||
#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
||||||
// The code assumes that there is at least one byte of input.
|
// The code assumes that there is at least one byte of input.
|
||||||
|
@ -366,4 +376,54 @@ lzma_crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc)
|
||||||
# pragma optimize("", on)
|
# pragma optimize("", on)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif // HAVE_CHECK_CRC64
|
#endif // BUILDING_CRC64_CLMUL
|
||||||
|
|
||||||
|
|
||||||
|
// is_clmul_supported() must be inlined in this header file because the
|
||||||
|
// ifunc resolver function may not support calling a function in another
|
||||||
|
// translation unit. Depending on compiler-toolchain and flags, a call to
|
||||||
|
// a function defined in another translation unit could result in a
|
||||||
|
// reference to the PLT, which is unsafe to do in an ifunc resolver. The
|
||||||
|
// ifunc resolver runs very early when loading a shared library, so the PLT
|
||||||
|
// entries may not be setup at that time. Inlining this function duplicates
|
||||||
|
// the function body in crc32_resolve() and crc64_resolve(), but this is
|
||||||
|
// acceptable because the function results in very few instructions.
|
||||||
|
static inline bool
|
||||||
|
is_clmul_supported(void)
|
||||||
|
{
|
||||||
|
int success = 1;
|
||||||
|
uint32_t r[4]; // eax, ebx, ecx, edx
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
// This needs <intrin.h> with MSVC. ICC has it as a built-in
|
||||||
|
// on all platforms.
|
||||||
|
__cpuid(r, 1);
|
||||||
|
#elif defined(HAVE_CPUID_H)
|
||||||
|
// Compared to just using __asm__ to run CPUID, this also checks
|
||||||
|
// that CPUID is supported and saves and restores ebx as that is
|
||||||
|
// needed with GCC < 5 with position-independent code (PIC).
|
||||||
|
success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
|
||||||
|
#else
|
||||||
|
// Just a fallback that shouldn't be needed.
|
||||||
|
__asm__("cpuid\n\t"
|
||||||
|
: "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
|
||||||
|
: "a"(1), "c"(0));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Returns true if these are supported:
|
||||||
|
// CLMUL (bit 1 in ecx)
|
||||||
|
// SSSE3 (bit 9 in ecx)
|
||||||
|
// SSE4.1 (bit 19 in ecx)
|
||||||
|
const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
|
||||||
|
return success && (r[2] & ecx_mask) == ecx_mask;
|
||||||
|
|
||||||
|
// Alternative methods that weren't used:
|
||||||
|
// - ICC's _may_i_use_cpu_feature: the other methods should work too.
|
||||||
|
// - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
|
||||||
|
//
|
||||||
|
// CPUID decding is needed with MSVC anyway and older GCC. This keeps
|
||||||
|
// the feature checks in the build system simpler too. The nice thing
|
||||||
|
// about __builtin_cpu_supports would be that it generates very short
|
||||||
|
// code as is it only reads a variable set at startup but a few bytes
|
||||||
|
// doesn't matter here.
|
||||||
|
}
|
Loading…
Reference in New Issue