From 96b663f67c0e738a99ba8f35d9f4ced9add74544 Mon Sep 17 00:00:00 2001 From: Jia Tan Date: Sat, 14 Oct 2023 13:23:23 +0800 Subject: [PATCH] liblzma: Refactor CRC comments. A detailed description of the three dispatch methods was added. Also, duplicated comments now only appear in crc32_fast.c or were removed from both crc32_fast.c and crc64_fast.c if they appeared in crc_clmul.c. --- src/liblzma/check/crc32_fast.c | 64 ++++++++++++++++++++++++---------- src/liblzma/check/crc64_fast.c | 61 +++++--------------------------- 2 files changed, 53 insertions(+), 72 deletions(-) diff --git a/src/liblzma/check/crc32_fast.c b/src/liblzma/check/crc32_fast.c index 8849a476..add93d55 100644 --- a/src/liblzma/check/crc32_fast.c +++ b/src/liblzma/check/crc32_fast.c @@ -2,25 +2,6 @@ // /// \file crc32.c /// \brief CRC32 calculation -/// -/// There are two methods in this file. -/// crc32_generic uses the slice-by-eight algorithm. -/// It is explained in this document: -/// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf -/// The code in this file is not the same as in Intel's paper, but -/// the basic principle is identical. -/// -/// crc32_clmul uses 32/64-bit x86 SSSE3, SSE4.1, and CLMUL instructions. -/// It was derived from -/// https://www.researchgate.net/publication/263424619_Fast_CRC_computation -/// and the public domain code from https://github.com/rawrunprotected/crc -/// (URLs were checked on 2023-09-29). -/// -/// FIXME: Builds for 32-bit x86 use crc32_x86.S by default instead -/// of this file and thus CLMUL version isn't available on 32-bit x86 -/// unless configured with --disable-assembler. Even then the lookup table -/// isn't omitted in crc32_table.c since it doesn't know that assembly -/// code has been disabled. // // Authors: Lasse Collin // Ilya Kurdyukov @@ -100,6 +81,38 @@ crc32_generic(const uint8_t *buf, size_t size, uint32_t crc) #endif #if defined(CRC_GENERIC) && defined(CRC_CLMUL) + +////////////////////////// +// Function dispatching // +////////////////////////// + +// If both the generic and CLMUL implementations are built, then the +// function to use is selected at runtime since system running the +// binary may not have the CLMUL instructions. +// The three dispatch methods in order of priority: +// +// 1. Indirect function (ifunc). This method is slightly more efficient +// than the constructor method because it will change the entry in the +// Procedure Linkage Table (PLT) for the function either at load time or +// at the first call. This avoids having to call the function through a +// function pointer and will treat the function call like a regular call +// through the PLT. ifuncs are created by using +// __attribute__((__ifunc__("resolver"))) on a function which has no +// body. The "resolver" is the name of the function that chooses at +// runtime which implementation to use. +// +// 2. Constructor. This method uses __attribute__((__constructor__)) to +// set crc32_func at load time. This avoids extra computation (and any +// unlikely threading bugs) on the first call to lzma_crc32() to decide +// which implementation should be used. +// +// 3. First Call Resolution. On the very first call to lzma_crc32(), the +// call will be directed to crc32_dispatch() instead. This will set the +// appropriate implementation function and will not be called again. +// This method does not use any kind of locking but is safe because if +// multiple threads run the dispatcher simultaneously then they will all +// set crc32_func to the same value. + typedef uint32_t (*crc32_func_type)( const uint8_t *buf, size_t size, uint32_t crc); @@ -111,6 +124,9 @@ typedef uint32_t (*crc32_func_type)( # pragma GCC diagnostic ignored "-Wunused-function" #endif +// This resolver is shared between all three dispatch methods. It serves as +// the ifunc resolver if ifunc is supported, otherwise it is called as a +// regular function by the constructor or first call resolution methods. static crc32_func_type crc32_resolve(void) { @@ -124,9 +140,11 @@ crc32_resolve(void) #ifndef HAVE_FUNC_ATTRIBUTE_IFUNC #ifdef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR +// Constructor method. # define CRC32_SET_FUNC_ATTR __attribute__((__constructor__)) static crc32_func_type crc32_func; #else +// First Call Resolution method. # define CRC32_SET_FUNC_ATTR static uint32_t crc32_dispatch(const uint8_t *buf, size_t size, uint32_t crc); static crc32_func_type crc32_func = &crc32_dispatch; @@ -190,6 +208,14 @@ lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc) return crc32_generic(buf, size, crc); #endif +/* +#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR + // See crc32_dispatch(). This would be the alternative which uses + // locking and doesn't use crc32_dispatch(). Note that on Windows + // this method needs Vista threads. + mythread_once(crc64_set_func); +#endif +*/ return crc32_func(buf, size, crc); #elif defined(CRC_CLMUL) diff --git a/src/liblzma/check/crc64_fast.c b/src/liblzma/check/crc64_fast.c index 07f4f576..8acdc713 100644 --- a/src/liblzma/check/crc64_fast.c +++ b/src/liblzma/check/crc64_fast.c @@ -2,23 +2,6 @@ // /// \file crc64.c /// \brief CRC64 calculation -/// -/// There are two methods in this file. crc64_generic uses the -/// the slice-by-four algorithm. This is the same idea that is -/// used in crc32_fast.c, but for CRC64 we use only four tables -/// instead of eight to avoid increasing CPU cache usage. -/// -/// crc64_clmul uses 32/64-bit x86 SSSE3, SSE4.1, and CLMUL instructions. -/// It was derived from -/// https://www.researchgate.net/publication/263424619_Fast_CRC_computation -/// and the public domain code from https://github.com/rawrunprotected/crc -/// (URLs were checked on 2023-09-29). -/// -/// FIXME: Builds for 32-bit x86 use crc64_x86.S by default instead -/// of this file and thus CLMUL version isn't available on 32-bit x86 -/// unless configured with --disable-assembler. Even then the lookup table -/// isn't omitted in crc64_table.c since it doesn't know that assembly -/// code has been disabled. // // Authors: Lasse Collin // Ilya Kurdyukov @@ -93,12 +76,16 @@ crc64_generic(const uint8_t *buf, size_t size, uint64_t crc) #endif #if defined(CRC_GENERIC) && defined(CRC_CLMUL) +////////////////////////// +// Function dispatching // +////////////////////////// + +// If both the generic and CLMUL implementations are usable, then the +// function that is used is selected at runtime. See crc32_fast.c. + typedef uint64_t (*crc64_func_type)( const uint8_t *buf, size_t size, uint64_t crc); -// Clang 16.0.0 and older has a bug where it marks the ifunc resolver -// function as unused since it is static and never used outside of -// __attribute__((__ifunc__())). #if defined(HAVE_FUNC_ATTRIBUTE_IFUNC) && defined(__clang__) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wunused-function" @@ -139,13 +126,6 @@ crc64_set_func(void) static uint64_t crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc) { - // When __attribute__((__ifunc__(...))) and - // __attribute__((__constructor__)) isn't supported, set the - // function pointer without any locking. If multiple threads run - // the detection code in parallel, they will all end up setting - // the pointer to the same value. This avoids the use of - // mythread_once() on every call to lzma_crc64() but this likely - // isn't strictly standards compliant. Let's change it if it breaks. crc64_set_func(); return crc64_func(buf, size, crc); } @@ -163,36 +143,11 @@ extern LZMA_API(uint64_t) lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc) { #if defined(CRC_GENERIC) && defined(CRC_CLMUL) - // If CLMUL is available, it is the best for non-tiny inputs, - // being over twice as fast as the generic slice-by-four version. - // However, for size <= 16 it's different. In the extreme case - // of size == 1 the generic version can be five times faster. - // At size >= 8 the CLMUL starts to become reasonable. It - // varies depending on the alignment of buf too. - // - // The above doesn't include the overhead of mythread_once(). - // At least on x86-64 GNU/Linux, pthread_once() is very fast but - // it still makes lzma_crc64(buf, 1, crc) 50-100 % slower. When - // size reaches 12-16 bytes the overhead becomes negligible. - // - // So using the generic version for size <= 16 may give better - // performance with tiny inputs but if such inputs happen rarely - // it's not so obvious because then the lookup table of the - // generic version may not be in the processor cache. + #ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS if (size <= 16) return crc64_generic(buf, size, crc); #endif - -/* -#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR - // See crc64_dispatch(). This would be the alternative which uses - // locking and doesn't use crc64_dispatch(). Note that on Windows - // this method needs Vista threads. - mythread_once(crc64_set_func); -#endif -*/ - return crc64_func(buf, size, crc); #elif defined(CRC_CLMUL)