
lib/x86/crc32: target pclmul,sse4.1 instead of pclmul
In practice, all CPUs that support PCLMULQDQ also support SSE4.1:

    Intel: Westmere and later + Silvermont and later
    AMD: Bulldozer and later

Therefore, make crc32_x86_pclmulqdq() use SSE4.1 instructions.

To be safe, add an explicit check for SSE4.1 support.  Though as per the
above, this is unnecessary in practice (as far as I can tell).
ebiggers committed Oct 26, 2024
1 parent 972e734 commit 22ac423
Showing 4 changed files with 21 additions and 45 deletions.
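
For reference, a minimal standalone sketch (not part of this commit; GCC/Clang on x86 only, output format invented) of the runtime check the commit adds to libdeflate_init_x86_cpu_features(): CPUID leaf 1 reports PCLMULQDQ in ECX bit 1 and SSE4.1 in ECX bit 19.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int a, b, c, d;

        /* CPUID leaf 1: feature flags come back in ECX/EDX. */
        if (!__get_cpuid(1, &a, &b, &c, &d))
                return 1;

        int pclmulqdq = (c & (1 << 1)) != 0;    /* ECX bit 1 */
        int sse4_1    = (c & (1 << 19)) != 0;   /* ECX bit 19 */

        /* Mirrors the new condition in the diff below: both bits must be set. */
        printf("pclmulqdq=%d sse4.1=%d usable=%d\n",
               pclmulqdq, sse4_1, pclmulqdq && sse4_1);
        return 0;
}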
7 changes: 6 additions & 1 deletion lib/x86/cpu_features.c
@@ -140,7 +140,12 @@ void libdeflate_init_x86_cpu_features(void)
         family += (a >> 20) & 0xff;
         if (d & (1 << 26))
                 features |= X86_CPU_FEATURE_SSE2;
-        if (c & (1 << 1))
+        /*
+         * No known CPUs have pclmulqdq without sse4.1, so in practice code
+         * targeting pclmulqdq can use sse4.1 instructions.  But to be safe,
+         * explicitly check for both the pclmulqdq and sse4.1 bits.
+         */
+        if ((c & (1 << 1)) && (c & (1 << 19)))
                 features |= X86_CPU_FEATURE_PCLMULQDQ;
         if (c & (1 << 27))
                 xcr0 = read_xcr(0);
3 changes: 2 additions & 1 deletion lib/x86/cpu_features.h
@@ -108,7 +108,8 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 # define HAVE_SSE2_NATIVE 0
 #endif
 
-#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__))
+#if (defined(__PCLMUL__) && defined(__SSE4_1__)) || \
+    (defined(_MSC_VER) && defined(__AVX2__))
 # define HAVE_PCLMULQDQ(features) 1
 #else
 # define HAVE_PCLMULQDQ(features) ((features) & X86_CPU_FEATURE_PCLMULQDQ)
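
To see what the widened preprocessor condition buys, here is a self-contained sketch (not libdeflate code; the feature-bit value and the pretend-empty feature word are invented) of how a HAVE_PCLMULQDQ()-style macro collapses to a constant when the compiler already targets PCLMUL and SSE4.1, and degrades to a runtime bit test otherwise.

#include <stdint.h>
#include <stdio.h>

#define X86_CPU_FEATURE_PCLMULQDQ 0x04  /* illustrative bit value only */

#if (defined(__PCLMUL__) && defined(__SSE4_1__)) || \
    (defined(_MSC_VER) && defined(__AVX2__))
#  define HAVE_PCLMULQDQ(features) 1
#else
#  define HAVE_PCLMULQDQ(features) ((features) & X86_CPU_FEATURE_PCLMULQDQ)
#endif

int main(void)
{
        uint32_t features = 0;  /* pretend the CPUID probe found nothing */

        /* Built with -mpclmul -msse4.1 this prints 1 regardless of 'features';
         * otherwise it reflects the runtime feature word. */
        printf("HAVE_PCLMULQDQ = %d\n", HAVE_PCLMULQDQ(features) ? 1 : 0);
        return 0;
}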
24 changes: 8 additions & 16 deletions lib/x86/crc32_impl.h
@@ -44,31 +44,26 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 };
 
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
-/* PCLMULQDQ implementation */
+/*
+ * PCLMULQDQ implementation.  This targets PCLMULQDQ+SSE4.1, since in practice
+ * all CPUs that support PCLMULQDQ also support SSE4.1.
+ */
 # define crc32_x86_pclmulqdq    crc32_x86_pclmulqdq
 # define SUFFIX                 _pclmulqdq
-# define ATTRIBUTES             _target_attribute("pclmul")
+# define ATTRIBUTES             _target_attribute("pclmul,sse4.1")
 # define VL                     16
-# define USE_SSE4_1             0
 # define USE_AVX512             0
 # include "crc32_pclmul_template.h"
 
 /*
- * PCLMULQDQ/AVX implementation.  Compared to the regular PCLMULQDQ
- * implementation, this still uses 128-bit vectors, but it has two potential
- * benefits.  First, simply compiling against the AVX target can improve
- * performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) without
- * actually using any AVX intrinsics, probably due to the availability of
- * non-destructive VEX-encoded instructions.  Second, AVX support implies SSSE3
- * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient
- * handling of partial blocks.  (We *could* compile a variant with
- * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we currently don't bother.)
+ * PCLMULQDQ/AVX implementation.  Same as above, but this is compiled with AVX
+ * enabled so that the compiler can generate VEX-coded instructions which can be
+ * slightly more efficient.  It still uses 128-bit vectors.
  */
 # define crc32_x86_pclmulqdq_avx        crc32_x86_pclmulqdq_avx
 # define SUFFIX                 _pclmulqdq_avx
 # define ATTRIBUTES             _target_attribute("pclmul,avx")
 # define VL                     16
-# define USE_SSE4_1             1
 # define USE_AVX512             0
 # include "crc32_pclmul_template.h"
 #endif
@@ -90,7 +85,6 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 # define SUFFIX                 _vpclmulqdq_avx2
 # define ATTRIBUTES             _target_attribute("vpclmulqdq,pclmul,avx2")
 # define VL                     32
-# define USE_SSE4_1             1
 # define USE_AVX512             0
 # include "crc32_pclmul_template.h"
 #endif
@@ -108,7 +102,6 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 # define SUFFIX                 _vpclmulqdq_avx512_vl256
 # define ATTRIBUTES             _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl" NO_EVEX512)
 # define VL                     32
-# define USE_SSE4_1             1
 # define USE_AVX512             1
 # include "crc32_pclmul_template.h"
 
@@ -121,7 +114,6 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 # define SUFFIX                 _vpclmulqdq_avx512_vl512
 # define ATTRIBUTES             _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl" EVEX512)
 # define VL                     64
-# define USE_SSE4_1             1
 # define USE_AVX512             1
 # include "crc32_pclmul_template.h"
 #endif
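
Each SUFFIX/ATTRIBUTES/VL block above expands crc32_pclmul_template.h into a separately targeted function. The following toy (x86 GCC/Clang only; these are not libdeflate's actual macros, and the names impl, CONCAT, etc. are made up) shows the general include-a-template-per-target pattern with GCC/Clang target attributes, which is all the switch from "pclmul" to "pclmul,sse4.1" relies on.

#include <stdio.h>

#define CONCAT2(a, b)           a##b
#define CONCAT(a, b)            CONCAT2(a, b)
#define ADD_SUFFIX(name)        CONCAT(name, SUFFIX)
#define _target_attribute(attrs) __attribute__((target(attrs)))

/* "Template" body: compiled once per (SUFFIX, ATTRIBUTES) pair. */
#define SUFFIX          _pclmulqdq
#define ATTRIBUTES      _target_attribute("pclmul,sse4.1")
static ATTRIBUTES int ADD_SUFFIX(impl)(void) { return 1; }
#undef SUFFIX
#undef ATTRIBUTES

#define SUFFIX          _pclmulqdq_avx
#define ATTRIBUTES      _target_attribute("pclmul,avx")
static ATTRIBUTES int ADD_SUFFIX(impl)(void) { return 2; }
#undef SUFFIX
#undef ATTRIBUTES

int main(void)
{
        /* Two distinct functions now exist: impl_pclmulqdq() and
         * impl_pclmulqdq_avx(); neither executes any PCLMUL instruction here,
         * so calling them on any x86 CPU is safe. */
        printf("%d %d\n", impl_pclmulqdq(), impl_pclmulqdq_avx());
        return 0;
}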
32 changes: 5 additions & 27 deletions lib/x86/crc32_pclmul_template.h
@@ -34,17 +34,13 @@
  * ATTRIBUTES:
  *      Target function attributes to use.  Must satisfy the dependencies of the
  *      other parameters as follows:
- *         VL=16 && USE_SSE4_1=0 && USE_AVX512=0: at least pclmul
- *         VL=16 && USE_SSE4_1=1 && USE_AVX512=0: at least pclmul,sse4.1
- *         VL=32 && USE_SSE4_1=1 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
- *         VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
- *         VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ *         VL=16 && USE_AVX512=0: at least pclmul,sse4.1
+ *         VL=32 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
+ *         VL=32 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ *         VL=64 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
  *         (Other combinations are not useful and have not been tested.)
  * VL:
  *      Vector length in bytes.  Must be 16, 32, or 64.
- * USE_SSE4_1:
- *      If 1, take advantage of SSE4.1 instructions such as pblendvb.
- *      If 0, assume that the CPU might not support SSE4.1.
  * USE_AVX512:
  *      If 1, take advantage of AVX-512 features such as masking and the
  *      vpternlog instruction.  This doesn't enable the use of 512-bit vectors;
@@ -149,7 +145,6 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults)
 #define fold_vec512 ADD_SUFFIX(fold_vec512)
 #endif /* VL >= 64 */
 
-#if USE_SSE4_1
 /*
  * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to
  * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and
@@ -181,7 +176,6 @@ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
         return fold_vec128(x0, x1, mults_128b);
 }
 #define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes)
-#endif /* USE_SSE4_1 */
 
 static ATTRIBUTES u32
 ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
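
fold_lessthan16bytes() is the piece that needs SSE4.1: it splices a partial block into a 16-byte vector using byte shuffles and a byte blend (pblendvb). A minimal, self-contained demonstration of just the blend step (_mm_blendv_epi8; not libdeflate code, data values invented) follows; build with -msse4.1.

#include <smmintrin.h>  /* SSE4.1: _mm_blendv_epi8 */
#include <stdio.h>

int main(void)
{
        unsigned char a[16], b[16], mask[16], out[16];
        int i;

        for (i = 0; i < 16; i++) {
                a[i] = 0xAA;
                b[i] = 0xBB;
                mask[i] = (i < 5) ? 0x80 : 0x00; /* high bit set => take from b */
        }
        __m128i va = _mm_loadu_si128((const __m128i *)a);
        __m128i vb = _mm_loadu_si128((const __m128i *)b);
        __m128i vm = _mm_loadu_si128((const __m128i *)mask);

        _mm_storeu_si128((__m128i *)out, _mm_blendv_epi8(va, vb, vm));

        for (i = 0; i < 16; i++)
                printf("%02x ", out[i]);        /* bb x5, then aa x11 */
        printf("\n");
        return 0;
}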
@@ -273,7 +267,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
                 size_t align = -(uintptr_t)p & (VL-1);
 
                 len -= align;
-#if USE_SSE4_1
                 x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
                 p += 16;
                 if (align & 15) {
@@ -296,11 +289,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
                         v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
 # endif
                         p -= 16;
-#else
-                crc = crc32_slice1(crc, p, align);
-                p += align;
-                v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
-#endif
         } else {
                 v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
         }
@@ -399,10 +387,8 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
          * If fold_lessthan16bytes() is available, handle any remainder
          * of 1 to 15 bytes now, before reducing to 32 bits.
          */
-#if USE_SSE4_1
         if (len)
                 x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
-#endif
 #if USE_AVX512
 reduce_x0:
 #endif
@@ -467,14 +453,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
         x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
                                   barrett_reduction_constants, 0x10);
         x0 = _mm_xor_si128(x0, x1);
-#if USE_SSE4_1
-        crc = _mm_extract_epi32(x0, 1);
-#else
-        crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01));
-        /* Process up to 15 bytes left over at the end. */
-        crc = crc32_slice1(crc, p, len);
-#endif
-        return crc;
+        return _mm_extract_epi32(x0, 1);
 }
 
 #undef vec_t
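
The hunk above replaces the SSE2 shuffle-and-move sequence with a single SSE4.1 extract. A quick standalone check (not libdeflate code; vector values invented) that both read the same 32-bit lane; build with -msse4.1.

#include <smmintrin.h>
#include <stdio.h>

int main(void)
{
        __m128i x = _mm_set_epi32(4, 3, 2, 1);  /* lanes 3..0 hold 4,3,2,1 */

        unsigned int via_sse41 = (unsigned int)_mm_extract_epi32(x, 1);
        unsigned int via_sse2 =
                (unsigned int)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 0x01));

        printf("%u %u\n", via_sse41, via_sse2); /* both print 2 */
        return 0;
}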
@@ -491,5 +470,4 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 #undef SUFFIX
 #undef ATTRIBUTES
 #undef VL
-#undef USE_SSE4_1
 #undef USE_AVX512
