From 495414bb3e235ea6a499e7b7c31348bc4c79a917 Mon Sep 17 00:00:00 2001 From: Alfred Klomp Date: Sat, 23 Nov 2019 16:29:33 +0100 Subject: [PATCH] AVX2: dec: unroll inner loop --- lib/arch/avx2/dec_loop.c | 113 ++++++++++++++++++++++++++------------- 1 file changed, 76 insertions(+), 37 deletions(-) diff --git a/lib/arch/avx2/dec_loop.c b/lib/arch/avx2/dec_loop.c index c344d8aa..f959fc4b 100644 --- a/lib/arch/avx2/dec_loop.c +++ b/lib/arch/avx2/dec_loop.c @@ -1,19 +1,6 @@ -static inline void -dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +static inline int +dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds) { - if (*slen < 45) { - return; - } - - // Process blocks of 32 bytes per round. Because 8 extra zero bytes are - // written after the output, ensure that there will be at least 13 - // bytes of input data left to cover the gap. (11 data bytes and up to - // two end-of-string markers.) - size_t rounds = (*slen - 13) / 32; - - *slen -= rounds * 32; // 32 bytes consumed per round - *olen += rounds * 24; // 24 bytes produced per round - const __m256i lut_lo = _mm256_setr_epi8( 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A, @@ -34,36 +21,88 @@ dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) const __m256i mask_2F = _mm256_set1_epi8(0x2F); - do { - // Load input: - __m256i str = _mm256_loadu_si256((__m256i *) *s); + // Load input: + __m256i str = _mm256_loadu_si256((__m256i *) *s); - // See the SSSE3 decoder for an explanation of the algorithm. - const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F); - const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F); - const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles); - const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles); + // See the SSSE3 decoder for an explanation of the algorithm. + const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F); + const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F); + const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles); + const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles); - if (!_mm256_testz_si256(lo, hi)) { - break; - } + if (!_mm256_testz_si256(lo, hi)) { + return 0; + } + + const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F); + const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles)); - const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F); - const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles)); + // Now simply add the delta values to the input: + str = _mm256_add_epi8(str, roll); - // Now simply add the delta values to the input: - str = _mm256_add_epi8(str, roll); + // Reshuffle the input to packed 12-byte output format: + str = dec_reshuffle(str); - // Reshuffle the input to packed 12-byte output format: - str = dec_reshuffle(str); + // Store the output: + _mm256_storeu_si256((__m256i *) *o, str); - // Store the output: - _mm256_storeu_si256((__m256i *) *o, str); + *s += 32; + *o += 24; + *rounds -= 1; - *s += 32; - *o += 24; + return 1; +} + +static inline void +dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 45) { + return; + } + + // Process blocks of 32 bytes per round. Because 8 extra zero bytes are + // written after the output, ensure that there will be at least 13 + // bytes of input data left to cover the gap. (11 data bytes and up to + // two end-of-string markers.) + size_t rounds = (*slen - 13) / 32; + + *slen -= rounds * 32; // 32 bytes consumed per round + *olen += rounds * 24; // 24 bytes produced per round + + do { + if (rounds >= 8) { + if (dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 4) { + if (dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 2) { + if (dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds)) { + continue; + } + break; + } + dec_loop_avx2_inner(s, o, &rounds); + break; - } while (--rounds > 0); + } while (rounds > 0); // Adjust for any rounds that were skipped: *slen += rounds * 32;