Skip to content

Commit

Permalink
AVX2: dec: unroll inner loop
Browse files Browse the repository at this point in the history
  • Loading branch information
aklomp committed Nov 29, 2019
1 parent dd8dc9b commit 495414b
Showing 1 changed file with 76 additions and 37 deletions.
113 changes: 76 additions & 37 deletions lib/arch/avx2/dec_loop.c
Original file line number Diff line number Diff line change
@@ -1,19 +1,6 @@
static inline void
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
static inline int
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
if (*slen < 45) {
return;
}

// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
// written after the output, ensure that there will be at least 13
// bytes of input data left to cover the gap. (11 data bytes and up to
// two end-of-string markers.)
size_t rounds = (*slen - 13) / 32;

*slen -= rounds * 32; // 32 bytes consumed per round
*olen += rounds * 24; // 24 bytes produced per round

const __m256i lut_lo = _mm256_setr_epi8(
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
Expand All @@ -34,36 +21,88 @@ dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)

const __m256i mask_2F = _mm256_set1_epi8(0x2F);

do {
// Load input:
__m256i str = _mm256_loadu_si256((__m256i *) *s);
// Load input:
__m256i str = _mm256_loadu_si256((__m256i *) *s);

// See the SSSE3 decoder for an explanation of the algorithm.
const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
// See the SSSE3 decoder for an explanation of the algorithm.
const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);

if (!_mm256_testz_si256(lo, hi)) {
break;
}
if (!_mm256_testz_si256(lo, hi)) {
return 0;
}

const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));

const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
// Now simply add the delta values to the input:
str = _mm256_add_epi8(str, roll);

// Now simply add the delta values to the input:
str = _mm256_add_epi8(str, roll);
// Reshuffle the input to packed 12-byte output format:
str = dec_reshuffle(str);

// Reshuffle the input to packed 12-byte output format:
str = dec_reshuffle(str);
// Store the output:
_mm256_storeu_si256((__m256i *) *o, str);

// Store the output:
_mm256_storeu_si256((__m256i *) *o, str);
*s += 32;
*o += 24;
*rounds -= 1;

*s += 32;
*o += 24;
return 1;
}

static inline void
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
if (*slen < 45) {
return;
}

// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
// written after the output, ensure that there will be at least 13
// bytes of input data left to cover the gap. (11 data bytes and up to
// two end-of-string markers.)
size_t rounds = (*slen - 13) / 32;

*slen -= rounds * 32; // 32 bytes consumed per round
*olen += rounds * 24; // 24 bytes produced per round

do {
if (rounds >= 8) {
if (dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds)) {
continue;
}
break;
}
if (rounds >= 4) {
if (dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds)) {
continue;
}
break;
}
if (rounds >= 2) {
if (dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds)) {
continue;
}
break;
}
dec_loop_avx2_inner(s, o, &rounds);
break;

} while (--rounds > 0);
} while (rounds > 0);

// Adjust for any rounds that were skipped:
*slen += rounds * 32;
Expand Down

0 comments on commit 495414b

Please sign in to comment.