Skip to content

Commit

Permalink
AVX2: enc: unroll inner loop
Browse files Browse the repository at this point in the history
  • Loading branch information
aklomp committed Nov 28, 2019
1 parent 0024d2b commit e2c6687
Showing 1 changed file with 66 additions and 22 deletions.
88 changes: 66 additions & 22 deletions lib/arch/avx2/enc_loop.c
Original file line number Diff line number Diff line change
@@ -1,3 +1,37 @@
static inline void
enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
{
// First load is done at s - 0 to not get a segfault:
__m256i src = _mm256_loadu_si256((__m256i *) *s);

// Shift by 4 bytes, as required by enc_reshuffle:
src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));

// Reshuffle, translate, store:
src = enc_reshuffle(src);
src = enc_translate(src);
_mm256_storeu_si256((__m256i *) *o, src);

// Subsequent loads will be done at s - 4, set pointer for next round:
*s += 20;
*o += 32;
}

static inline void
enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
{
// Load input:
__m256i src = _mm256_loadu_si256((__m256i *) *s);

// Reshuffle, translate, store:
src = enc_reshuffle(src);
src = enc_translate(src);
_mm256_storeu_si256((__m256i *) *o, src);

*s += 24;
*o += 32;
}

static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
Expand All @@ -14,30 +48,40 @@ enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
*slen -= rounds * 24; // 24 bytes consumed per round
*olen += rounds * 32; // 32 bytes produced per round

// First load is done at s - 0 to not get a segfault:
__m256i inputvector = _mm256_loadu_si256((__m256i *) *s);

// Subsequent loads will be done at s - 4, set pointer for next round:
*s += 20;

// Shift by 4 bytes, as required by enc_reshuffle:
inputvector = _mm256_permutevar8x32_epi32(inputvector, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));

for (;;) {

// Reshuffle, translate, store:
inputvector = enc_reshuffle(inputvector);
inputvector = enc_translate(inputvector);
_mm256_storeu_si256((__m256i *) *o, inputvector);
*o += 32;
// The first loop iteration requires special handling to ensure that
// the read, which is done at an offset, does not underflow the buffer:
enc_loop_avx2_inner_first(s, o);
rounds--;

if (--rounds == 0) {
break;
while (rounds > 0) {
if (rounds >= 8) {
enc_loop_avx2_inner(s, o);
enc_loop_avx2_inner(s, o);
enc_loop_avx2_inner(s, o);
enc_loop_avx2_inner(s, o);
enc_loop_avx2_inner(s, o);
enc_loop_avx2_inner(s, o);
enc_loop_avx2_inner(s, o);
enc_loop_avx2_inner(s, o);
rounds -= 8;
continue;
}

// Load for the next round:
inputvector = _mm256_loadu_si256((__m256i *) *s);
*s += 24;
if (rounds >= 4) {
enc_loop_avx2_inner(s, o);
enc_loop_avx2_inner(s, o);
enc_loop_avx2_inner(s, o);
enc_loop_avx2_inner(s, o);
rounds -= 4;
continue;
}
if (rounds >= 2) {
enc_loop_avx2_inner(s, o);
enc_loop_avx2_inner(s, o);
rounds -= 2;
continue;
}
enc_loop_avx2_inner(s, o);
break;
}

// Add the offset back:
Expand Down

0 comments on commit e2c6687

Please sign in to comment.