From 495414bb3e235ea6a499e7b7c31348bc4c79a917 Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Sat, 23 Nov 2019 16:29:33 +0100
Subject: [PATCH] AVX2: dec: unroll inner loop

---
 lib/arch/avx2/dec_loop.c | 113 ++++++++++++++++++++++++++-------------
 1 file changed, 76 insertions(+), 37 deletions(-)

diff --git a/lib/arch/avx2/dec_loop.c b/lib/arch/avx2/dec_loop.c
index c344d8aa..f959fc4b 100644
--- a/lib/arch/avx2/dec_loop.c
+++ b/lib/arch/avx2/dec_loop.c
@@ -1,19 +1,6 @@
-static inline void
-dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+static inline int
+dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
 {
-	if (*slen < 45) {
-		return;
-	}
-
-	// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
-	// written after the output, ensure that there will be at least 13
-	// bytes of input data left to cover the gap. (11 data bytes and up to
-	// two end-of-string markers.)
-	size_t rounds = (*slen - 13) / 32;
-
-	*slen -= rounds * 32;	// 32 bytes consumed per round
-	*olen += rounds * 24;	// 24 bytes produced per round
-
 	const __m256i lut_lo = _mm256_setr_epi8(
 		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
 		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
@@ -34,36 +21,88 @@ dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 
 	const __m256i mask_2F = _mm256_set1_epi8(0x2F);
 
-	do {
-		// Load input:
-		__m256i str = _mm256_loadu_si256((__m256i *) *s);
+	// Load input:
+	__m256i str = _mm256_loadu_si256((__m256i *) *s);
 
-		// See the SSSE3 decoder for an explanation of the algorithm.
-		const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
-		const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
-		const __m256i hi         = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
-		const __m256i lo         = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
+	// See the SSSE3 decoder for an explanation of the algorithm.
+	const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
+	const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
+	const __m256i hi         = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
+	const __m256i lo         = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
 
-		if (!_mm256_testz_si256(lo, hi)) {
-			break;
-		}
+	if (!_mm256_testz_si256(lo, hi)) {
+		return 0;
+	}
+
+	const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
+	const __m256i roll  = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
 
-		const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
-		const __m256i roll  = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
+	// Now simply add the delta values to the input:
+	str = _mm256_add_epi8(str, roll);
 
-		// Now simply add the delta values to the input:
-		str = _mm256_add_epi8(str, roll);
+	// Reshuffle the input to packed 12-byte output format:
+	str = dec_reshuffle(str);
 
-		// Reshuffle the input to packed 12-byte output format:
-		str = dec_reshuffle(str);
+	// Store the output:
+	_mm256_storeu_si256((__m256i *) *o, str);
 
-		// Store the output:
-		_mm256_storeu_si256((__m256i *) *o, str);
+	*s += 32;
+	*o += 24;
+	*rounds -= 1;
 
-		*s += 32;
-		*o += 24;
+	return 1;
+}
+
+static inline void
+dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 45) {
+		return;
+	}
+
+	// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
+	// written after the output, ensure that there will be at least 13
+	// bytes of input data left to cover the gap. (11 data bytes and up to
+	// two end-of-string markers.)
+	size_t rounds = (*slen - 13) / 32;
+
+	*slen -= rounds * 32;	// 32 bytes consumed per round
+	*olen += rounds * 24;	// 24 bytes produced per round
+
+	do {
+		if (rounds >= 8) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		if (rounds >= 4) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		if (rounds >= 2) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		dec_loop_avx2_inner(s, o, &rounds);
+		break;
 
-	} while (--rounds > 0);
+	} while (rounds > 0);
 
 	// Adjust for any rounds that were skipped:
 	*slen += rounds * 32;