From a80d268700d1a8496dd05de50833dfc6bdd750f0 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sat, 29 May 2021 18:21:10 +0100 Subject: [PATCH 1/5] Optimize ZSTD_decodeSequence by another x% --- lib/common/bitstream.h | 10 +++ lib/decompress/zstd_decompress_block.c | 95 ++++++++++++-------------- 2 files changed, 52 insertions(+), 53 deletions(-) diff --git a/lib/common/bitstream.h b/lib/common/bitstream.h index 2e5a933ad3d..1a494ee8567 100644 --- a/lib/common/bitstream.h +++ b/lib/common/bitstream.h @@ -332,7 +332,17 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c U32 const regMask = sizeof(bitContainer)*8 - 1; /* if start > regMask, bitstream is corrupted, and result is undefined */ assert(nbBits < BIT_MASK_SIZE); + /* x86 transform & ((1 << nbBits) - 1) to bzhi instruction, it is better + * than accessing memory. When bmi2 instruction is not present, we consider + * such cpus old (pre-Haswell, 2013) and their performance is not of that + * importance. + */ +#if defined(__x86_64__) || defined(_M_X86) + U64 const one = 1; + return (bitContainer >> (start & regMask)) & ((one << nbBits) - 1); +#else return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; +#endif } MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c index 349dcdc3336..6c40be4163e 100644 --- a/lib/decompress/zstd_decompress_block.c +++ b/lib/decompress/zstd_decompress_block.c @@ -905,20 +905,10 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS } FORCE_INLINE_TEMPLATE void -ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD) +ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits) { - ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state]; - U32 const nbBits = DInfo.nbBits; size_t const lowBits = BIT_readBits(bitD, nbBits); - DStatePtr->state = DInfo.nextState + lowBits; -} - -FORCE_INLINE_TEMPLATE void -ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo) -{ - U32 const nbBits = DInfo.nbBits; - size_t const lowBits = BIT_readBits(bitD, nbBits); - DStatePtr->state = DInfo.nextState + lowBits; + DStatePtr->state = nextState + lowBits; } /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum @@ -937,20 +927,36 @@ FORCE_INLINE_TEMPLATE seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) { seq_t seq; - ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state]; - ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state]; - ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state]; - U32 const llBase = llDInfo.baseValue; - U32 const mlBase = mlDInfo.baseValue; - U32 const ofBase = ofDInfo.baseValue; - BYTE const llBits = llDInfo.nbAdditionalBits; - BYTE const mlBits = mlDInfo.nbAdditionalBits; - BYTE const ofBits = ofDInfo.nbAdditionalBits; + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + U32 const ofBase = ofDInfo->baseValue; + BYTE const llBits = llDInfo->nbAdditionalBits; + BYTE const mlBits = mlDInfo->nbAdditionalBits; + BYTE const ofBits = ofDInfo->nbAdditionalBits; BYTE const totalBits = llBits+mlBits+ofBits; + U16 const llNext = llDInfo->nextState; + U16 const mlNext = mlDInfo->nextState; + U16 const ofNext = ofDInfo->nextState; + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; + /* + * As gcc has better branch and block analyzers, sometimes it is only + * valuable to mark likelyness for clang, it gives around 3-4% of + * performance. + */ + /* sequence */ { size_t offset; +#if defined(__clang__) + if (LIKELY(ofBits > 1)) { +#else if (ofBits > 1) { +#endif ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); assert(ofBits <= MaxOff); @@ -968,12 +974,10 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) seqState->prevOffset[1] = seqState->prevOffset[0]; seqState->prevOffset[0] = offset; } else { - U32 const ll0 = (llBase == 0); + U32 const ll0 = (llDInfo->baseValue == 0); if (LIKELY((ofBits == 0))) { - if (LIKELY(!ll0)) - offset = seqState->prevOffset[0]; - else { - offset = seqState->prevOffset[1]; + offset = seqState->prevOffset[ll0]; + if (UNLIKELY(ll0)) { seqState->prevOffset[1] = seqState->prevOffset[0]; seqState->prevOffset[0] = offset; } @@ -988,8 +992,11 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) seq.offset = offset; } - seq.matchLength = mlBase; +#if defined(__clang__) + if (UNLIKELY(mlBits > 0)) +#else if (mlBits > 0) +#endif seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) @@ -999,8 +1006,11 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); - seq.litLength = llBase; +#if defined(__clang__) + if (UNLIKELY(llBits > 0)) +#else if (llBits > 0) +#endif seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); if (MEM_32bits()) @@ -1009,31 +1019,10 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); - /* ANS state update - * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo(). - * clang-9.2.0 does 7% worse with ZSTD_updateFseState(). - * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the - * better option, so it is the default for other compilers. But, if you - * measure that it is worse, please put up a pull request. - */ - { -#if defined(__GNUC__) && !defined(__clang__) - const int kUseUpdateFseState = 1; -#else - const int kUseUpdateFseState = 0; -#endif - if (kUseUpdateFseState) { - ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ - ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ - } else { - ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */ - } - } + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ return seq; } From 444f4db9556d42f3b143130b32f6b9f51e6ea9d6 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sat, 29 May 2021 20:55:37 +0100 Subject: [PATCH 2/5] Move declaration of 1 to an inlined cast --- lib/common/bitstream.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/common/bitstream.h b/lib/common/bitstream.h index 1a494ee8567..48aad7f3ae0 100644 --- a/lib/common/bitstream.h +++ b/lib/common/bitstream.h @@ -338,8 +338,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c * importance. */ #if defined(__x86_64__) || defined(_M_X86) - U64 const one = 1; - return (bitContainer >> (start & regMask)) & ((one << nbBits) - 1); + return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1); #else return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; #endif From 6534c0000f73326f4a326dca9f0697f2b9d254c3 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Tue, 8 Jun 2021 20:45:57 +0100 Subject: [PATCH 3/5] Be C89 compliant and fix alignment for gcc11 --- lib/decompress/zstd_decompress_block.c | 177 +++++++++++++------------ 1 file changed, 89 insertions(+), 88 deletions(-) diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c index 6c40be4163e..40b2ae49221 100644 --- a/lib/decompress/zstd_decompress_block.c +++ b/lib/decompress/zstd_decompress_block.c @@ -932,97 +932,98 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; seq.matchLength = mlDInfo->baseValue; seq.litLength = llDInfo->baseValue; - U32 const ofBase = ofDInfo->baseValue; - BYTE const llBits = llDInfo->nbAdditionalBits; - BYTE const mlBits = mlDInfo->nbAdditionalBits; - BYTE const ofBits = ofDInfo->nbAdditionalBits; - BYTE const totalBits = llBits+mlBits+ofBits; - - U16 const llNext = llDInfo->nextState; - U16 const mlNext = mlDInfo->nextState; - U16 const ofNext = ofDInfo->nextState; - U32 const llnbBits = llDInfo->nbBits; - U32 const mlnbBits = mlDInfo->nbBits; - U32 const ofnbBits = ofDInfo->nbBits; - /* - * As gcc has better branch and block analyzers, sometimes it is only - * valuable to mark likelyness for clang, it gives around 3-4% of - * performance. - */ + { U32 const ofBase = ofDInfo->baseValue; + BYTE const llBits = llDInfo->nbAdditionalBits; + BYTE const mlBits = mlDInfo->nbAdditionalBits; + BYTE const ofBits = ofDInfo->nbAdditionalBits; + BYTE const totalBits = llBits+mlBits+ofBits; + + U16 const llNext = llDInfo->nextState; + U16 const mlNext = mlDInfo->nextState; + U16 const ofNext = ofDInfo->nextState; + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; + /* + * As gcc has better branch and block analyzers, sometimes it is only + * valuable to mark likelyness for clang, it gives around 3-4% of + * performance. + */ - /* sequence */ - { size_t offset; -#if defined(__clang__) - if (LIKELY(ofBits > 1)) { -#else - if (ofBits > 1) { -#endif - ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); - ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); - assert(ofBits <= MaxOff); - if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { - U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); - offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); - BIT_reloadDStream(&seqState->DStream); - if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); - assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ - } else { - offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); - } - seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; - } else { - U32 const ll0 = (llDInfo->baseValue == 0); - if (LIKELY((ofBits == 0))) { - offset = seqState->prevOffset[ll0]; - if (UNLIKELY(ll0)) { - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; + /* sequence */ + { size_t offset; + #if defined(__clang__) + if (LIKELY(ofBits > 1)) { + #else + if (ofBits > 1) { + #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); + assert(ofBits <= MaxOff); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { + U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); + if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); + assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); } + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; } else { - offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); - { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; - temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ - if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset = temp; - } } } - seq.offset = offset; - } - -#if defined(__clang__) - if (UNLIKELY(mlBits > 0)) -#else - if (mlBits > 0) -#endif - seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); - - if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) - BIT_reloadDStream(&seqState->DStream); - if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) - BIT_reloadDStream(&seqState->DStream); - /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ - ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); - -#if defined(__clang__) - if (UNLIKELY(llBits > 0)) -#else - if (llBits > 0) -#endif - seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); - - if (MEM_32bits()) - BIT_reloadDStream(&seqState->DStream); - - DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", - (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + U32 const ll0 = (llDInfo->baseValue == 0); + if (LIKELY((ofBits == 0))) { + offset = seqState->prevOffset[ll0]; + if (UNLIKELY(ll0)) { + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } } } + seq.offset = offset; + } - ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ + #if defined(__clang__) + if (UNLIKELY(mlBits > 0)) + #else + if (mlBits > 0) + #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) + BIT_reloadDStream(&seqState->DStream); + if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + + #if defined(__clang__) + if (UNLIKELY(llBits > 0)) + #else + if (llBits > 0) + #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) + BIT_reloadDStream(&seqState->DStream); + + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ + } return seq; } @@ -1155,7 +1156,7 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, __asm__("nop"); __asm__(".p2align 5"); __asm__("nop"); -# if __GNUC__ >= 9 +# if __GNUC__ >= 9 && __GNUC__ < 11 /* better for gcc-9 and gcc-10, worse for clang and gcc-8 */ __asm__(".p2align 3"); # else From 08a3ddbd28c362ef1f334c17894178d904436771 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Tue, 8 Jun 2021 20:54:21 +0100 Subject: [PATCH 4/5] Add comment for gcc-11 --- lib/decompress/zstd_decompress_block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c index 40b2ae49221..e5391d66200 100644 --- a/lib/decompress/zstd_decompress_block.c +++ b/lib/decompress/zstd_decompress_block.c @@ -1157,7 +1157,7 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, __asm__(".p2align 5"); __asm__("nop"); # if __GNUC__ >= 9 && __GNUC__ < 11 - /* better for gcc-9 and gcc-10, worse for clang and gcc-8 */ + /* better for gcc-9 and gcc-10, worse for clang and gcc-8, gcc-11 */ __asm__(".p2align 3"); # else __asm__(".p2align 4"); From 2c2c9e7dfdcb8eee5a2b8611647698559d657477 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Tue, 29 Jun 2021 09:06:47 +0100 Subject: [PATCH 5/5] Add possible improvements for gcc-11 --- lib/decompress/zstd_decompress_block.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c index e5391d66200..9acf2f746ac 100644 --- a/lib/decompress/zstd_decompress_block.c +++ b/lib/decompress/zstd_decompress_block.c @@ -977,10 +977,8 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) U32 const ll0 = (llDInfo->baseValue == 0); if (LIKELY((ofBits == 0))) { offset = seqState->prevOffset[ll0]; - if (UNLIKELY(ll0)) { - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; - } + seqState->prevOffset[1] = seqState->prevOffset[!ll0]; + seqState->prevOffset[0] = offset; } else { offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];