Skip to content

Commit

Permalink
Fix repcode-related OSS-fuzz issues in block splitter (#2560)
Browse files Browse the repository at this point in the history
* Do not emit last partitions of blocks as RLE/uncompressed

* Fix repcode updates within block splitter

* Add an entropy-tables confirm function; redo ZSTD_confirmRepcodesAndEntropyTables() for a better function signature

* Add a repcode updater to block splitter, no longer need to force emit compressed blocks
  • Loading branch information
senhuang42 authored Mar 31, 2021
1 parent d334ad2 commit 255925c
Showing 1 changed file with 66 additions and 61 deletions.
127 changes: 66 additions & 61 deletions lib/compress/zstd_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -2483,7 +2483,7 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
void* dst, size_t dstCapacity,
size_t srcSize,
void* entropyWorkspace, size_t entropyWkspSize,
int bmi2, U32 const canEmitUncompressed)
int bmi2)
{
size_t const cSize = ZSTD_entropyCompressSeqStore_internal(
seqStorePtr, prevEntropy, nextEntropy, cctxParams,
Expand All @@ -2493,15 +2493,13 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
/* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block.
* Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block.
*/
if (canEmitUncompressed) {
if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity))
return 0; /* block not compressed */
FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed");

/* Check compressibility */
{ size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
if (cSize >= maxCSize) return 0; /* block not compressed */
}
if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity))
return 0; /* block not compressed */
FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed");

/* Check compressibility */
{ size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
if (cSize >= maxCSize) return 0; /* block not compressed */
}
DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);
return cSize;
Expand Down Expand Up @@ -2793,11 +2791,11 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore)
return nbSeqs < 4 && nbLits < 10;
}

static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc)
static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs)
{
ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
zc->blockState.prevCBlock = zc->blockState.nextCBlock;
zc->blockState.nextCBlock = tmp;
ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock;
bs->prevCBlock = bs->nextCBlock;
bs->nextCBlock = tmp;
}

/* Writes the block header */
Expand Down Expand Up @@ -3180,6 +3178,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
resultSeqStore->ofCode += startIdx;
}

/**
 * ZSTD_seqStore_updateRepcodes(): Starting from an array of initial repcodes and a seqStore,
 * construct the final repcodes at the conclusion of compressing the seqStore, stored in dstRep.
 *
 * Replays every sequence in the seqStore through ZSTD_updateRep() so that the
 * repcode history ends up exactly as it would after compressing those sequences.
 */
static void ZSTD_seqStore_updateRepcodes(U32 dstRep[ZSTD_REP_NUM],
                                   const U32 initialRep[ZSTD_REP_NUM],
                                   const seqStore_t* const seqStore, U32 const nbSeq) {
    repcodes_t reps;          /* working copy of the repcode history */
    U32 seqNb;
    ZSTD_memcpy(reps.rep, initialRep, sizeof(repcodes_t));
    for (seqNb = 0; seqNb < nbSeq; ++seqNb) {
        seqDef const currSeq = seqStore->sequencesStart[seqNb];
        assert(currSeq.offset >= 1); /* seqDef::offset == offCode+1, and ZSTD_updateRep() expects an offCode */
        reps = ZSTD_updateRep(reps.rep,
                              currSeq.offset - 1,         /* convert seqDef offset back to an offCode */
                              currSeq.litLength == 0);    /* ll0: repcode semantics shift when litLength == 0 */
    }
    ZSTD_memcpy(dstRep, reps.rep, sizeof(repcodes_t));
}

/* ZSTD_compressSeqStore_singleBlock():
* Compresses a seqStore into a block with a block header, into the buffer dst.
*
Expand All @@ -3188,7 +3206,7 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* seqStore,
void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
U32 lastBlock, U32 canEmitRLEorNoCompress) {
U32 lastBlock, U32 isPartition) {
const U32 rleMaxLength = 25;
BYTE* op = (BYTE*)dst;
const BYTE* ip = (const BYTE*)src;
Expand All @@ -3199,13 +3217,12 @@ static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* seqSt
op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize,
srcSize,
zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
zc->bmi2, canEmitRLEorNoCompress);
zc->bmi2);
FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!");

if (!zc->isFirstBlock &&
cSeqsSize < rleMaxLength &&
ZSTD_isRLE((BYTE const*)src, srcSize)&&
canEmitRLEorNoCompress) {
ZSTD_isRLE((BYTE const*)src, srcSize)) {
/* We don't want to emit our first block as a RLE even if it qualifies because
* doing so will cause the decoder (cli only) to throw a "should consume all input error."
* This is only an issue for zstd <= v1.4.3
Expand All @@ -3215,24 +3232,30 @@ static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* seqSt

if (zc->seqCollector.collectSequences) {
ZSTD_copyBlockSequences(zc);
ZSTD_confirmRepcodesAndEntropyTables(zc);
ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
return 0;
}

if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;

if (cSeqsSize == 0 && canEmitRLEorNoCompress) {
if (cSeqsSize == 0) {
cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock);
FORWARD_IF_ERROR(cSize, "Nocompress block failed");
DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize);
} else if (cSeqsSize == 1 && canEmitRLEorNoCompress) {
} else if (cSeqsSize == 1) {
cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock);
FORWARD_IF_ERROR(cSize, "RLE compress block failed");
DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize);
} else {
/* Error checking and repcodes update */
ZSTD_confirmRepcodesAndEntropyTables(zc);
if (isPartition) {
/* We manually update repcodes if we are currently compressing a partition. Otherwise,
* for non-split blocks, the repcodes are already correct as-is.
*/
ZSTD_seqStore_updateRepcodes(zc->blockState.nextCBlock->rep, zc->blockState.prevCBlock->rep,
seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart));
}
ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
writeBlockHeader(op, cSeqsSize, srcSize, lastBlock);
cSize = ZSTD_blockHeaderSize + cSeqsSize;
DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize);
Expand Down Expand Up @@ -3310,18 +3333,6 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
return splits.idx;
}

/* Return 1 if the first three sequences of seqstore/block use repcodes.
 * An offset field <= ZSTD_REP_MOVE encodes a repcode rather than a literal offset
 * (seqDef::offset stores offCode+1 for real offsets) — TODO confirm against the
 * offset-encoding convention in the surrounding file.
 * Only up to ZSTD_REP_NUM leading sequences are examined. */
static U32 ZSTD_seqStore_firstThreeContainRepcodes(const seqStore_t* const seqStore) {
    /* Clamp the scan to the number of sequences actually present */
    U32 const seqLimit = MIN((U32)(seqStore->sequences - seqStore->sequencesStart), ZSTD_REP_NUM);
    U32 seqIdx = 0;
    for (; seqIdx < seqLimit; ++seqIdx) {
        if (seqStore->sequencesStart[seqIdx].offset <= ZSTD_REP_MOVE) {
            return 1;
        }
    }
    return 0;
}

/* ZSTD_compressBlock_splitBlock():
* Attempts to split a given block into multiple blocks to improve compression ratio.
*
Expand All @@ -3338,59 +3349,53 @@ static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, s
size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
seqStore_t nextSeqStore;
seqStore_t currSeqStore;
U32 canEmitRLEorNoCompress = 1;
const size_t dstCapacityInitial = dstCapacity;

DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
(unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
(unsigned)zc->blockState.matchState.nextToUpdate);

if (numSplits == 0) {
size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, op, dstCapacity, ip, blockSize, lastBlock, 1);
size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
op, dstCapacity,
ip, blockSize,
lastBlock, 0 /* isPartition */);
FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!");
DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits");
assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
return cSizeSingleBlock;
}

ZSTD_deriveSeqStoreChunk(&currSeqStore, &zc->seqStore, 0, partitions[0]);
for (i = 0; i <= numSplits; ++i) {
size_t srcBytes;
size_t cSizeChunk;
U32 lastBlockActual;
U32 const lastPartition = (i == numSplits);
U32 lastBlockEntireSrc = 0;

srcBytes = ZSTD_countSeqStoreLiteralsBytes(&currSeqStore) + ZSTD_countSeqStoreMatchBytes(&currSeqStore);
lastBlockActual = lastBlock && (i == numSplits);
srcBytesTotal += srcBytes;
if (i == numSplits) {
if (lastPartition) {
/* This is the final partition, need to account for possible last literals */
srcBytes += blockSize - srcBytesTotal;
lastBlockEntireSrc = lastBlock;
} else {
ZSTD_deriveSeqStoreChunk(&nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]);
if (ZSTD_seqStore_firstThreeContainRepcodes(&nextSeqStore)) {
DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: Next block contains rep in first three seqs!");
canEmitRLEorNoCompress = 0;
}
}

cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, &currSeqStore, op, dstCapacity, ip, srcBytes, lastBlockActual, canEmitRLEorNoCompress);
cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, &currSeqStore,
op, dstCapacity,
ip, srcBytes,
lastBlockEntireSrc, 1 /* isPartition */);
DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(&currSeqStore, zc), cSizeChunk);
FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!");
ZSTD_memcpy(zc->blockState.nextCBlock->rep, zc->blockState.prevCBlock->rep, sizeof(U32)*ZSTD_REP_NUM);

ip += srcBytes;
op += cSizeChunk;
dstCapacity -= cSizeChunk;
cSize += cSizeChunk;
currSeqStore = nextSeqStore;
assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
}

if (cSize > ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize) {
/* If too large, recompress the original block to avoid any chance of a single block exceeding ZSTD_BLOCKSIZE_MAX */
cSize = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, (BYTE*)dst, dstCapacityInitial, (const BYTE*)src, blockSize, lastBlock, 1);
FORWARD_IF_ERROR(cSize, "Compressing single block from splitBlock_internal() fallback failed!");
DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: Compressed split block too large, recompressed");
}
assert(cSize <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
return cSize;
}

Expand Down Expand Up @@ -3445,7 +3450,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,

if (zc->seqCollector.collectSequences) {
ZSTD_copyBlockSequences(zc);
ZSTD_confirmRepcodesAndEntropyTables(zc);
ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
return 0;
}

Expand All @@ -3456,7 +3461,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
dst, dstCapacity,
srcSize,
zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
zc->bmi2, 1 /* Can emit uncompressed blocks */);
zc->bmi2);

if (zc->seqCollector.collectSequences) {
ZSTD_copyBlockSequences(zc);
Expand All @@ -3479,7 +3484,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,

out:
if (!ZSTD_isError(cSize) && cSize > 1) {
ZSTD_confirmRepcodesAndEntropyTables(zc);
ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
}
/* We check that dictionaries have offset codes available for the first
* block. After the first block, the offcode table might not have large
Expand Down Expand Up @@ -3532,7 +3537,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed");
if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) {
ZSTD_confirmRepcodesAndEntropyTables(zc);
ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
return cSize;
}
}
Expand Down Expand Up @@ -5665,7 +5670,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize,
blockSize,
cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
cctx->bmi2, 1 /* Can emit uncompressed blocks */);
cctx->bmi2);
FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed");
DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize);

Expand All @@ -5691,7 +5696,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
} else {
U32 cBlockHeader;
/* Error checking and repcodes update */
ZSTD_confirmRepcodesAndEntropyTables(cctx);
ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState);
if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;

Expand Down

0 comments on commit 255925c

Please sign in to comment.