Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Proactively skip huffman compression based on sampling where non-comp… #2717

Merged
merged 1 commit into from
Jul 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions lib/common/huf.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,12 +206,13 @@ typedef enum {
* Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
* If it uses hufTable it does not modify hufTable or repeat.
* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
* If preferRepeat then the old table will always be used if valid. */
* If preferRepeat then the old table will always be used if valid.
* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned tableLog,
void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor : api design :

as we get more and more parameters in the function signature,
it becomes more and more difficult to understand what this 1 or this 0 means.
For clarity, we end up commenting them, so that the reader can track what each value means.
... , 1 /* suspected uncompressible */);.

Another possibility is to use enum.
..., HUF_suspectedUncompressible);
In such a context, it's essentially a compiler-checked comment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After some thought, added comments rather than an enum; this is really just a stand-in for a boolean value so I think the argument name is sufficient to convey the meaning as far as the function definitions are concerned..


/** HUF_buildCTable_wksp() :
* Same as HUF_buildCTable(), but using externally allocated scratch buffer.
Expand Down Expand Up @@ -311,12 +312,13 @@ size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, si
* Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
* If it uses hufTable it does not modify hufTable or repeat.
* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
* If preferRepeat then the old table will always be used if valid. */
* If preferRepeat then the old table will always be used if valid.
* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned tableLog,
void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);

size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */
#ifndef HUF_FORCE_DECOMPRESS_X1
Expand Down
34 changes: 27 additions & 7 deletions lib/compress/huf_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,9 @@ typedef struct {
} wksps;
} HUF_compress_tables_t;

#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */

/* HUF_compress_internal() :
* `workSpace_align4` must be aligned on 4-bytes boundaries,
* and occupies the same space as a table of HUF_WORKSPACE_SIZE_U32 unsigned */
Expand All @@ -768,7 +771,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
HUF_nbStreams_e nbStreams,
void* workSpace_align4, size_t wkspSize,
HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
const int bmi2)
const int bmi2, unsigned suspectUncompressible)
{
HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace_align4;
BYTE* const ostart = (BYTE*)dst;
Expand All @@ -795,6 +798,21 @@ HUF_compress_internal (void* dst, size_t dstSize,
nbStreams, oldHufTable, bmi2);
}

/* If uncompressible data is suspected, do a smaller sampling first */
DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2);
if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
size_t largestTotal = 0;
{ unsigned maxSymbolValueBegin = maxSymbolValue;
CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
largestTotal += largestBegin;
}
{ unsigned maxSymbolValueEnd = maxSymbolValue;
CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
largestTotal += largestEnd;
}
if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0; /* heuristic : probably not compressible enough */
}

/* Scan input and build symbol stats */
{ CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace_align4, wkspSize) );
if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */
Expand Down Expand Up @@ -860,19 +878,20 @@ size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_singleStream,
workSpace, wkspSize,
NULL, NULL, 0, 0 /*bmi2*/);
NULL, NULL, 0, 0 /*bmi2*/, 0);
}

size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned huffLog,
void* workSpace, size_t wkspSize,
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat,
int bmi2, unsigned suspectUncompressible)
{
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_singleStream,
workSpace, wkspSize, hufTable,
repeat, preferRepeat, bmi2);
repeat, preferRepeat, bmi2, suspectUncompressible);
}

/* HUF_compress4X_repeat():
Expand All @@ -886,22 +905,23 @@ size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_fourStreams,
workSpace, wkspSize,
NULL, NULL, 0, 0 /*bmi2*/);
NULL, NULL, 0, 0 /*bmi2*/, 0);
}

/* HUF_compress4X_repeat():
* compress input using 4 streams.
* consider skipping quickly
* re-use an existing huffman compression table */
size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned huffLog,
void* workSpace, size_t wkspSize,
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible)
{
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_fourStreams,
workSpace, wkspSize,
hufTable, repeat, preferRepeat, bmi2);
hufTable, repeat, preferRepeat, bmi2, suspectUncompressible);
}

#ifndef ZSTD_NO_UNUSED_FUNCTIONS
Expand Down
7 changes: 6 additions & 1 deletion lib/compress/zstd_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -2557,6 +2557,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
* compresses both literals and sequences
* Returns compressed size of block, or a zstd error.
*/
#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20
MEM_STATIC size_t
ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
const ZSTD_entropyCTables_t* prevEntropy,
Expand Down Expand Up @@ -2591,6 +2592,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,

/* Compress literals */
{ const BYTE* const literals = seqStorePtr->litStart;
size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart;
size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart;
/* Base suspicion of uncompressibility on ratio of literals to sequences */
unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
size_t const litSize = (size_t)(seqStorePtr->lit - literals);
size_t const cSize = ZSTD_compressLiterals(
&prevEntropy->huf, &nextEntropy->huf,
Expand All @@ -2599,7 +2604,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
op, dstCapacity,
literals, litSize,
entropyWorkspace, entropyWkspSize,
bmi2);
bmi2, suspectUncompressible);
FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
assert(cSize <= dstCapacity);
op += cSize;
Expand Down
7 changes: 4 additions & 3 deletions lib/compress/zstd_compress_literals.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
void* entropyWorkspace, size_t entropyWorkspaceSize,
const int bmi2)
const int bmi2,
unsigned suspectUncompressible)
{
size_t const minGain = ZSTD_minGain(srcSize, strategy);
size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
Expand Down Expand Up @@ -105,11 +106,11 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
HUF_compress1X_repeat(
ostart+lhSize, dstCapacity-lhSize, src, srcSize,
HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
(HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) :
(HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) :
HUF_compress4X_repeat(
ostart+lhSize, dstCapacity-lhSize, src, srcSize,
HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
(HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2);
(HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible);
if (repeat != HUF_repeat_none) {
/* reused the existing table */
DEBUGLOG(5, "Reusing previous huffman table");
Expand Down
4 changes: 3 additions & 1 deletion lib/compress/zstd_compress_literals.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src,

size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);

/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
ZSTD_hufCTables_t* nextHuf,
ZSTD_strategy strategy, int disableLiteralCompression,
void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
void* entropyWorkspace, size_t entropyWorkspaceSize,
const int bmi2);
const int bmi2,
unsigned suspectUncompressible);

#endif /* ZSTD_COMPRESS_LITERALS_H */