
Allow user to specify memory limit for dictionary training #2925

Merged: 1 commit, Dec 15, 2021
7 changes: 6 additions & 1 deletion programs/dibio.c
@@ -309,7 +309,7 @@ static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t
int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
const char** fileNamesTable, int nbFiles, size_t chunkSize,
ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
-ZDICT_fastCover_params_t* fastCoverParams, int optimize)
+ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
{
fileStats fs;
size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
@@ -341,6 +341,11 @@ int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
/* Limit the size of the training data to 2GB */
/* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
+if (memLimit != 0) {
+    DISPLAYLEVEL(2, "! Warning : setting manual memory limit for dictionary training data at %u MB \n",
+        (unsigned)(memLimit / (1 MB)));
+    loadedSize = (size_t)MIN(loadedSize, memLimit);
+}
srcBuffer = malloc(loadedSize+NOISELENGTH);
sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
}
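For context, the new block only ever tightens the existing cap: `loadedSize` is first clamped to the 2 GB default, and the user-supplied `memLimit` is applied on top when non-zero. A minimal standalone sketch of that clamping order, with hypothetical sizes (illustrative only, not part of the patch):

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MB *(1 << 20)   /* zstd-style size suffix macro, as used in the diff above */

int main(void)
{
    /* Hypothetical inputs: a 12 MB corpus, the 2 GB default cap,
     * and a user-supplied --memory=5MB limit. */
    unsigned long long const maxMem = 2048ULL MB;
    unsigned long long const totalSizeToLoad = 12ULL MB;
    unsigned const memLimit = 5 MB;

    unsigned long long loadedSize = MIN(maxMem, totalSizeToLoad);
    if (memLimit != 0)
        loadedSize = MIN(loadedSize, (unsigned long long)memLimit);

    /* The tightest bound wins: prints "loadedSize = 5 MB". */
    printf("loadedSize = %llu MB\n", loadedSize / (1ULL MB));
    return 0;
}
```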
2 changes: 1 addition & 1 deletion programs/dibio.h
@@ -34,6 +34,6 @@
int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
const char** fileNamesTable, int nbFiles, size_t chunkSize,
ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
-ZDICT_fastCover_params_t* fastCoverParams, int optimize);
+ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit);

#endif
6 changes: 6 additions & 0 deletions programs/zstd.1.md
@@ -190,6 +190,10 @@ the last one takes effect.

This is also used during compression when used with `--patch-from=`. In this case,
this parameter overrides the maximum size allowed for a dictionary (128 MB).

+Additionally, this can be used to limit memory for dictionary training. This parameter
+overrides the default limit of 2 GB. zstd will load training samples up to the memory limit
+and ignore the rest.
* `--stream-size=#` :
Sets the pledged source size of input coming from a stream. This value must be exact, as it
will be included in the produced frame header. Incorrect stream sizes will cause an error.
@@ -329,6 +333,8 @@ Compression of small files similar to the sample set will be greatly improved.
resulting in a _small_ compression ratio improvement for this level.
* `-B#`:
Split input files into blocks of size # (default: no split)
+* `-M#`, `--memory=#`:
+    Limit the amount of sample data loaded for training (default: 2 GB). See above for details.
* `--dictID=#`:
A dictionary ID is a locally unique ID
that a decoder can use to verify it is using the right dictionary.
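The paragraphs added above document the same `-M#`/`--memory=#` flag in both the compression and dictionary-builder sections. A hypothetical training invocation exercising it (file names are illustrative):

```sh
# Train a dictionary from 2 KB blocks of the sample files, loading at
# most 5 MB of sample data instead of the 2 GB default.
zstd --train -B2K samples/* -o dict --memory=5MB
```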
6 changes: 3 additions & 3 deletions programs/zstdcli.c
@@ -1327,18 +1327,18 @@ int main(int argCount, const char* argv[])
int const optimize = !coverParams.k || !coverParams.d;
coverParams.nbThreads = (unsigned)nbWorkers;
coverParams.zParams = zParams;
-operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize);
+operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize, memLimit);
} else if (dict == fastCover) {
int const optimize = !fastCoverParams.k || !fastCoverParams.d;
fastCoverParams.nbThreads = (unsigned)nbWorkers;
fastCoverParams.zParams = zParams;
-operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize);
+operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize, memLimit);
} else {
ZDICT_legacy_params_t dictParams;
memset(&dictParams, 0, sizeof(dictParams));
dictParams.selectivityLevel = dictSelect;
dictParams.zParams = zParams;
-operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0);
+operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0, memLimit);
}
#else
(void)dictCLevel; (void)dictSelect; (void)dictID; (void)maxDictSize; /* not used when ZSTD_NODICT set */
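The `memLimit` value forwarded in the three call sites above is parsed from the command line elsewhere in `zstdcli.c`; that parsing is not part of this diff. As a self-contained sketch of how a size argument such as `--memory=5MB` could be decoded (helper names are hypothetical, not zstd's actual ones):

```c
#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical helper: decode a decimal size with an optional K/M/G suffix. */
static unsigned parseSize(const char* s)
{
    unsigned v = 0;
    while (isdigit((unsigned char)*s)) { v = v * 10 + (unsigned)(*s - '0'); s++; }
    if (*s == 'K' || *s == 'k') v <<= 10;
    else if (*s == 'M' || *s == 'm') v <<= 20;
    else if (*s == 'G' || *s == 'g') v <<= 30;
    return v;
}

int main(void)
{
    const char* arg = "--memory=5MB";   /* as passed on the command line */
    const char* prefix = "--memory=";
    unsigned memLimit = 0;

    if (strncmp(arg, prefix, strlen(prefix)) == 0)
        memLimit = parseSize(arg + strlen(prefix));

    printf("memLimit = %u bytes\n", memLimit);   /* prints 5242880 */
    return 0;
}
```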
7 changes: 7 additions & 0 deletions tests/playTests.sh
@@ -1029,6 +1029,13 @@
fi
rm -f tmp* dictionary

+println "- Test --memory for dictionary compression"
+datagen -g12M -P90 > tmpCorpusHighCompress
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=10K && die "Dictionary training should fail : --memory too low (10K)"
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=5MB 2> zstTrainWithMemLimitStdErr
+cat zstTrainWithMemLimitStdErr | grep "setting manual memory limit for dictionary training data at 5 MB"
+cat zstTrainWithMemLimitStdErr | grep "Training samples set too large (12 MB); training on 5 MB only..."
+rm zstTrainWithMemLimitStdErr

println "\n===> fastCover dictionary builder : advanced options "
TESTFILE="$PRGDIR"/zstdcli.c