diff --git a/programs/dibio.c b/programs/dibio.c
index e7fb905ec0..04860dbbfa 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -309,7 +309,7 @@ static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t
 int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                        const char** fileNamesTable, int nbFiles, size_t chunkSize,
                        ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
-                       ZDICT_fastCover_params_t* fastCoverParams, int optimize)
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
 {
     fileStats fs;
     size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
@@ -341,6 +341,11 @@ int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
         /* Limit the size of the training data to 2GB */
         /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
         loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
+        if (memLimit != 0) {
+            DISPLAYLEVEL(2, "! Warning : setting manual memory limit for dictionary training data at %u MB \n",
+                            (unsigned)(memLimit / (1 MB)));
+            loadedSize = (size_t)MIN(loadedSize, memLimit);
+        }
         srcBuffer = malloc(loadedSize+NOISELENGTH);
         sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
     }
diff --git a/programs/dibio.h b/programs/dibio.h
index 03ec80e595..666c1e6618 100644
--- a/programs/dibio.h
+++ b/programs/dibio.h
@@ -34,6 +34,6 @@
 int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                        const char** fileNamesTable, int nbFiles, size_t chunkSize,
                        ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
-                       ZDICT_fastCover_params_t* fastCoverParams, int optimize);
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit);
 
 #endif
diff --git a/programs/zstd.1.md b/programs/zstd.1.md
index ef37fef322..e343ec0448 100644
--- a/programs/zstd.1.md
+++ b/programs/zstd.1.md
@@ -190,6 +190,10 @@
 the last one takes effect.
     This is also used during compression when using with --patch-from=. In this case,
     this parameter overrides that maximum size allowed for a dictionary. (128 MB).
+
+    Additionally, this can be used to limit memory for dictionary training. This parameter
+    overrides the default limit of 2 GB. zstd will load training samples up to the memory limit
+    and ignore the rest.
 * `--stream-size=#` :
     Sets the pledged source size of input coming from a stream. This value must be exact, as
     it will be included in the produced frame header. Incorrect stream sizes will cause an error.
@@ -329,6 +333,8 @@ Compression of small files similar to the sample set will be greatly improved.
     resulting in a _small_ compression ratio improvement for this level.
 * `-B#`:
     Split input files into blocks of size # (default: no split)
+* `-M#`, `--memory=#`:
+    Limit the amount of sample data loaded for training (default: 2 GB). See above for details.
 * `--dictID=#`:
     A dictionary ID is a locally unique ID that a decoder can use to verify it is using the
     right dictionary.
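
For reference, the clamping added to `DiB_trainFromFiles` only ever shrinks the amount of training data loaded: `loadedSize` is first capped by the memory budget, the total sample size, and the 2 GB `MAX_SAMPLES_SIZE` ceiling, and a nonzero `memLimit` then lowers it further. Below is a minimal standalone sketch of that arithmetic; the concrete sizes are hypothetical stand-ins for `maxMem`, `fs.totalSizeToLoad`, and the 2 GB cap, not values taken from the patch:

```c
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
    /* Hypothetical stand-ins: a 2 GB cap (MAX_SAMPLES_SIZE), an 8 GB
     * memory budget (maxMem), and 12 MB of samples (fs.totalSizeToLoad). */
    int64_t  const maxSamplesSize  = (int64_t)2 * 1024 * 1024 * 1024;
    int64_t  const maxMem          = (int64_t)8 * 1024 * 1024 * 1024;
    int64_t  const totalSizeToLoad = (int64_t)12 * 1024 * 1024;
    unsigned const memLimit        = 5 * 1024 * 1024;  /* --memory=5MB */

    size_t loadedSize = (size_t)MIN(MIN(maxMem, totalSizeToLoad), maxSamplesSize);
    if (memLimit != 0) {
        /* A user-provided limit can only reduce loadedSize, never raise it */
        loadedSize = MIN(loadedSize, (size_t)memLimit);
    }
    printf("loading %zu bytes of training data\n", loadedSize);  /* 5 MB here */
    return 0;
}
```
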
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index 4d1978c80c..bfe18c0c1b 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -1327,18 +1327,18 @@ int main(int argCount, const char* argv[])
                 int const optimize = !coverParams.k || !coverParams.d;
                 coverParams.nbThreads = (unsigned)nbWorkers;
                 coverParams.zParams = zParams;
-                operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize);
+                operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize, memLimit);
             } else if (dict == fastCover) {
                 int const optimize = !fastCoverParams.k || !fastCoverParams.d;
                 fastCoverParams.nbThreads = (unsigned)nbWorkers;
                 fastCoverParams.zParams = zParams;
-                operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize);
+                operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize, memLimit);
             } else {
                 ZDICT_legacy_params_t dictParams;
                 memset(&dictParams, 0, sizeof(dictParams));
                 dictParams.selectivityLevel = dictSelect;
                 dictParams.zParams = zParams;
-                operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0);
+                operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0, memLimit);
             }
 #else
             (void)dictCLevel; (void)dictSelect; (void)dictID; (void)maxDictSize; /* not used when ZSTD_NODICT set */
diff --git a/tests/playTests.sh b/tests/playTests.sh
index 3bc88b8f6f..11240898ea 100755
--- a/tests/playTests.sh
+++ b/tests/playTests.sh
@@ -1051,6 +1051,13 @@ then
 fi
 rm -f tmp* dictionary
 
+println "- Test --memory for dictionary compression"
+datagen -g12M -P90 > tmpCorpusHighCompress
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=10K && die "Dictionary training should fail : --memory too low (10K)"
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=5MB 2> zstTrainWithMemLimitStdErr
+cat zstTrainWithMemLimitStdErr | grep "setting manual memory limit for dictionary training data at 5 MB"
+cat zstTrainWithMemLimitStdErr | grep "Training samples set too large (12 MB); training on 5 MB only..."
+rm zstTrainWithMemLimitStdErr
 
 println "\n===> fastCover dictionary builder : advanced options "
 
 TESTFILE="$PRGDIR"/zstdcli.c
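
The new playTests.sh case exercises both sides of the limit: it generates a 12 MB compressible corpus with `datagen`, asserts that training with `--memory=10K` fails outright (the `die` fires only if zstd unexpectedly succeeds on that far-too-small sample budget), and then checks that `--memory=5MB` succeeds while emitting both the manual-limit warning and the message that only 5 MB of the 12 MB sample set is used for training.
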