Merge fastCover into DictBuilder (#1274)
* Minor fix
* Run non-optimized FASTCOVER 5 times in benchmark
* Merge fastCover into dictBuilder
* Fix mixed declaration issue
* Add fastcover to symbol.c
* Add fastCover.c and cover.h to build
* Change fastCover.c to fastcover.c
* Update benchmark to run FASTCOVER in dictBuilder
* Undo splitting fastcover_param into cover_param and f
* Remove convert param functions
* Assign f to parameter
* Add zdict.h to Makefile in lib
* Add cover.h to BUCK
* Cast 1 to U64 before shifting
* Remove trimming of zero freq head and tail in selectSegment and rebenchmark
* Remove f as a separate parameter of tryParam
* Read 8 bytes when d is 6
* Add trimming off zero frequency head and tail
* Use best functions from COVER and remove trimming part (which led to a worse compression ratio after previous bugs were fixed)
* Add finalize= argument to FASTCOVER to specify percentage of training samples passed to ZDICT_finalizeDictionary
* Change nbDmer to always read 8 bytes even when d=6
* Add skip=# argument to allow skipping dmers in computeFrequency in FASTCOVER
* Update comments and benchmarking result
* Change default method of ZDICT_trainFromBuffer to ZDICT_optimizeTrainFromBuffer_fastCover
* Add dictType enum and fix bug about passing zParam when converting to coverParam
* Combine finalize and skip into a single parameter
* Update acceleration parameters and benchmark on 3 sample sets
* Change default splitPoint of FASTCOVER to 0.75 and benchmark first 3 sample sets
* Initialize variables outside of for loop in benchmark.c
* Update benchmark result for hg-manifest
* Remove cover.h from install-includes
* Add explanation of f
* Set default compression level for trainFromBuffer to 3
* Add assertion of fastCoverParams in DiB_trainFromFiles
* Add checkTotalCompressedSize function + some minor fixes
* Add test for multithreading fastCover
* Initialize segmentFreqs in every FASTCOVER_selectSegment and move mutex_unlock to end of COVER_best_finish
* Free segmentFreqs
* Initialize segmentFreqs before calling FASTCOVER_buildDictionary instead of in FASTCOVER_selectSegment
* Add FASTCOVER_MEMMULT
* Minor fix
* Update benchmarking result
diff --git a/programs/dibio.c b/programs/dibio.c
index fbb8aa6..4b68be6 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -44,6 +44,7 @@
#define SAMPLESIZE_MAX (128 KB)
#define MEMMULT 11 /* rough estimation : memory cost to analyze 1 byte of sample */
#define COVER_MEMMULT 9 /* rough estimation : memory cost to analyze 1 byte of sample */
+#define FASTCOVER_MEMMULT 1 /* rough estimation : memory cost to analyze 1 byte of sample */
static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
#define NOISELENGTH 32
@@ -271,16 +272,19 @@
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
- ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
- int optimizeCover)
+ ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
+ ZDICT_fastCover_params_t* fastCoverParams, int optimize)
{
unsigned const displayLevel = params ? params->zParams.notificationLevel :
coverParams ? coverParams->zParams.notificationLevel :
+ fastCoverParams ? fastCoverParams->zParams.notificationLevel :
0; /* should never happen */
void* const dictBuffer = malloc(maxDictSize);
fileStats const fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
- size_t const memMult = params ? MEMMULT : COVER_MEMMULT;
+ size_t const memMult = params ? MEMMULT :
+ coverParams ? COVER_MEMMULT:
+ FASTCOVER_MEMMULT;
size_t const maxMem = DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
@@ -312,6 +316,7 @@
/* Load input buffer */
DISPLAYLEVEL(3, "Shuffling input files\n");
DiB_shuffle(fileNamesTable, nbFiles);
+
DiB_loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
{ size_t dictSize;
@@ -320,19 +325,36 @@
dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize,
srcBuffer, sampleSizes, fs.nbSamples,
*params);
- } else if (optimizeCover) {
- assert(coverParams != NULL);
- dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
- srcBuffer, sampleSizes, fs.nbSamples,
- coverParams);
- if (!ZDICT_isError(dictSize)) {
- unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
- DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d, coverParams->steps, splitPercentage);
+ } else if (coverParams) {
+ if (optimize) {
+ dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
+ srcBuffer, sampleSizes, fs.nbSamples,
+ coverParams);
+ if (!ZDICT_isError(dictSize)) {
+ unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
+ DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
+ coverParams->steps, splitPercentage);
+ }
+ } else {
+ dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
+ sampleSizes, fs.nbSamples, *coverParams);
}
} else {
- assert(coverParams != NULL);
- dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
- sampleSizes, fs.nbSamples, *coverParams);
+ assert(fastCoverParams != NULL);
+ if (optimize) {
+ dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
+ srcBuffer, sampleSizes, fs.nbSamples,
+ fastCoverParams);
+ if (!ZDICT_isError(dictSize)) {
+ unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
+ DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
+ fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
+ fastCoverParams->accel);
+ }
+ } else {
+ dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
+ sampleSizes, fs.nbSamples, *fastCoverParams);
+ }
}
if (ZDICT_isError(dictSize)) {
DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */