Add COVER to the zstd cli
diff --git a/programs/dibio.c b/programs/dibio.c
index b95bab3..ba15d21 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -42,6 +42,7 @@
#define SAMPLESIZE_MAX (128 KB)
#define MEMMULT 11 /* rough estimation : memory cost to analyze 1 byte of sample */
+#define COVER_MEMMULT 9 /* rough estimation : memory cost to analyze 1 byte of sample */
static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
#define NOISELENGTH 32
@@ -118,10 +119,36 @@
fileSizes[n] = fileSize;
fclose(f);
} }
+ DISPLAYLEVEL(2, "\r%79s\r", "");
*bufferSizePtr = pos;
return n;
}
+#define DiB_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))   /* left-rotate x by r bits within 32 bits; x should be an unsigned 32-bit value, 0 < r < 32 */
+static U32 DiB_rand(U32* src)   /* deterministic pseudo-random generator : advances the state *src and returns a value derived from it */
+{
+    static const U32 prime1 = 2654435761U;
+    static const U32 prime2 = 2246822519U;
+    U32 rand32 = *src;
+    rand32 *= prime1;
+    rand32 ^= prime2;
+    rand32  = DiB_rotl32(rand32, 13);
+    *src = rand32;         /* the un-shifted value is kept as the state for the next call */
+    return rand32 >> 5;    /* the 5 lowest bits are dropped from the returned value */
+}
+
+static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
+    /* In-place shuffle of the file name table; the seed is fixed, so a given nbFiles always yields the same permutation */
+    U32 seed = 0xFD2FB528;
+    unsigned i;   /* number of entries still to be placed; slot i-1 receives a random pick among [0, i-1] */
+    for (i = nbFiles; i > 1; --i) {   /* counting from nbFiles (not nbFiles-1) avoids unsigned wrap-around when nbFiles==0 */
+        unsigned const j = DiB_rand(&seed) % i;
+        const char* tmp = fileNamesTable[j];
+        fileNamesTable[j] = fileNamesTable[i - 1];
+        fileNamesTable[i - 1] = tmp;
+    }
+}
+
/*-********************************************************
* Dictionary training functions
@@ -202,7 +229,8 @@
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
const char** fileNamesTable, unsigned nbFiles,
- ZDICT_params_t params)
+ ZDICT_params_t *params, COVER_params_t *coverParams,
+ int optimizeCover)
{
void* const dictBuffer = malloc(maxDictSize);
size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
@@ -213,8 +241,10 @@
int result = 0;
/* Checks */
+ if (params) g_displayLevel = params->notificationLevel;
+ else if (coverParams) g_displayLevel = coverParams->notificationLevel;
+ else EXM_THROW(13, "Neither dictionary algorith selected"); /* should not happen */
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
- g_displayLevel = params.notificationLevel;
if (g_tooLargeSamples) {
DISPLAYLEVEL(2, "! Warning : some samples are very large \n");
DISPLAYLEVEL(2, "! Note that dictionary is only useful for small files or beginning of large files. \n");
@@ -233,12 +263,31 @@
DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
/* Load input buffer */
+ DISPLAYLEVEL(3, "Shuffling input files\n");
+ DiB_shuffle(fileNamesTable, nbFiles);
nbFiles = DiB_loadFiles(srcBuffer, &benchedSize, fileSizes, fileNamesTable, nbFiles);
- DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
- { size_t const dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
- srcBuffer, fileSizes, nbFiles,
- params);
+ {
+ size_t dictSize;
+ if (params) {
+ DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
+ dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
+ srcBuffer, fileSizes, nbFiles,
+ *params);
+ } else if (optimizeCover) {
+ dictSize = COVER_optimizeTrainFromBuffer(
+ dictBuffer, maxDictSize, srcBuffer, fileSizes, nbFiles,
+ coverParams);
+ if (!ZDICT_isError(dictSize)) {
+ DISPLAYLEVEL(2, "smoothing=%d\nkMin=%d\nkStep=%d\nkMax=%d\nd=%d\n",
+ coverParams->smoothing, coverParams->kMin,
+ coverParams->kStep, coverParams->kMax, coverParams->d);
+ }
+ } else {
+ dictSize = COVER_trainFromBuffer(dictBuffer, maxDictSize,
+ srcBuffer, fileSizes, nbFiles,
+ *coverParams);
+ }
if (ZDICT_isError(dictSize)) {
DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */
result = 1;
diff --git a/programs/dibio.h b/programs/dibio.h
index 6780d86..e61d004 100644
--- a/programs/dibio.h
+++ b/programs/dibio.h
@@ -32,7 +32,7 @@
*/
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
const char** fileNamesTable, unsigned nbFiles,
- ZDICT_params_t parameters);
-
+ ZDICT_params_t *params, COVER_params_t *coverParams,
+ int optimizeCover);
#endif
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index 978ffcf..f4d33d3 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -127,6 +127,8 @@
DISPLAY( "\n");
DISPLAY( "Dictionary builder :\n");
DISPLAY( "--train ## : create a dictionary from a training set of files \n");
+ DISPLAY( "--cover=k=#,d=# : use the cover algorithm with parameters k and d \n");
+ DISPLAY( "--optimize-cover[=steps=#,k=#,d=#] : optimize cover parameters with optional parameters\n");
DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
DISPLAY( "--maxdict ## : limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
DISPLAY( " -s# : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel);
@@ -192,6 +194,29 @@
}
+#ifndef ZSTD_NODICT
+/**
+ * parseCoverParameters() :
+ * zeroes *params, then fills it from a comma-separated list held in stringPtr (e.g. "smoothing=100,kmin=48,kstep=4,kmax=64,d=8")
+ * @return 1 when every entry used a known key and nothing is left after the last value, 0 otherwise
+ */
+static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t *params)
+{
+    memset(params, 0, sizeof(*params));
+    for (;; stringPtr++) {   /* the increment only runs after a ',' was found : it steps over that separator */
+        if (longCommandWArg(&stringPtr, "smoothing=")) params->smoothing = readU32FromChar(&stringPtr);
+        else if (longCommandWArg(&stringPtr, "k=") || longCommandWArg(&stringPtr, "kMin=") || longCommandWArg(&stringPtr, "kmin=")) params->kMin = readU32FromChar(&stringPtr);
+        else if (longCommandWArg(&stringPtr, "kStep=") || longCommandWArg(&stringPtr, "kstep=")) params->kStep = readU32FromChar(&stringPtr);
+        else if (longCommandWArg(&stringPtr, "kMax=") || longCommandWArg(&stringPtr, "kmax=")) params->kMax = readU32FromChar(&stringPtr);
+        else if (longCommandWArg(&stringPtr, "d=")) params->d = readU32FromChar(&stringPtr);
+        else return 0;   /* none of the known keys matched */
+        if (stringPtr[0] != ',') break;   /* no separator after the value : end of the list */
+    }
+    if (stringPtr[0] != 0) return 0;   /* unexpected characters remain after the last value */
+    DISPLAYLEVEL(4, "smoothing=%d\nkMin=%d\nkStep=%d\nkMax=%d\nd=%d\n", params->smoothing, params->kMin, params->kStep, params->kMax, params->d);
+    return 1;
+}
+#endif
/** parseCompressionParameters() :
* reads compression parameters from *stringPtr (e.g. "--zstd=wlog=23,clog=23,hlog=22,slog=6,slen=3,tlen=48,strat=6") into *params
* @return 1 means that compression parameters were correct
@@ -254,6 +279,10 @@
char* fileNamesBuf = NULL;
unsigned fileNamesNb;
#endif
+#ifndef ZSTD_NODICT
+ COVER_params_t coverParams;
+ int cover = 0;
+#endif
/* init */
(void)recursive; (void)cLevelLast; /* not used when ZSTD_NOBENCH set */
@@ -318,6 +347,20 @@
if (!strcmp(argument, "--rm")) { FIO_setRemoveSrcFile(1); continue; }
/* long commands with arguments */
+#ifndef ZSTD_NODICT
+ if (longCommandWArg(&argument, "--cover=")) {
+ cover=1; if (!parseCoverParameters(argument, &coverParams)) CLEAN_RETURN(badusage(programName));
+ continue;
+ }
+ if (longCommandWArg(&argument, "--optimize-cover")) {
+ cover=2;
+ /* Allow optional arguments following an = */
+ if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); }
+ else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
+ else if (!parseCoverParameters(argument, &coverParams)) { CLEAN_RETURN(badusage(programName)); }
+ continue;
+ }
+#endif
if (longCommandWArg(&argument, "--memlimit=")) { memLimit = readU32FromChar(&argument); continue; }
if (longCommandWArg(&argument, "--memory=")) { memLimit = readU32FromChar(&argument); continue; }
if (longCommandWArg(&argument, "--memlimit-decompress=")) { memLimit = readU32FromChar(&argument); continue; }
@@ -520,13 +563,20 @@
/* Check if dictionary builder is selected */
if (operation==zom_train) {
#ifndef ZSTD_NODICT
- ZDICT_params_t dictParams;
- memset(&dictParams, 0, sizeof(dictParams));
- dictParams.compressionLevel = dictCLevel;
- dictParams.selectivityLevel = dictSelect;
- dictParams.notificationLevel = displayLevel;
- dictParams.dictID = dictID;
- DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, dictParams);
+ if (cover) {
+ coverParams.compressionLevel = dictCLevel;
+ coverParams.notificationLevel = displayLevel;
+ coverParams.dictID = dictID;
+ DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1);
+ } else {
+ ZDICT_params_t dictParams;
+ memset(&dictParams, 0, sizeof(dictParams));
+ dictParams.compressionLevel = dictCLevel;
+ dictParams.selectivityLevel = dictSelect;
+ dictParams.notificationLevel = displayLevel;
+ dictParams.dictID = dictID;
+ DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
+ }
#endif
goto _end;
}