Add COVER to the zstd cli

commit: df8415c50275cd0a4f2ab2e57fc2d6a928de2995 [log] [tgz]
author: Nick Terrell <terrelln@fb.com> Sat Dec 31 21:08:24 2016 -0800
committer: Nick Terrell <terrelln@fb.com> Mon Jan 02 14:43:08 2017 -0800
tree: bec965274e14338de2083587d29adc2a417f41b9
parent: 9103ed6c8bf01531602d09efe6a0fa21c5012b54 [diff] [blame]
diff --git a/programs/dibio.c b/programs/dibio.c
index b95bab3..ba15d21 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c

@@ -42,6 +42,7 @@
 
 #define SAMPLESIZE_MAX (128 KB)
 #define MEMMULT 11    /* rough estimation : memory cost to analyze 1 byte of sample */
+#define COVER_MEMMULT 9    /* rough estimation : memory cost to analyze 1 byte of sample */
 static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
 
 #define NOISELENGTH 32
@@ -118,10 +119,36 @@
             fileSizes[n] = fileSize;
             fclose(f);
     }   }
+    DISPLAYLEVEL(2, "\r%79s\r", "");
     *bufferSizePtr = pos;
     return n;
 }
 
+#define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+static U32 DiB_rand(U32* src)
+{
+    static const U32 prime1 = 2654435761U;
+    static const U32 prime2 = 2246822519U;
+    U32 rand32 = *src;
+    rand32 *= prime1;
+    rand32 ^= prime2;
+    rand32  = DiB_rotl32(rand32, 13);
+    *src = rand32;
+    return rand32 >> 5;
+}
+
+static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
+  /* Initialize the pseudorandom number generator */
+  U32 seed = 0xFD2FB528;
+  unsigned i;
+  for (i = nbFiles - 1; i > 0; --i) {
+    unsigned const j = DiB_rand(&seed) % (i + 1);
+    const char* tmp = fileNamesTable[j];
+    fileNamesTable[j] = fileNamesTable[i];
+    fileNamesTable[i] = tmp;
+  }
+}
+
 
 /*-********************************************************
 *  Dictionary training functions
@@ -202,7 +229,8 @@
 
 int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                        const char** fileNamesTable, unsigned nbFiles,
-                       ZDICT_params_t params)
+                       ZDICT_params_t *params, COVER_params_t *coverParams,
+                       int optimizeCover)
 {
     void* const dictBuffer = malloc(maxDictSize);
     size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
@@ -213,8 +241,10 @@
     int result = 0;
 
     /* Checks */
+    if (params) g_displayLevel = params->notificationLevel;
+    else if (coverParams) g_displayLevel = coverParams->notificationLevel;
+    else EXM_THROW(13, "Neither dictionary algorith selected");   /* should not happen */
     if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
-    g_displayLevel = params.notificationLevel;
     if (g_tooLargeSamples) {
         DISPLAYLEVEL(2, "!  Warning : some samples are very large \n");
         DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small files or beginning of large files. \n");
@@ -233,12 +263,31 @@
         DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
 
     /* Load input buffer */
+    DISPLAYLEVEL(3, "Shuffling input files\n");
+    DiB_shuffle(fileNamesTable, nbFiles);
     nbFiles = DiB_loadFiles(srcBuffer, &benchedSize, fileSizes, fileNamesTable, nbFiles);
-    DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
 
-    {   size_t const dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
-                            srcBuffer, fileSizes, nbFiles,
-                            params);
+    {
+        size_t dictSize;
+        if (params) {
+            DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
+            dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
+                                                    srcBuffer, fileSizes, nbFiles,
+                                                    *params);
+        } else if (optimizeCover) {
+            dictSize = COVER_optimizeTrainFromBuffer(
+                dictBuffer, maxDictSize, srcBuffer, fileSizes, nbFiles,
+                coverParams);
+            if (!ZDICT_isError(dictSize)) {
+                DISPLAYLEVEL(2, "smoothing=%d\nkMin=%d\nkStep=%d\nkMax=%d\nd=%d\n",
+                             coverParams->smoothing, coverParams->kMin,
+                             coverParams->kStep, coverParams->kMax, coverParams->d);
+            }
+        } else {
+            dictSize = COVER_trainFromBuffer(dictBuffer, maxDictSize,
+                                             srcBuffer, fileSizes, nbFiles,
+                                             *coverParams);
+        }
         if (ZDICT_isError(dictSize)) {
             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
             result = 1;
commit	df8415c50275cd0a4f2ab2e57fc2d6a928de2995	[log] [tgz]
author	Nick Terrell <terrelln@fb.com>	Sat Dec 31 21:08:24 2016 -0800
committer	Nick Terrell <terrelln@fb.com>	Mon Jan 02 14:43:08 2017 -0800
tree	bec965274e14338de2083587d29adc2a417f41b9
parent	9103ed6c8bf01531602d09efe6a0fa21c5012b54 [diff] [blame]