Added : ability to manually select the dictionary ID of a newly created dictionary

commit: 290aaa75212334b18b0ef36c679d4d4a0704acca [log] [tgz]
author: Yann Collet <yann.collet.73@gmail.com> Mon May 30 21:18:52 2016 +0200
committer: Yann Collet <yann.collet.73@gmail.com> Mon May 30 21:18:52 2016 +0200
tree: 3f94c7f039dc9b9a5daa4e61c650abb4e998bed9
parent: 815580a53864a17d5f486db6b9e4565ce59f5fa4 [diff]
diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c
index 1d37322..be141ce 100644
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c

@@ -819,10 +819,10 @@
                             ZDICT_params_t params)
 {
     U32 const dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
-    dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
+    dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
     unsigned selectivity = params.selectivityLevel;
     unsigned compressionLevel = params.compressionLevel;
-    size_t targetDictSize = maxDictSize;
+    size_t const targetDictSize = maxDictSize;
     size_t sBuffSize;
     size_t dictSize = 0;
 
@@ -865,17 +865,16 @@
     /* create dictionary */
     {   U32 dictContentSize = ZDICT_dictSize(dictList);
         size_t hSize;
-        BYTE* ptr;
-        U32 u;
 
         /* build dict content */
-        ptr = (BYTE*)dictBuffer + maxDictSize;
-        for (u=1; u<dictList->pos; u++) {
-            U32 l = dictList[u].length;
-            ptr -= l;
-            if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC);   /* should not happen */
-            memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
-        }
+        {   U32 u;
+            BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;
+            for (u=1; u<dictList->pos; u++) {
+                U32 l = dictList[u].length;
+                ptr -= l;
+                if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC);   /* should not happen */
+                memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
+        }   }
 
         /* fast mode dict content */
         if (selectivity==1) {  /* note could also be used to complete a dictionary, but not necessarily better */
@@ -888,7 +887,8 @@
        /* dictionary header */
         MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
         {   U64 const randomID = XXH64((char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize, 0);
-            MEM_writeLE32((char*)dictBuffer+4, (U32)(randomID>>11));
+            U32 const dictID = params.dictID ? params.dictID : (U32)(randomID>>11);
+            MEM_writeLE32((char*)dictBuffer+4, dictID);
         }
         hSize = 8;
 

diff --git a/lib/dictBuilder/zdict_static.h b/lib/dictBuilder/zdict_static.h
index e5f909a..e34e6c0 100644
--- a/lib/dictBuilder/zdict_static.h
+++ b/lib/dictBuilder/zdict_static.h

@@ -54,7 +54,8 @@
     unsigned selectivityLevel;   /* 0 means default; larger => bigger selection => larger dictionary */
     unsigned compressionLevel;   /* 0 means default; target a specific zstd compression level */
     unsigned notificationLevel;  /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
-    unsigned reserved[3];        /* space for future parameters */
+    unsigned dictID;             /* 0 means auto mode (32-bit random value); other : force dictID value */
+    unsigned reserved[2];        /* space for future parameters */
 } ZDICT_params_t;
 
 
@@ -65,7 +66,7 @@
     Same as ZDICT_trainFromBuffer() with control over more parameters.
     `parameters` is optional and can be provided with values set to 0 to mean "default".
     @return : size of dictionary stored into `dictBuffer` (<= `dictBufferSize`)
-              or an error code, which can be tested by DiB_isError().
+              or an error code, which can be tested by ZDICT_isError().
     note : ZDICT_trainFromBuffer_advanced() will send notifications into stderr if instructed to, using ZDICT_setNotificationLevel()
 */
 size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,

diff --git a/programs/dibio.c b/programs/dibio.c
index 23f3c81..d23476e 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c

@@ -101,27 +101,30 @@
 /* ********************************************************
 *  File related operations
 **********************************************************/
-static void DiB_loadFiles(void* buffer, size_t bufferSize,
-                          size_t* fileSizes,
-                          const char** fileNamesTable, unsigned nbFiles)
+/** DiB_loadFiles() :
+*   @return : nb of files effectively loaded into `buffer` */
+static unsigned DiB_loadFiles(void* buffer, size_t bufferSize,
+                              size_t* fileSizes,
+                              const char** fileNamesTable, unsigned nbFiles)
 {
-    char* buff = (char*)buffer;
+    char* const buff = (char*)buffer;
     size_t pos = 0;
     unsigned n;
 
     for (n=0; n<nbFiles; n++) {
-        size_t readSize;
-        unsigned long long fileSize = UTIL_getFileSize(fileNamesTable[n]);
-        FILE* f = fopen(fileNamesTable[n], "rb");
+        unsigned long long const fs64 = UTIL_getFileSize(fileNamesTable[n]);
+        size_t const fileSize = (size_t)(fs64 > bufferSize-pos ? 0 : fs64);
+        FILE* const f = fopen(fileNamesTable[n], "rb");
         if (f==NULL) EXM_THROW(10, "impossible to open file %s", fileNamesTable[n]);
         DISPLAYUPDATE(2, "Loading %s...       \r", fileNamesTable[n]);
-        if (fileSize > bufferSize-pos) fileSize = 0;  /* stop there, not enough memory to load all files */
-        readSize = fread(buff+pos, 1, (size_t)fileSize, f);
-        if (readSize != (size_t)fileSize) EXM_THROW(11, "could not read %s", fileNamesTable[n]);
-        pos += readSize;
-        fileSizes[n] = (size_t)fileSize;
+        { size_t const readSize = fread(buff+pos, 1, fileSize, f);
+          if (readSize != fileSize) EXM_THROW(11, "could not read %s", fileNamesTable[n]);
+          pos += readSize; }
+        fileSizes[n] = fileSize;
         fclose(f);
+        if (fileSize == 0) break;  /* stop there, not enough memory to load all files */
     }
+    return n;
 }
 
 
@@ -130,7 +133,7 @@
 **********************************************************/
 static size_t DiB_findMaxMem(unsigned long long requiredMem)
 {
-    size_t step = 8 MB;
+    size_t const step = 8 MB;
     void* testmem = NULL;
 
     requiredMem = (((requiredMem >> 23) + 1) << 23);
@@ -162,7 +165,7 @@
 static void DiB_saveDict(const char* dictFileName,
                          const void* buff, size_t buffSize)
 {
-    FILE* f = fopen(dictFileName, "wb");
+    FILE* const f = fopen(dictFileName, "wb");
     if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
 
     { size_t const n = fwrite(buff, 1, buffSize, f);
@@ -185,47 +188,44 @@
                               ZDICT_params_t parameters);
 
 
+#define MIN(a,b)  ((a)<(b)?(a):(b))
 int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                        const char** fileNamesTable, unsigned nbFiles,
                        ZDICT_params_t params)
 {
-    void* srcBuffer;
-    size_t benchedSize;
-    size_t* fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
-    unsigned long long totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles);
-    void* dictBuffer = malloc(maxDictSize);
-    size_t dictSize;
+    void* const dictBuffer = malloc(maxDictSize);
+    size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
+    unsigned long long const totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles);
+    size_t const maxMem =  DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
+    size_t const benchedSize = MIN (maxMem, (size_t)totalSizeToLoad);
+    void* const srcBuffer = malloc(benchedSize+NOISELENGTH);
     int result = 0;
 
+    /* Checks */
+    if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
+
     /* init */
     g_displayLevel = params.notificationLevel;
-    benchedSize = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
-    if ((unsigned long long)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
     if (benchedSize < totalSizeToLoad)
         DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
 
-    /* Memory allocation & restrictions */
-    srcBuffer = malloc(benchedSize+NOISELENGTH);     /* + noise */
-    if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles");  /* should not happen */
-
     /* Load input buffer */
-    DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
+    nbFiles = DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
     DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
 
-    /* call buffer version */
-    dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
-                        srcBuffer, fileSizes, nbFiles,
-                        params);
-    if (ZDICT_isError(dictSize)) {
-        DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
-        result = 1;
-        goto _cleanup;
+    {   size_t const dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
+                            srcBuffer, fileSizes, nbFiles,
+                            params);
+        if (ZDICT_isError(dictSize)) {
+            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
+            result = 1;
+            goto _cleanup;
+        }
+        /* save dict */
+        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
+        DiB_saveDict(dictFileName, dictBuffer, dictSize);
     }
 
-    /* save dict */
-    DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
-    DiB_saveDict(dictFileName, dictBuffer, dictSize);
-
     /* clean up */
 _cleanup:
     free(srcBuffer);

diff --git a/programs/tests/playTests.sh b/programs/tests/playTests.sh
index 1729052..ae78230 100755
--- a/programs/tests/playTests.sh
+++ b/programs/tests/playTests.sh

@@ -129,6 +129,8 @@
 diff zstdcli.c result
 $ZSTD --train *.c *.h -o tmpDictC
 $ZSTD -d tmp -D tmpDictC -of result && die "wrong dictionary not detected!"
+$ZSTD --train *.c --dictID 1 -o tmpDict1
+cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
 
 
 $ECHO "\n**** multiple files tests **** "

diff --git a/programs/zstd.1 b/programs/zstd.1
index 27d607f..1bab57a 100644
--- a/programs/zstd.1
+++ b/programs/zstd.1

@@ -18,11 +18,11 @@
 .PP
 .B unzstd
 is equivalent to
-.BR "zstd \-d" 
+.BR "zstd \-d"
 .br
 .B zstdcat
 is equivalent to
-.BR "zstd \-dc" 
+.BR "zstd \-dc"
 .br
 
 .SH DESCRIPTION
@@ -90,7 +90,15 @@
  dictionary saved into `file` (default: dictionary)
 .TP
 .B \--maxdict #
- limit dictionary to specified size (default : 112640) 
+ limit dictionary to specified size (default : 112640)
+.TP
+.B \--dictID #
+ A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary.
+ By default, zstd will create a 4-byte random number ID.
+ It's possible to give a precise number instead.
+ Short numbers have an advantage: an ID < 256 needs only 1 byte in the compressed frame header,
+ and an ID < 65536 needs only 2 bytes. This compares favorably to the 4-byte default.
+ However, it is up to the dictionary manager to never assign the same ID to 2 different dictionaries.
 .TP
 .B \-s#
  dictionary selectivity level (default: 9)

diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index 68dd98c..74f3878 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c

@@ -143,6 +143,7 @@
     DISPLAY( " -o file: `file` is dictionary name (default: %s) \n", g_defaultDictName);
     DISPLAY( "--maxdict:limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
     DISPLAY( " -s#    : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel);
+    DISPLAY( "--dictID: force dictionary ID to specified value (default: random)\n");
 #endif
 #ifndef ZSTD_NOBENCH
     DISPLAY( "\n");
@@ -185,7 +186,8 @@
         operationResult=0,
         dictBuild=0,
         nextArgumentIsOutFileName=0,
-        nextArgumentIsMaxDict=0;
+        nextArgumentIsMaxDict=0,
+        nextArgumentIsDictID=0;
     unsigned cLevel = 1;
     unsigned cLevelLast = 1;
     unsigned recursive = 0;
@@ -196,6 +198,7 @@
     const char* dictFileName = NULL;
     char* dynNameSpace = NULL;
     unsigned maxDictSize = g_defaultMaxDictSize;
+    unsigned dictID = 0;
     unsigned dictCLevel = g_defaultDictCLevel;
     unsigned dictSelect = g_defaultSelectivityLevel;
 #ifdef UTIL_HAS_CREATEFILELIST
@@ -236,6 +239,7 @@
         if (!strcmp(argument, "--test")) { decode=1; outFileName=nulmark; FIO_overwriteMode(); continue; }
         if (!strcmp(argument, "--train")) { dictBuild=1; outFileName=g_defaultDictName; continue; }
         if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
+        if (!strcmp(argument, "--dictID")) { nextArgumentIsDictID=1; continue; }
         if (!strcmp(argument, "--keep")) { continue; }   /* does nothing, since preserving input is default; for gzip/xz compatibility */
         if (!strcmp(argument, "--ultra")) { FIO_setMaxWLog(0); continue; }
         if (!strcmp(argument, "--sparse")) { FIO_setSparseWrite(2); continue; }
@@ -393,6 +397,14 @@
             continue;
         }
 
+        if (nextArgumentIsDictID) {
+            nextArgumentIsDictID = 0;
+            dictID = 0;
+            while ((*argument>='0') && (*argument<='9'))
+                dictID = dictID * 10 + (*argument - '0'), argument++;
+            continue;
+        }
+
         /* add filename to list */
         filenameTable[filenameIdx++] = argument;
     }
@@ -429,6 +441,7 @@
         dictParams.compressionLevel = dictCLevel;
         dictParams.selectivityLevel = dictSelect;
         dictParams.notificationLevel = displayLevel;
+        dictParams.dictID = dictID;
         DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, dictParams);
 #endif
         goto _end;
commit	290aaa75212334b18b0ef36c679d4d4a0704acca	[log] [tgz]
author	Yann Collet <yann.collet.73@gmail.com>	Mon May 30 21:18:52 2016 +0200
committer	Yann Collet <yann.collet.73@gmail.com>	Mon May 30 21:18:52 2016 +0200
tree	3f94c7f039dc9b9a5daa4e61c650abb4e998bed9
parent	815580a53864a17d5f486db6b9e4565ce59f5fa4 [diff]