blob: a7c8090a848d721b0a4a1c25a429477275762ebf [file] [log] [blame]
Yann Collete618f8e2016-01-28 00:29:58 +01001/*
Yann Collet7d360282016-02-12 00:07:30 +01002 dictBuilder - dictionary builder for zstd
Yann Collete618f8e2016-01-28 00:29:58 +01003 Copyright (C) Yann Collet 2016
4
Yann Collet7d360282016-02-12 00:07:30 +01005 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
Yann Collete618f8e2016-01-28 00:29:58 +01006
Yann Collet7d360282016-02-12 00:07:30 +01007 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are
9 met:
Yann Collete618f8e2016-01-28 00:29:58 +010010
Yann Collet7d360282016-02-12 00:07:30 +010011 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13 * Redistributions in binary form must reproduce the above
14 copyright notice, this list of conditions and the following disclaimer
15 in the documentation and/or other materials provided with the
16 distribution.
Yann Collete618f8e2016-01-28 00:29:58 +010017
Yann Collet7d360282016-02-12 00:07:30 +010018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Yann Collete618f8e2016-01-28 00:29:58 +010029
30 You can contact the author at :
Yann Collet71eafdd2016-02-12 02:31:57 +010031 - Zstd homepage : https://www.zstd.net
Yann Collete618f8e2016-01-28 00:29:58 +010032*/
33
Yann Collet7d360282016-02-12 00:07:30 +010034/*-**************************************
Yann Collete618f8e2016-01-28 00:29:58 +010035* Compiler Options
36****************************************/
37/* Disable some Visual warning messages */
38#ifdef _MSC_VER
Yann Collete618f8e2016-01-28 00:29:58 +010039# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
40#endif
41
42/* Unix Large Files support (>4GB) */
43#define _FILE_OFFSET_BITS 64
44#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
45# define _LARGEFILE_SOURCE
46#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
47# define _LARGEFILE64_SOURCE
48#endif
49
Yann Collete618f8e2016-01-28 00:29:58 +010050
Yann Collet863ec402016-01-28 17:56:33 +010051/*-*************************************
Yann Collet7d360282016-02-12 00:07:30 +010052* Dependencies
Yann Collete618f8e2016-01-28 00:29:58 +010053***************************************/
Yann Collet7682e492016-01-31 23:45:35 +010054#include <stdlib.h> /* malloc, free */
55#include <string.h> /* memset */
56#include <stdio.h> /* fprintf, fopen, ftello64 */
57#include <sys/types.h> /* stat64 */
58#include <sys/stat.h> /* stat64 */
59#include <time.h> /* clock */
Yann Collete618f8e2016-01-28 00:29:58 +010060
Yann Collet7682e492016-01-31 23:45:35 +010061#include "mem.h" /* read */
Yann Colletf5229e02016-01-29 02:45:26 +010062#include "error_private.h"
Yann Collet7d360282016-02-12 00:07:30 +010063#include "fse.h"
Yann Collete618f8e2016-01-28 00:29:58 +010064#include "huff0_static.h"
Yann Collet7d360282016-02-12 00:07:30 +010065#include "zstd_internal.h"
Yann Colletf4c9d752016-02-12 18:45:02 +010066#include "divsufsort.h"
67#include "zdict_static.h"
Yann Collete618f8e2016-01-28 00:29:58 +010068
69
Yann Collet7682e492016-01-31 23:45:35 +010070/*-*************************************
Yann Collete618f8e2016-01-28 00:29:58 +010071* Compiler specifics
72***************************************/
73#if !defined(S_ISREG)
74# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
75#endif
76
Yann Collete618f8e2016-01-28 00:29:58 +010077
Yann Collet863ec402016-01-28 17:56:33 +010078/*-*************************************
Yann Collete618f8e2016-01-28 00:29:58 +010079* Constants
80***************************************/
81#define KB *(1 <<10)
82#define MB *(1 <<20)
83#define GB *(1U<<30)
84
85#define DICTLISTSIZE 10000
Yann Collete618f8e2016-01-28 00:29:58 +010086
87#define NOISELENGTH 32
88#define PRIME1 2654435761U
89#define PRIME2 2246822519U
90
91#define MINRATIO 4
Yann Colletf5229e02016-01-29 02:45:26 +010092static const U32 g_compressionLevel_default = 5;
Yann Collet7682e492016-01-31 23:45:35 +010093static const U32 g_selectivity_default = 9;
94static const size_t g_provision_entropySize = 200;
95static const size_t g_min_fast_dictContent = 192;
Yann Collete618f8e2016-01-28 00:29:58 +010096
97
Yann Collet863ec402016-01-28 17:56:33 +010098/*-*************************************
99* Console display
Yann Collete618f8e2016-01-28 00:29:58 +0100100***************************************/
101#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
102#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
Yann Colletb31de732016-01-28 10:30:02 +0100103static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
Yann Collete618f8e2016-01-28 00:29:58 +0100104
105#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
Yann Collet7d360282016-02-12 00:07:30 +0100106 if (ZDICT_GetMilliSpan(g_time) > refreshRate) \
Yann Collete618f8e2016-01-28 00:29:58 +0100107 { g_time = clock(); DISPLAY(__VA_ARGS__); \
108 if (g_displayLevel>=4) fflush(stdout); } }
109static const unsigned refreshRate = 300;
110static clock_t g_time = 0;
111
Yann Collet6f3acba2016-02-12 20:19:48 +0100112static void ZDICT_printHex(U32 dlevel, const void* ptr, size_t length)
Yann Collete618f8e2016-01-28 00:29:58 +0100113{
114 const BYTE* const b = (const BYTE*)ptr;
115 size_t u;
116 for (u=0; u<length; u++)
117 {
118 BYTE c = b[u];
119 if (c<32 || c>126) c = '.'; /* non-printable char */
120 DISPLAYLEVEL(dlevel, "%c", c);
121 }
122}
123
124
Yann Collet7d360282016-02-12 00:07:30 +0100125/*-********************************************************
Yann Collete618f8e2016-01-28 00:29:58 +0100126* Helper functions
127**********************************************************/
Yann Collet7d360282016-02-12 00:07:30 +0100128static unsigned ZDICT_GetMilliSpan(clock_t nPrevious)
Yann Collete618f8e2016-01-28 00:29:58 +0100129{
130 clock_t nCurrent = clock();
131 unsigned nSpan = (unsigned)(((nCurrent - nPrevious) * 1000) / CLOCKS_PER_SEC);
132 return nSpan;
133}
134
Yann Collet7d360282016-02-12 00:07:30 +0100135unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
Yann Collet7682e492016-01-31 23:45:35 +0100136
Yann Collet7d360282016-02-12 00:07:30 +0100137const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
Yann Collete618f8e2016-01-28 00:29:58 +0100138
139
140/*-********************************************************
141* Dictionary training functions
142**********************************************************/
Yann Collet7d360282016-02-12 00:07:30 +0100143static unsigned ZDICT_NbCommonBytes (register size_t val)
Yann Collete618f8e2016-01-28 00:29:58 +0100144{
145 if (MEM_isLittleEndian()) {
146 if (MEM_64bits()) {
147# if defined(_MSC_VER) && defined(_WIN64)
148 unsigned long r = 0;
149 _BitScanForward64( &r, (U64)val );
150 return (unsigned)(r>>3);
151# elif defined(__GNUC__) && (__GNUC__ >= 3)
152 return (__builtin_ctzll((U64)val) >> 3);
153# else
154 static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
155 return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
156# endif
157 } else { /* 32 bits */
158# if defined(_MSC_VER)
159 unsigned long r=0;
160 _BitScanForward( &r, (U32)val );
161 return (unsigned)(r>>3);
162# elif defined(__GNUC__) && (__GNUC__ >= 3)
163 return (__builtin_ctz((U32)val) >> 3);
164# else
165 static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
166 return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
167# endif
168 }
169 } else { /* Big Endian CPU */
170 if (MEM_64bits()) {
171# if defined(_MSC_VER) && defined(_WIN64)
172 unsigned long r = 0;
173 _BitScanReverse64( &r, val );
174 return (unsigned)(r>>3);
175# elif defined(__GNUC__) && (__GNUC__ >= 3)
176 return (__builtin_clzll(val) >> 3);
177# else
178 unsigned r;
179 const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
180 if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
181 if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
182 r += (!val);
183 return r;
184# endif
185 } else { /* 32 bits */
186# if defined(_MSC_VER)
187 unsigned long r = 0;
188 _BitScanReverse( &r, (unsigned long)val );
189 return (unsigned)(r>>3);
190# elif defined(__GNUC__) && (__GNUC__ >= 3)
191 return (__builtin_clz((U32)val) >> 3);
192# else
193 unsigned r;
194 if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
195 r += (!val);
196 return r;
197# endif
198 } }
199}
200
201
Yann Collet7d360282016-02-12 00:07:30 +0100202/*! ZDICT_count() :
Yann Collete618f8e2016-01-28 00:29:58 +0100203 Count the nb of common bytes between 2 pointers.
204 Note : this function presumes end of buffer followed by noisy guard band.
205*/
Yann Collet7d360282016-02-12 00:07:30 +0100206static size_t ZDICT_count(const void* pIn, const void* pMatch)
Yann Collete618f8e2016-01-28 00:29:58 +0100207{
208 const char* const pStart = (const char*)pIn;
209 for (;;) {
Yann Collet7d360282016-02-12 00:07:30 +0100210 size_t diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
Yann Collete618f8e2016-01-28 00:29:58 +0100211 if (!diff) { pIn = (const char*)pIn+sizeof(size_t); pMatch = (const char*)pMatch+sizeof(size_t); continue; }
Yann Collet7d360282016-02-12 00:07:30 +0100212 pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
Yann Collete618f8e2016-01-28 00:29:58 +0100213 return (size_t)((const char*)pIn - pStart);
214 }
215}
216
217
218typedef struct {
219 U32 pos;
220 U32 length;
221 U32 savings;
222} dictItem;
223
Yann Collet7d360282016-02-12 00:07:30 +0100224static void ZDICT_initDictItem(dictItem* d)
Yann Collete618f8e2016-01-28 00:29:58 +0100225{
226 d->pos = 1;
227 d->length = 0;
228 d->savings = (U32)(-1);
229}
230
231
232#define LLIMIT 64 /* heuristic determined experimentally */
233#define MINMATCHLENGTH 7 /* heuristic determined experimentally */
Yann Collet7d360282016-02-12 00:07:30 +0100234static dictItem ZDICT_analyzePos(
Yann Collete618f8e2016-01-28 00:29:58 +0100235 BYTE* doneMarks,
Yann Collet7d360282016-02-12 00:07:30 +0100236 const int* suffix, U32 start,
Yann Collete618f8e2016-01-28 00:29:58 +0100237 const void* buffer, U32 minRatio)
238{
239 U32 lengthList[LLIMIT] = {0};
240 U32 cumulLength[LLIMIT] = {0};
241 U32 savings[LLIMIT] = {0};
242 const BYTE* b = (const BYTE*)buffer;
243 size_t length;
244 size_t maxLength = LLIMIT;
245 size_t pos = suffix[start];
246 U32 end = start;
247 dictItem solution;
248
Yann Collete618f8e2016-01-28 00:29:58 +0100249 /* init */
250 memset(&solution, 0, sizeof(solution));
251 doneMarks[pos] = 1;
252
253 /* trivial repetition cases */
254 if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
255 ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
256 ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
257 /* skip and mark segment */
258 U16 u16 = MEM_read16(b+pos+4);
259 U32 u, e = 6;
260 while (MEM_read16(b+pos+e) == u16) e+=2 ;
261 if (b[pos+e] == b[pos+e-1]) e++;
262 for (u=1; u<e; u++)
263 doneMarks[pos+u] = 1;
264 return solution;
265 }
266
267 /* look forward */
268 do {
269 end++;
Yann Collet7d360282016-02-12 00:07:30 +0100270 length = ZDICT_count(b + pos, b + suffix[end]);
Yann Collete618f8e2016-01-28 00:29:58 +0100271 } while (length >=MINMATCHLENGTH);
272
273 /* look backward */
274 do {
Yann Collet7d360282016-02-12 00:07:30 +0100275 length = ZDICT_count(b + pos, b + *(suffix+start-1));
Yann Collete618f8e2016-01-28 00:29:58 +0100276 if (length >=MINMATCHLENGTH) start--;
277 } while(length >= MINMATCHLENGTH);
278
279 /* exit if not found a minimum nb of repetitions */
280 if (end-start < minRatio) {
281 U32 idx;
282 for(idx=start; idx<end; idx++)
283 doneMarks[suffix[idx]] = 1;
284 return solution;
285 }
286
287 {
288 int i;
289 U32 searchLength;
290 U32 refinedStart = start;
291 U32 refinedEnd = end;
292
293 DISPLAYLEVEL(4, "\n");
294 DISPLAYLEVEL(4, "found %3u matches of length >= %u at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
295 DISPLAYLEVEL(4, "\n");
296
297 for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
298 BYTE currentChar = 0;
299 U32 currentCount = 0;
300 U32 currentID = refinedStart;
301 U32 id;
302 U32 selectedCount = 0;
303 U32 selectedID = currentID;
304 for (id =refinedStart; id < refinedEnd; id++) {
305 if (b[ suffix[id] + searchLength] != currentChar) {
306 if (currentCount > selectedCount) {
307 selectedCount = currentCount;
308 selectedID = currentID;
309 }
310 currentID = id;
311 currentChar = b[ suffix[id] + searchLength];
312 currentCount = 0;
313 }
314 currentCount ++;
315 }
316 if (currentCount > selectedCount) { /* for last */
317 selectedCount = currentCount;
318 selectedID = currentID;
319 }
320
321 if (selectedCount < minRatio)
322 break;
Yann Collete618f8e2016-01-28 00:29:58 +0100323 refinedStart = selectedID;
324 refinedEnd = refinedStart + selectedCount;
325 }
326
327 /* evaluate gain based on new ref */
328 start = refinedStart;
329 pos = suffix[refinedStart];
330 end = start;
331 memset(lengthList, 0, sizeof(lengthList));
332
333 /* look forward */
334 do {
335 end++;
Yann Collet7d360282016-02-12 00:07:30 +0100336 length = ZDICT_count(b + pos, b + suffix[end]);
Yann Collete618f8e2016-01-28 00:29:58 +0100337 if (length >= LLIMIT) length = LLIMIT-1;
338 lengthList[length]++;
339 } while (length >=MINMATCHLENGTH);
340
341 /* look backward */
342 do {
Yann Collet7d360282016-02-12 00:07:30 +0100343 length = ZDICT_count(b + pos, b + suffix[start-1]);
Yann Collete618f8e2016-01-28 00:29:58 +0100344 if (length >= LLIMIT) length = LLIMIT-1;
345 lengthList[length]++;
346 if (length >=MINMATCHLENGTH) start--;
347 } while(length >= MINMATCHLENGTH);
348
349 /* largest useful length */
350 memset(cumulLength, 0, sizeof(cumulLength));
351 cumulLength[maxLength-1] = lengthList[maxLength-1];
Yann Collet9cadd082016-01-28 15:39:52 +0100352 for (i=(int)(maxLength-2); i>=0; i--)
Yann Collete618f8e2016-01-28 00:29:58 +0100353 cumulLength[i] = cumulLength[i+1] + lengthList[i];
354
355 for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
356 maxLength = i;
357
358 /* reduce maxLength in case of final into repetitive data */
359 {
Yann Collet9cadd082016-01-28 15:39:52 +0100360 U32 l = (U32)maxLength;
Yann Collete618f8e2016-01-28 00:29:58 +0100361 BYTE c = b[pos + maxLength-1];
362 while (b[pos+l-2]==c) l--;
363 maxLength = l;
364 }
365 if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */
366
367 /* calculate savings */
368 savings[5] = 0;
369 for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
370 savings[i] = savings[i-1] + (lengthList[i] * (i-3));
371
372 DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
373 (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
374
Yann Collet9cadd082016-01-28 15:39:52 +0100375 solution.pos = (U32)pos;
376 solution.length = (U32)maxLength;
Yann Collete618f8e2016-01-28 00:29:58 +0100377 solution.savings = savings[maxLength];
378
379 /* mark positions done */
380 {
381 U32 id;
382 U32 testedPos;
383 for (id=start; id<end; id++) {
384 U32 p, pEnd;
385 testedPos = suffix[id];
386 if (testedPos == pos)
387 length = solution.length;
388 else {
Yann Collet7d360282016-02-12 00:07:30 +0100389 length = ZDICT_count(b+pos, b+testedPos);
Yann Collete618f8e2016-01-28 00:29:58 +0100390 if (length > solution.length) length = solution.length;
391 }
Yann Collet9cadd082016-01-28 15:39:52 +0100392 pEnd = (U32)(testedPos + length);
Yann Collete618f8e2016-01-28 00:29:58 +0100393 for (p=testedPos; p<pEnd; p++)
394 doneMarks[p] = 1;
395 } } }
396
397 return solution;
398}
399
400
Yann Collet7d360282016-02-12 00:07:30 +0100401/*! ZDICT_checkMerge
Yann Collete618f8e2016-01-28 00:29:58 +0100402 check if dictItem can be merged, do it if possible
403 @return : id of destination elt, 0 if not merged
404*/
Yann Collet7d360282016-02-12 00:07:30 +0100405static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
Yann Collete618f8e2016-01-28 00:29:58 +0100406{
407 const U32 tableSize = table->pos;
408 const U32 max = elt.pos + (elt.length-1);
409
410 /* tail overlap */
411 U32 u; for (u=1; u<tableSize; u++) {
412 if (u==eltNbToSkip) continue;
413 if ((table[u].pos > elt.pos) && (table[u].pos < max)) { /* overlap */
414 /* append */
415 U32 addedLength = table[u].pos - elt.pos;
416 table[u].length += addedLength;
417 table[u].pos = elt.pos;
418 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
419 table[u].savings += elt.length / 8; /* rough approx */
420 elt = table[u];
421 while ((u>1) && (table[u-1].savings < elt.savings))
422 table[u] = table[u-1], u--;
423 table[u] = elt;
424 return u;
425 } }
426
427 /* front overlap */
428 for (u=1; u<tableSize; u++) {
429 if (u==eltNbToSkip) continue;
430 if ((table[u].pos + table[u].length > elt.pos) && (table[u].pos < elt.pos)) { /* overlap */
431 /* append */
432 int addedLength = (elt.pos + elt.length) - (table[u].pos + table[u].length);
433 table[u].savings += elt.length / 8; /* rough approx */
434 if (addedLength > 0) { /* otherwise, already included */
435 table[u].length += addedLength;
436 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
437 }
438 elt = table[u];
439 while ((u>1) && (table[u-1].savings < elt.savings))
440 table[u] = table[u-1], u--;
441 table[u] = elt;
442 return u;
443 } }
444
445 return 0;
446}
447
448
Yann Collet7d360282016-02-12 00:07:30 +0100449static void ZDICT_removeDictItem(dictItem* table, U32 id)
Yann Collete618f8e2016-01-28 00:29:58 +0100450{
451 /* convention : first element is nb of elts */
452 U32 max = table->pos;
453 U32 u;
454 if (!id) return; /* protection, should never happen */
Yann Collet9cadd082016-01-28 15:39:52 +0100455 for (u=id; u<max-1; u++)
Yann Collete618f8e2016-01-28 00:29:58 +0100456 table[u] = table[u+1];
457 table->pos--;
458}
459
460
Yann Collet7d360282016-02-12 00:07:30 +0100461static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
Yann Collete618f8e2016-01-28 00:29:58 +0100462{
463 /* merge if possible */
Yann Collet7d360282016-02-12 00:07:30 +0100464 U32 mergeId = ZDICT_checkMerge(table, elt, 0);
Yann Collete618f8e2016-01-28 00:29:58 +0100465 if (mergeId) {
466 U32 newMerge = 1;
467 while (newMerge) {
Yann Collet7d360282016-02-12 00:07:30 +0100468 newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
469 if (newMerge) ZDICT_removeDictItem(table, mergeId);
Yann Collete618f8e2016-01-28 00:29:58 +0100470 mergeId = newMerge;
471 }
472 return;
473 }
474
475 /* insert */
476 {
477 U32 current;
478 U32 nextElt = table->pos;
479 if (nextElt >= maxSize) nextElt = maxSize-1;
480 current = nextElt-1;
481 while (table[current].savings < elt.savings) {
482 table[current+1] = table[current];
483 current--;
484 }
485 table[current+1] = elt;
486 table->pos = nextElt+1;
487 }
488}
489
490
Yann Collet7d360282016-02-12 00:07:30 +0100491static U32 ZDICT_dictSize(const dictItem* dictList)
Yann Collete618f8e2016-01-28 00:29:58 +0100492{
493 U32 u, dictSize = 0;
494 for (u=1; u<dictList[0].pos; u++)
495 dictSize += dictList[u].length;
496 return dictSize;
497}
498
499
Yann Collet71eafdd2016-02-12 02:31:57 +0100500static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
Yann Collete618f8e2016-01-28 00:29:58 +0100501 const void* const buffer, const size_t bufferSize, /* buffer must end with noisy guard band */
Yann Collet7682e492016-01-31 23:45:35 +0100502 const size_t* fileSizes, unsigned nbFiles,
503 U32 shiftRatio, unsigned maxDictSize)
Yann Collete618f8e2016-01-28 00:29:58 +0100504{
Yann Collet7d360282016-02-12 00:07:30 +0100505 int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
506 int* const suffix = suffix0+1;
Yann Collete618f8e2016-01-28 00:29:58 +0100507 U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
508 BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
509 U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
510 U32 minRatio = nbFiles >> shiftRatio;
Yann Collet7d360282016-02-12 00:07:30 +0100511 int divSuftSortResult;
Yann Collet71eafdd2016-02-12 02:31:57 +0100512 size_t result = 0;
Yann Collete618f8e2016-01-28 00:29:58 +0100513
514 /* init */
515 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
Yann Collet71eafdd2016-02-12 02:31:57 +0100516 if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
517 result = ERROR(memory_allocation);
518 goto _cleanup;
519 }
Yann Collete618f8e2016-01-28 00:29:58 +0100520 if (minRatio < MINRATIO) minRatio = MINRATIO;
521 memset(doneMarks, 0, bufferSize+16);
522
523 /* sort */
Yann Collet7682e492016-01-31 23:45:35 +0100524 DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
Yann Collet7d360282016-02-12 00:07:30 +0100525 divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
Yann Collet71eafdd2016-02-12 02:31:57 +0100526 if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
Yann Collet7d360282016-02-12 00:07:30 +0100527 suffix[bufferSize] = (int)bufferSize; /* leads into noise */
528 suffix0[0] = (int)bufferSize; /* leads into noise */
Yann Collete618f8e2016-01-28 00:29:58 +0100529 {
530 /* build reverse suffix sort */
531 size_t pos;
532 for (pos=0; pos < bufferSize; pos++)
Yann Collet9cadd082016-01-28 15:39:52 +0100533 reverseSuffix[suffix[pos]] = (U32)pos;
Yann Collete618f8e2016-01-28 00:29:58 +0100534 /* build file pos */
535 filePos[0] = 0;
536 for (pos=1; pos<nbFiles; pos++)
Yann Collet9cadd082016-01-28 15:39:52 +0100537 filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
Yann Collete618f8e2016-01-28 00:29:58 +0100538 }
539
540 DISPLAYLEVEL(2, "finding patterns ... \n");
Yann Collet82516192016-01-29 16:48:10 +0100541 DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
Yann Collete618f8e2016-01-28 00:29:58 +0100542
543 {
544 U32 cursor; for (cursor=0; cursor < bufferSize; ) {
545 dictItem solution;
Yann Collete618f8e2016-01-28 00:29:58 +0100546 if (doneMarks[cursor]) { cursor++; continue; }
Yann Collet7d360282016-02-12 00:07:30 +0100547 solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
Yann Collete618f8e2016-01-28 00:29:58 +0100548 if (solution.length==0) { cursor++; continue; }
Yann Collet7d360282016-02-12 00:07:30 +0100549 ZDICT_insertDictItem(dictList, dictListSize, solution);
Yann Collete618f8e2016-01-28 00:29:58 +0100550 cursor += solution.length;
551 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
Yann Collet82516192016-01-29 16:48:10 +0100552 } }
Yann Collete618f8e2016-01-28 00:29:58 +0100553
554 /* limit dictionary size */
555 {
556 U32 max = dictList->pos; /* convention : nb of useful elts within dictList */
557 U32 currentSize = 0;
558 U32 n; for (n=1; n<max; n++) {
559 currentSize += dictList[n].length;
560 if (currentSize > maxDictSize) break;
561 }
562 dictList->pos = n;
563 }
564
Yann Collet71eafdd2016-02-12 02:31:57 +0100565_cleanup:
Yann Collete618f8e2016-01-28 00:29:58 +0100566 free(suffix0);
567 free(reverseSuffix);
568 free(doneMarks);
569 free(filePos);
Yann Collet71eafdd2016-02-12 02:31:57 +0100570 return result;
Yann Collete618f8e2016-01-28 00:29:58 +0100571}
572
573
Yann Collet7d360282016-02-12 00:07:30 +0100574static void ZDICT_fillNoise(void* buffer, size_t length)
Yann Collete618f8e2016-01-28 00:29:58 +0100575{
576 unsigned acc = PRIME1;
577 size_t p=0;;
578
579 for (p=0; p<length; p++) {
580 acc *= PRIME2;
581 ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
582 }
583}
584
585
586typedef struct
587{
588 ZSTD_CCtx* ref;
589 ZSTD_CCtx* zc;
Yann Collet569b81a2016-03-16 15:26:51 +0100590 void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
Yann Collete618f8e2016-01-28 00:29:58 +0100591} EStats_ress_t;
592
593
Yann Collet7d360282016-02-12 00:07:30 +0100594static void ZDICT_countEStats(EStats_ress_t esr,
Yann Collete618f8e2016-01-28 00:29:58 +0100595 U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount,
596 const void* src, size_t srcSize)
597{
598 const BYTE* bytePtr;
599 const U32* u32Ptr;
Yann Collet7d360282016-02-12 00:07:30 +0100600 seqStore_t seqStore;
Yann Collete618f8e2016-01-28 00:29:58 +0100601
Yann Collet569b81a2016-03-16 15:26:51 +0100602 if (srcSize > ZSTD_BLOCKSIZE_MAX) srcSize = ZSTD_BLOCKSIZE_MAX; /* protection vs large samples */
Yann Collete618f8e2016-01-28 00:29:58 +0100603 ZSTD_copyCCtx(esr.zc, esr.ref);
Yann Collet569b81a2016-03-16 15:26:51 +0100604 ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
Yann Collet7d360282016-02-12 00:07:30 +0100605 seqStore = ZSTD_copySeqStore(esr.zc);
Yann Collete618f8e2016-01-28 00:29:58 +0100606
607 /* count stats */
Yann Collet7d360282016-02-12 00:07:30 +0100608 for(bytePtr = seqStore.litStart; bytePtr < seqStore.lit; bytePtr++)
Yann Collete618f8e2016-01-28 00:29:58 +0100609 countLit[*bytePtr]++;
Yann Collet7d360282016-02-12 00:07:30 +0100610 for(u32Ptr = seqStore.offsetStart; u32Ptr < seqStore.offset; u32Ptr++) {
Yann Collete618f8e2016-01-28 00:29:58 +0100611 BYTE offcode = (BYTE)ZSTD_highbit(*u32Ptr) + 1;
612 if (*u32Ptr==0) offcode=0;
613 offsetcodeCount[offcode]++;
614 }
Yann Collet7d360282016-02-12 00:07:30 +0100615 for(bytePtr = seqStore.matchLengthStart; bytePtr < seqStore.matchLength; bytePtr++)
Yann Collete618f8e2016-01-28 00:29:58 +0100616 matchlengthCount[*bytePtr]++;
Yann Collet7d360282016-02-12 00:07:30 +0100617 for(bytePtr = seqStore.litLengthStart; bytePtr < seqStore.litLength; bytePtr++)
Yann Collete618f8e2016-01-28 00:29:58 +0100618 litlengthCount[*bytePtr]++;
619}
620
Yann Collet3152a8c2016-02-23 21:28:59 +0100621static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
622{
623 unsigned u;
624 size_t max=0;
625 for (u=0; u<nbFiles; u++)
626 if (max < fileSizes[u]) max = fileSizes[u];
627 return max;
628}
Yann Collete618f8e2016-01-28 00:29:58 +0100629
Yann Collet7d360282016-02-12 00:07:30 +0100630#define OFFCODE_MAX 18 /* only applicable to first block */
631static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
Yann Colletf5229e02016-01-29 02:45:26 +0100632 unsigned compressionLevel,
Yann Collet7682e492016-01-31 23:45:35 +0100633 const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
Yann Collete618f8e2016-01-28 00:29:58 +0100634 const void* dictBuffer, size_t dictBufferSize)
635{
636 U32 countLit[256];
637 U32 offcodeCount[MaxOff+1];
638 HUF_CREATE_STATIC_CTABLE(hufTable, 255);
639 short offcodeNCount[MaxOff+1];
640 U32 matchLengthCount[MaxML+1];
641 short matchLengthNCount[MaxML+1];
642 U32 litlengthCount[MaxLL+1];
643 short litlengthNCount[MaxLL+1];
644 EStats_ress_t esr;
645 ZSTD_parameters params;
646 U32 u, huffLog = 12, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
647 size_t pos = 0, errorCode;
648 size_t eSize = 0;
649
650 /* init */
651 for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
652 for (u=0; u<=OFFCODE_MAX; u++) offcodeCount[u]=1;
653 for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
654 for (u=0; u<=MaxLL; u++) litlengthCount[u]=1;
655 esr.ref = ZSTD_createCCtx();
656 esr.zc = ZSTD_createCCtx();
Yann Collet569b81a2016-03-16 15:26:51 +0100657 esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
Yann Collet71eafdd2016-02-12 02:31:57 +0100658 if (!esr.ref || !esr.zc || !esr.workPlace) {
659 eSize = ERROR(memory_allocation);
660 DISPLAYLEVEL(1, "Not enough memory");
661 goto _cleanup;
662 }
Yann Colletf5229e02016-01-29 02:45:26 +0100663 if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
Yann Collet3152a8c2016-02-23 21:28:59 +0100664 params = ZSTD_getParams(compressionLevel, MAX(dictBufferSize, ZDICT_maxSampleSize(fileSizes, nbFiles)));
Yann Collete618f8e2016-01-28 00:29:58 +0100665 params.strategy = ZSTD_greedy;
666 ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params);
667
668 /* collect stats on all files */
669 for (u=0; u<nbFiles; u++) {
Yann Collet7d360282016-02-12 00:07:30 +0100670 ZDICT_countEStats(esr,
Yann Collete618f8e2016-01-28 00:29:58 +0100671 countLit, offcodeCount, matchLengthCount, litlengthCount,
672 (const char*)srcBuffer + pos, fileSizes[u]);
673 pos += fileSizes[u];
674 }
675
676 /* analyze */
677 errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
Yann Collet71eafdd2016-02-12 02:31:57 +0100678 if (HUF_isError(errorCode)) {
679 eSize = ERROR(GENERIC);
680 DISPLAYLEVEL(1, "HUF_buildCTable error");
681 goto _cleanup;
682 }
Yann Collete618f8e2016-01-28 00:29:58 +0100683 huffLog = (U32)errorCode;
684
685 total=0; for (u=0; u<=OFFCODE_MAX; u++) total+=offcodeCount[u];
686 errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, OFFCODE_MAX);
Yann Collet71eafdd2016-02-12 02:31:57 +0100687 if (FSE_isError(errorCode)) {
688 eSize = ERROR(GENERIC);
689 DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount");
690 goto _cleanup;
691 }
Yann Collete618f8e2016-01-28 00:29:58 +0100692 Offlog = (U32)errorCode;
693
694 total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
695 errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
Yann Collet71eafdd2016-02-12 02:31:57 +0100696 if (FSE_isError(errorCode)) {
697 eSize = ERROR(GENERIC);
698 DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount");
699 goto _cleanup;
700 }
Yann Collete618f8e2016-01-28 00:29:58 +0100701 mlLog = (U32)errorCode;
702
703 total=0; for (u=0; u<=MaxLL; u++) total+=litlengthCount[u];
704 errorCode = FSE_normalizeCount(litlengthNCount, llLog, litlengthCount, total, MaxLL);
Yann Collet71eafdd2016-02-12 02:31:57 +0100705 if (FSE_isError(errorCode)) {
706 eSize = ERROR(GENERIC);
707 DISPLAYLEVEL(1, "FSE_normalizeCount error with litlengthCount");
708 goto _cleanup;
709 }
Yann Collete618f8e2016-01-28 00:29:58 +0100710 llLog = (U32)errorCode;
711
712 /* write result to buffer */
713 errorCode = HUF_writeCTable(dstBuffer, maxDstSize, hufTable, 255, huffLog);
Yann Collet71eafdd2016-02-12 02:31:57 +0100714 if (HUF_isError(errorCode)) {
715 eSize = ERROR(GENERIC);
716 DISPLAYLEVEL(1, "HUF_writeCTable error");
717 goto _cleanup;
718 }
Yann Collete618f8e2016-01-28 00:29:58 +0100719 dstBuffer = (char*)dstBuffer + errorCode;
720 maxDstSize -= errorCode;
721 eSize += errorCode;
722
723 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
Yann Collet71eafdd2016-02-12 02:31:57 +0100724 if (FSE_isError(errorCode)) {
725 eSize = ERROR(GENERIC);
726 DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount");
727 goto _cleanup;
728 }
Yann Collete618f8e2016-01-28 00:29:58 +0100729 dstBuffer = (char*)dstBuffer + errorCode;
730 maxDstSize -= errorCode;
731 eSize += errorCode;
732
733 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, matchLengthNCount, MaxML, mlLog);
Yann Collet71eafdd2016-02-12 02:31:57 +0100734 if (FSE_isError(errorCode)) {
735 eSize = ERROR(GENERIC);
736 DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount");
737 goto _cleanup;
738 }
Yann Collete618f8e2016-01-28 00:29:58 +0100739 dstBuffer = (char*)dstBuffer + errorCode;
740 maxDstSize -= errorCode;
741 eSize += errorCode;
742
743 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, litlengthNCount, MaxLL, llLog);
Yann Collet71eafdd2016-02-12 02:31:57 +0100744 if (FSE_isError(errorCode)) {
745 eSize = ERROR(GENERIC);
746 DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount");
747 goto _cleanup;
748 }
Yann Collete618f8e2016-01-28 00:29:58 +0100749 dstBuffer = (char*)dstBuffer + errorCode;
750 maxDstSize -= errorCode;
751 eSize += errorCode;
752
Yann Collet71eafdd2016-02-12 02:31:57 +0100753_cleanup:
Yann Collete618f8e2016-01-28 00:29:58 +0100754 ZSTD_freeCCtx(esr.ref);
755 ZSTD_freeCCtx(esr.zc);
756 free(esr.workPlace);
757
758 return eSize;
759}
760
761
Yann Colletf5229e02016-01-29 02:45:26 +0100762#define DIB_FASTSEGMENTSIZE 64
Yann Collet71eafdd2016-02-12 02:31:57 +0100763/*! ZDICT_fastSampling() (based on an idea proposed by Giuseppe Ottaviano) :
764 Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`,
Yann Collet7d360282016-02-12 00:07:30 +0100765 up to `dictSize`.
766 Filling starts from the end of `dictBuffer`, down to maximum possible.
767 if `dictSize` is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
Yann Collet71eafdd2016-02-12 02:31:57 +0100768 @return : amount of data written into `dictBuffer`,
Yann Collet7d360282016-02-12 00:07:30 +0100769 or an error code
Yann Colletf5229e02016-01-29 02:45:26 +0100770*/
Yann Collet7d360282016-02-12 00:07:30 +0100771static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
Yann Colletf5229e02016-01-29 02:45:26 +0100772 const void* samplesBuffer, size_t samplesSize)
773{
774 char* dstPtr = (char*)dictBuffer + dictSize;
775 const char* srcPtr = (const char*)samplesBuffer;
776 size_t nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
777 size_t segNb, interSize;
778
779 if (nbSegments <= 2) return ERROR(srcSize_wrong);
780 if (samplesSize < dictSize) return ERROR(srcSize_wrong);
781
782 /* first and last segments are part of dictionary, in case they contain interesting header/footer */
783 dstPtr -= DIB_FASTSEGMENTSIZE;
784 memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
785 dstPtr -= DIB_FASTSEGMENTSIZE;
786 memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
787
788 /* regularly copy a segment */
789 interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
790 srcPtr += DIB_FASTSEGMENTSIZE;
791 for (segNb=2; segNb < nbSegments; segNb++) {
792 srcPtr += interSize;
793 dstPtr -= DIB_FASTSEGMENTSIZE;
794 memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
795 srcPtr += DIB_FASTSEGMENTSIZE;
796 }
797
798 return nbSegments * DIB_FASTSEGMENTSIZE;
799}
800
801
Yann Collet7d360282016-02-12 00:07:30 +0100802size_t ZDICT_trainFromBuffer_unsafe(
Yann Collet7682e492016-01-31 23:45:35 +0100803 void* dictBuffer, size_t maxDictSize,
804 const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples,
Yann Collet7d360282016-02-12 00:07:30 +0100805 ZDICT_params_t params)
Yann Collete618f8e2016-01-28 00:29:58 +0100806{
Yann Colletb35c4642016-02-01 00:03:10 +0100807 const U32 dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
Yann Collete618f8e2016-01-28 00:29:58 +0100808 dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
Yann Collet7682e492016-01-31 23:45:35 +0100809 unsigned selectivity = params.selectivityLevel;
810 unsigned compressionLevel = params.compressionLevel;
Yann Collet3152a8c2016-02-23 21:28:59 +0100811 size_t targetDictSize = maxDictSize;
Yann Collet7682e492016-01-31 23:45:35 +0100812 size_t sBuffSize;
813 size_t dictSize = 0;
814
815 /* checks */
816 if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) return ERROR(dstSize_tooSmall);
Yann Collete618f8e2016-01-28 00:29:58 +0100817
818 /* init */
Yann Collet7682e492016-01-31 23:45:35 +0100819 { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += sampleSizes[u]; }
Yann Collet7d360282016-02-12 00:07:30 +0100820 if (!dictList) return ERROR(memory_allocation);
821 ZDICT_initDictItem(dictList);
Yann Collet6f3acba2016-02-12 20:19:48 +0100822 g_displayLevel = params.notificationLevel;
Yann Collet7682e492016-01-31 23:45:35 +0100823 if (selectivity==0) selectivity = g_selectivity_default;
824 if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
Yann Collete618f8e2016-01-28 00:29:58 +0100825
Yann Collet7d360282016-02-12 00:07:30 +0100826 /* build dictionary */
827 if (selectivity>1) { /* selectivity == 1 => fast mode */
828 ZDICT_trainBuffer(dictList, dictListSize,
Yann Collet7682e492016-01-31 23:45:35 +0100829 samplesBuffer, sBuffSize,
830 sampleSizes, nbSamples,
Yann Colletb35c4642016-02-01 00:03:10 +0100831 selectivity, (U32)targetDictSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100832
Yann Colletf5229e02016-01-29 02:45:26 +0100833 /* display best matches */
834 if (g_displayLevel>= 3) {
835 const U32 nb = 25;
836 U32 u;
Yann Collet7d360282016-02-12 00:07:30 +0100837 U32 dictContentSize = ZDICT_dictSize(dictList);
Yann Colletf5229e02016-01-29 02:45:26 +0100838 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
839 DISPLAYLEVEL(3, "list %u best segments \n", nb);
840 for (u=1; u<=nb; u++) {
841 U32 p = dictList[u].pos;
842 U32 l = dictList[u].length;
843 U32 d = MIN(40, l);
844 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
845 u, l, p, dictList[u].savings);
Yann Collet7d360282016-02-12 00:07:30 +0100846 ZDICT_printHex(3, (const char*)samplesBuffer+p, d);
Yann Colletf5229e02016-01-29 02:45:26 +0100847 DISPLAYLEVEL(3, "| \n");
848 } } }
Yann Collete618f8e2016-01-28 00:29:58 +0100849
850 /* create dictionary */
851 {
Yann Collet7d360282016-02-12 00:07:30 +0100852 U32 dictContentSize = ZDICT_dictSize(dictList);
Yann Collet7682e492016-01-31 23:45:35 +0100853 size_t hSize;
Yann Collete618f8e2016-01-28 00:29:58 +0100854 BYTE* ptr;
855 U32 u;
856
Yann Collete618f8e2016-01-28 00:29:58 +0100857 /* build dict content */
Yann Collet7682e492016-01-31 23:45:35 +0100858 ptr = (BYTE*)dictBuffer + maxDictSize;
Yann Collete618f8e2016-01-28 00:29:58 +0100859 for (u=1; u<dictList->pos; u++) {
860 U32 l = dictList[u].length;
861 ptr -= l;
Yann Collet7682e492016-01-31 23:45:35 +0100862 if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC); /* should not happen */
863 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
Yann Collete618f8e2016-01-28 00:29:58 +0100864 }
865
Yann Collet82516192016-01-29 16:48:10 +0100866 /* fast mode dict content */
Yann Collet7682e492016-01-31 23:45:35 +0100867 if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
868 DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
869 DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
Yann Collet3152a8c2016-02-23 21:28:59 +0100870 dictContentSize = (U32)ZDICT_fastSampling(dictBuffer, targetDictSize,
871 samplesBuffer, sBuffSize);
Yann Colletf5229e02016-01-29 02:45:26 +0100872 }
873
Yann Collet7682e492016-01-31 23:45:35 +0100874 /* dictionary header */
875 MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
Yann Collete618f8e2016-01-28 00:29:58 +0100876 hSize = 4;
Yann Collete618f8e2016-01-28 00:29:58 +0100877
878 /* entropic tables */
Yann Collet7682e492016-01-31 23:45:35 +0100879 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
Yann Collete618f8e2016-01-28 00:29:58 +0100880 DISPLAYLEVEL(2, "statistics ... \n");
Yann Collet7d360282016-02-12 00:07:30 +0100881 hSize += ZDICT_analyzeEntropy((char*)dictBuffer+4, maxDictSize-4,
Yann Colletf5229e02016-01-29 02:45:26 +0100882 compressionLevel,
Yann Collet7682e492016-01-31 23:45:35 +0100883 samplesBuffer, sampleSizes, nbSamples,
884 (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100885
Yann Collet7682e492016-01-31 23:45:35 +0100886 if (hSize + dictContentSize < maxDictSize)
887 memmove((char*)dictBuffer + hSize, (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
888 dictSize = MIN(maxDictSize, hSize+dictContentSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100889 }
890
891 /* clean up */
Yann Collete618f8e2016-01-28 00:29:58 +0100892 free(dictList);
Yann Collet7682e492016-01-31 23:45:35 +0100893 return dictSize;
Yann Collete618f8e2016-01-28 00:29:58 +0100894}
895
Yann Collet7682e492016-01-31 23:45:35 +0100896
Yann Collet7d360282016-02-12 00:07:30 +0100897size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
898 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
899 ZDICT_params_t params)
Yann Collet7682e492016-01-31 23:45:35 +0100900{
901 size_t sBuffSize;
902 void* newBuff;
903 size_t result;
904
Yann Collet7d360282016-02-12 00:07:30 +0100905 { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
Yann Collet7682e492016-01-31 23:45:35 +0100906 newBuff = malloc(sBuffSize + NOISELENGTH);
907 if (!newBuff) return ERROR(memory_allocation);
908
909 memcpy(newBuff, samplesBuffer, sBuffSize);
Yann Collet7d360282016-02-12 00:07:30 +0100910 ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
Yann Collet7682e492016-01-31 23:45:35 +0100911
Yann Collet7d360282016-02-12 00:07:30 +0100912 result = ZDICT_trainFromBuffer_unsafe(dictBuffer, dictBufferCapacity,
913 newBuff, samplesSizes, nbSamples,
Yann Collet7682e492016-01-31 23:45:35 +0100914 params);
915 free(newBuff);
916 return result;
917}
918
919
Yann Collet7d360282016-02-12 00:07:30 +0100920/* issue : samplesBuffer need to be followed by a noisy guard band.
921* work around : duplicate the buffer, and add the noise ? */
922size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
923 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
Yann Collet7682e492016-01-31 23:45:35 +0100924{
Yann Collet7d360282016-02-12 00:07:30 +0100925 ZDICT_params_t params;
926 memset(&params, 0, sizeof(params));
927 return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
928 samplesBuffer, samplesSizes, nbSamples,
929 params);
Yann Collet7682e492016-01-31 23:45:35 +0100930}
Yann Collet7d360282016-02-12 00:07:30 +0100931