blob: 50b67d58d78c65750cb50e31a2951662ba82ad16 [file] [log] [blame]
Yann Collete618f8e2016-01-28 00:29:58 +01001/*
Yann Collet7d360282016-02-12 00:07:30 +01002 dictBuilder - dictionary builder for zstd
Yann Collete618f8e2016-01-28 00:29:58 +01003 Copyright (C) Yann Collet 2016
4
Yann Collet7d360282016-02-12 00:07:30 +01005 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
Yann Collete618f8e2016-01-28 00:29:58 +01006
Yann Collet7d360282016-02-12 00:07:30 +01007 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are
9 met:
Yann Collete618f8e2016-01-28 00:29:58 +010010
Yann Collet7d360282016-02-12 00:07:30 +010011 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13 * Redistributions in binary form must reproduce the above
14 copyright notice, this list of conditions and the following disclaimer
15 in the documentation and/or other materials provided with the
16 distribution.
Yann Collete618f8e2016-01-28 00:29:58 +010017
Yann Collet7d360282016-02-12 00:07:30 +010018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Yann Collete618f8e2016-01-28 00:29:58 +010029
30 You can contact the author at :
Yann Collet71eafdd2016-02-12 02:31:57 +010031 - Zstd homepage : https://www.zstd.net
Yann Collete618f8e2016-01-28 00:29:58 +010032*/
33
Yann Collet7d360282016-02-12 00:07:30 +010034/*-**************************************
Yann Collete618f8e2016-01-28 00:29:58 +010035* Compiler Options
36****************************************/
37/* Disable some Visual warning messages */
38#ifdef _MSC_VER
39# define _CRT_SECURE_NO_WARNINGS /* fopen */
40# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
41#endif
42
43/* Unix Large Files support (>4GB) */
44#define _FILE_OFFSET_BITS 64
45#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
46# define _LARGEFILE_SOURCE
47#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
48# define _LARGEFILE64_SOURCE
49#endif
50
Yann Collete618f8e2016-01-28 00:29:58 +010051
Yann Collet863ec402016-01-28 17:56:33 +010052/*-*************************************
Yann Collet7d360282016-02-12 00:07:30 +010053* Dependencies
Yann Collete618f8e2016-01-28 00:29:58 +010054***************************************/
Yann Collet7682e492016-01-31 23:45:35 +010055#include <stdlib.h> /* malloc, free */
56#include <string.h> /* memset */
57#include <stdio.h> /* fprintf, fopen, ftello64 */
58#include <sys/types.h> /* stat64 */
59#include <sys/stat.h> /* stat64 */
60#include <time.h> /* clock */
Yann Collete618f8e2016-01-28 00:29:58 +010061
Yann Collet7682e492016-01-31 23:45:35 +010062#include "mem.h" /* read */
Yann Colletf5229e02016-01-29 02:45:26 +010063#include "error_private.h"
Yann Collete618f8e2016-01-28 00:29:58 +010064#include "divsufsort.h"
Yann Collet7d360282016-02-12 00:07:30 +010065#include "dictBuilder_static.h"
66#include "fse.h"
Yann Collete618f8e2016-01-28 00:29:58 +010067#include "huff0_static.h"
Yann Collet7d360282016-02-12 00:07:30 +010068#include "zstd_internal.h"
Yann Collete618f8e2016-01-28 00:29:58 +010069
70
Yann Collet7682e492016-01-31 23:45:35 +010071/*-*************************************
Yann Collete618f8e2016-01-28 00:29:58 +010072* Compiler specifics
73***************************************/
74#if !defined(S_ISREG)
75# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
76#endif
77
Yann Collete618f8e2016-01-28 00:29:58 +010078
Yann Collet863ec402016-01-28 17:56:33 +010079/*-*************************************
Yann Collete618f8e2016-01-28 00:29:58 +010080* Constants
81***************************************/
82#define KB *(1 <<10)
83#define MB *(1 <<20)
84#define GB *(1U<<30)
85
86#define DICTLISTSIZE 10000
Yann Collete618f8e2016-01-28 00:29:58 +010087
88#define NOISELENGTH 32
89#define PRIME1 2654435761U
90#define PRIME2 2246822519U
91
92#define MINRATIO 4
Yann Colletf5229e02016-01-29 02:45:26 +010093static const U32 g_compressionLevel_default = 5;
Yann Collet7682e492016-01-31 23:45:35 +010094static const U32 g_selectivity_default = 9;
95static const size_t g_provision_entropySize = 200;
96static const size_t g_min_fast_dictContent = 192;
Yann Collete618f8e2016-01-28 00:29:58 +010097
98
Yann Collet863ec402016-01-28 17:56:33 +010099/*-*************************************
100* Console display
Yann Collete618f8e2016-01-28 00:29:58 +0100101***************************************/
102#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
103#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
Yann Colletb31de732016-01-28 10:30:02 +0100104static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
Yann Collet7d360282016-02-12 00:07:30 +0100105void ZDICT_setNotificationLevel(unsigned l) { g_displayLevel=l; }
Yann Collete618f8e2016-01-28 00:29:58 +0100106
107#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
Yann Collet7d360282016-02-12 00:07:30 +0100108 if (ZDICT_GetMilliSpan(g_time) > refreshRate) \
Yann Collete618f8e2016-01-28 00:29:58 +0100109 { g_time = clock(); DISPLAY(__VA_ARGS__); \
110 if (g_displayLevel>=4) fflush(stdout); } }
111static const unsigned refreshRate = 300;
112static clock_t g_time = 0;
113
Yann Collet7d360282016-02-12 00:07:30 +0100114void ZDICT_printHex(U32 dlevel, const void* ptr, size_t length)
Yann Collete618f8e2016-01-28 00:29:58 +0100115{
116 const BYTE* const b = (const BYTE*)ptr;
117 size_t u;
118 for (u=0; u<length; u++)
119 {
120 BYTE c = b[u];
121 if (c<32 || c>126) c = '.'; /* non-printable char */
122 DISPLAYLEVEL(dlevel, "%c", c);
123 }
124}
125
126
Yann Collet7d360282016-02-12 00:07:30 +0100127/*-********************************************************
Yann Collete618f8e2016-01-28 00:29:58 +0100128* Helper functions
129**********************************************************/
Yann Collet7d360282016-02-12 00:07:30 +0100130static unsigned ZDICT_GetMilliSpan(clock_t nPrevious)
Yann Collete618f8e2016-01-28 00:29:58 +0100131{
132 clock_t nCurrent = clock();
133 unsigned nSpan = (unsigned)(((nCurrent - nPrevious) * 1000) / CLOCKS_PER_SEC);
134 return nSpan;
135}
136
Yann Collet7d360282016-02-12 00:07:30 +0100137unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
Yann Collet7682e492016-01-31 23:45:35 +0100138
Yann Collet7d360282016-02-12 00:07:30 +0100139const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
Yann Collete618f8e2016-01-28 00:29:58 +0100140
141
142/*-********************************************************
143* Dictionary training functions
144**********************************************************/
Yann Collet7d360282016-02-12 00:07:30 +0100145static unsigned ZDICT_NbCommonBytes (register size_t val)
Yann Collete618f8e2016-01-28 00:29:58 +0100146{
147 if (MEM_isLittleEndian()) {
148 if (MEM_64bits()) {
149# if defined(_MSC_VER) && defined(_WIN64)
150 unsigned long r = 0;
151 _BitScanForward64( &r, (U64)val );
152 return (unsigned)(r>>3);
153# elif defined(__GNUC__) && (__GNUC__ >= 3)
154 return (__builtin_ctzll((U64)val) >> 3);
155# else
156 static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
157 return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
158# endif
159 } else { /* 32 bits */
160# if defined(_MSC_VER)
161 unsigned long r=0;
162 _BitScanForward( &r, (U32)val );
163 return (unsigned)(r>>3);
164# elif defined(__GNUC__) && (__GNUC__ >= 3)
165 return (__builtin_ctz((U32)val) >> 3);
166# else
167 static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
168 return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
169# endif
170 }
171 } else { /* Big Endian CPU */
172 if (MEM_64bits()) {
173# if defined(_MSC_VER) && defined(_WIN64)
174 unsigned long r = 0;
175 _BitScanReverse64( &r, val );
176 return (unsigned)(r>>3);
177# elif defined(__GNUC__) && (__GNUC__ >= 3)
178 return (__builtin_clzll(val) >> 3);
179# else
180 unsigned r;
181 const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
182 if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
183 if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
184 r += (!val);
185 return r;
186# endif
187 } else { /* 32 bits */
188# if defined(_MSC_VER)
189 unsigned long r = 0;
190 _BitScanReverse( &r, (unsigned long)val );
191 return (unsigned)(r>>3);
192# elif defined(__GNUC__) && (__GNUC__ >= 3)
193 return (__builtin_clz((U32)val) >> 3);
194# else
195 unsigned r;
196 if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
197 r += (!val);
198 return r;
199# endif
200 } }
201}
202
203
Yann Collet7d360282016-02-12 00:07:30 +0100204/*! ZDICT_count() :
Yann Collete618f8e2016-01-28 00:29:58 +0100205 Count the nb of common bytes between 2 pointers.
206 Note : this function presumes end of buffer followed by noisy guard band.
207*/
Yann Collet7d360282016-02-12 00:07:30 +0100208static size_t ZDICT_count(const void* pIn, const void* pMatch)
Yann Collete618f8e2016-01-28 00:29:58 +0100209{
210 const char* const pStart = (const char*)pIn;
211 for (;;) {
Yann Collet7d360282016-02-12 00:07:30 +0100212 size_t diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
Yann Collete618f8e2016-01-28 00:29:58 +0100213 if (!diff) { pIn = (const char*)pIn+sizeof(size_t); pMatch = (const char*)pMatch+sizeof(size_t); continue; }
Yann Collet7d360282016-02-12 00:07:30 +0100214 pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
Yann Collete618f8e2016-01-28 00:29:58 +0100215 return (size_t)((const char*)pIn - pStart);
216 }
217}
218
219
220typedef struct {
221 U32 pos;
222 U32 length;
223 U32 savings;
224} dictItem;
225
Yann Collet7d360282016-02-12 00:07:30 +0100226static void ZDICT_initDictItem(dictItem* d)
Yann Collete618f8e2016-01-28 00:29:58 +0100227{
228 d->pos = 1;
229 d->length = 0;
230 d->savings = (U32)(-1);
231}
232
233
234#define LLIMIT 64 /* heuristic determined experimentally */
235#define MINMATCHLENGTH 7 /* heuristic determined experimentally */
Yann Collet7d360282016-02-12 00:07:30 +0100236static dictItem ZDICT_analyzePos(
Yann Collete618f8e2016-01-28 00:29:58 +0100237 BYTE* doneMarks,
Yann Collet7d360282016-02-12 00:07:30 +0100238 const int* suffix, U32 start,
Yann Collete618f8e2016-01-28 00:29:58 +0100239 const void* buffer, U32 minRatio)
240{
241 U32 lengthList[LLIMIT] = {0};
242 U32 cumulLength[LLIMIT] = {0};
243 U32 savings[LLIMIT] = {0};
244 const BYTE* b = (const BYTE*)buffer;
245 size_t length;
246 size_t maxLength = LLIMIT;
247 size_t pos = suffix[start];
248 U32 end = start;
249 dictItem solution;
250
Yann Collete618f8e2016-01-28 00:29:58 +0100251 /* init */
252 memset(&solution, 0, sizeof(solution));
253 doneMarks[pos] = 1;
254
255 /* trivial repetition cases */
256 if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
257 ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
258 ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
259 /* skip and mark segment */
260 U16 u16 = MEM_read16(b+pos+4);
261 U32 u, e = 6;
262 while (MEM_read16(b+pos+e) == u16) e+=2 ;
263 if (b[pos+e] == b[pos+e-1]) e++;
264 for (u=1; u<e; u++)
265 doneMarks[pos+u] = 1;
266 return solution;
267 }
268
269 /* look forward */
270 do {
271 end++;
Yann Collet7d360282016-02-12 00:07:30 +0100272 length = ZDICT_count(b + pos, b + suffix[end]);
Yann Collete618f8e2016-01-28 00:29:58 +0100273 } while (length >=MINMATCHLENGTH);
274
275 /* look backward */
276 do {
Yann Collet7d360282016-02-12 00:07:30 +0100277 length = ZDICT_count(b + pos, b + *(suffix+start-1));
Yann Collete618f8e2016-01-28 00:29:58 +0100278 if (length >=MINMATCHLENGTH) start--;
279 } while(length >= MINMATCHLENGTH);
280
281 /* exit if not found a minimum nb of repetitions */
282 if (end-start < minRatio) {
283 U32 idx;
284 for(idx=start; idx<end; idx++)
285 doneMarks[suffix[idx]] = 1;
286 return solution;
287 }
288
289 {
290 int i;
291 U32 searchLength;
292 U32 refinedStart = start;
293 U32 refinedEnd = end;
294
295 DISPLAYLEVEL(4, "\n");
296 DISPLAYLEVEL(4, "found %3u matches of length >= %u at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
297 DISPLAYLEVEL(4, "\n");
298
299 for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
300 BYTE currentChar = 0;
301 U32 currentCount = 0;
302 U32 currentID = refinedStart;
303 U32 id;
304 U32 selectedCount = 0;
305 U32 selectedID = currentID;
306 for (id =refinedStart; id < refinedEnd; id++) {
307 if (b[ suffix[id] + searchLength] != currentChar) {
308 if (currentCount > selectedCount) {
309 selectedCount = currentCount;
310 selectedID = currentID;
311 }
312 currentID = id;
313 currentChar = b[ suffix[id] + searchLength];
314 currentCount = 0;
315 }
316 currentCount ++;
317 }
318 if (currentCount > selectedCount) { /* for last */
319 selectedCount = currentCount;
320 selectedID = currentID;
321 }
322
323 if (selectedCount < minRatio)
324 break;
Yann Collete618f8e2016-01-28 00:29:58 +0100325 refinedStart = selectedID;
326 refinedEnd = refinedStart + selectedCount;
327 }
328
329 /* evaluate gain based on new ref */
330 start = refinedStart;
331 pos = suffix[refinedStart];
332 end = start;
333 memset(lengthList, 0, sizeof(lengthList));
334
335 /* look forward */
336 do {
337 end++;
Yann Collet7d360282016-02-12 00:07:30 +0100338 length = ZDICT_count(b + pos, b + suffix[end]);
Yann Collete618f8e2016-01-28 00:29:58 +0100339 if (length >= LLIMIT) length = LLIMIT-1;
340 lengthList[length]++;
341 } while (length >=MINMATCHLENGTH);
342
343 /* look backward */
344 do {
Yann Collet7d360282016-02-12 00:07:30 +0100345 length = ZDICT_count(b + pos, b + suffix[start-1]);
Yann Collete618f8e2016-01-28 00:29:58 +0100346 if (length >= LLIMIT) length = LLIMIT-1;
347 lengthList[length]++;
348 if (length >=MINMATCHLENGTH) start--;
349 } while(length >= MINMATCHLENGTH);
350
351 /* largest useful length */
352 memset(cumulLength, 0, sizeof(cumulLength));
353 cumulLength[maxLength-1] = lengthList[maxLength-1];
Yann Collet9cadd082016-01-28 15:39:52 +0100354 for (i=(int)(maxLength-2); i>=0; i--)
Yann Collete618f8e2016-01-28 00:29:58 +0100355 cumulLength[i] = cumulLength[i+1] + lengthList[i];
356
357 for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
358 maxLength = i;
359
360 /* reduce maxLength in case of final into repetitive data */
361 {
Yann Collet9cadd082016-01-28 15:39:52 +0100362 U32 l = (U32)maxLength;
Yann Collete618f8e2016-01-28 00:29:58 +0100363 BYTE c = b[pos + maxLength-1];
364 while (b[pos+l-2]==c) l--;
365 maxLength = l;
366 }
367 if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */
368
369 /* calculate savings */
370 savings[5] = 0;
371 for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
372 savings[i] = savings[i-1] + (lengthList[i] * (i-3));
373
374 DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
375 (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
376
Yann Collet9cadd082016-01-28 15:39:52 +0100377 solution.pos = (U32)pos;
378 solution.length = (U32)maxLength;
Yann Collete618f8e2016-01-28 00:29:58 +0100379 solution.savings = savings[maxLength];
380
381 /* mark positions done */
382 {
383 U32 id;
384 U32 testedPos;
385 for (id=start; id<end; id++) {
386 U32 p, pEnd;
387 testedPos = suffix[id];
388 if (testedPos == pos)
389 length = solution.length;
390 else {
Yann Collet7d360282016-02-12 00:07:30 +0100391 length = ZDICT_count(b+pos, b+testedPos);
Yann Collete618f8e2016-01-28 00:29:58 +0100392 if (length > solution.length) length = solution.length;
393 }
Yann Collet9cadd082016-01-28 15:39:52 +0100394 pEnd = (U32)(testedPos + length);
Yann Collete618f8e2016-01-28 00:29:58 +0100395 for (p=testedPos; p<pEnd; p++)
396 doneMarks[p] = 1;
397 } } }
398
399 return solution;
400}
401
402
Yann Collet7d360282016-02-12 00:07:30 +0100403/*! ZDICT_checkMerge
Yann Collete618f8e2016-01-28 00:29:58 +0100404 check if dictItem can be merged, do it if possible
405 @return : id of destination elt, 0 if not merged
406*/
Yann Collet7d360282016-02-12 00:07:30 +0100407static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
Yann Collete618f8e2016-01-28 00:29:58 +0100408{
409 const U32 tableSize = table->pos;
410 const U32 max = elt.pos + (elt.length-1);
411
412 /* tail overlap */
413 U32 u; for (u=1; u<tableSize; u++) {
414 if (u==eltNbToSkip) continue;
415 if ((table[u].pos > elt.pos) && (table[u].pos < max)) { /* overlap */
416 /* append */
417 U32 addedLength = table[u].pos - elt.pos;
418 table[u].length += addedLength;
419 table[u].pos = elt.pos;
420 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
421 table[u].savings += elt.length / 8; /* rough approx */
422 elt = table[u];
423 while ((u>1) && (table[u-1].savings < elt.savings))
424 table[u] = table[u-1], u--;
425 table[u] = elt;
426 return u;
427 } }
428
429 /* front overlap */
430 for (u=1; u<tableSize; u++) {
431 if (u==eltNbToSkip) continue;
432 if ((table[u].pos + table[u].length > elt.pos) && (table[u].pos < elt.pos)) { /* overlap */
433 /* append */
434 int addedLength = (elt.pos + elt.length) - (table[u].pos + table[u].length);
435 table[u].savings += elt.length / 8; /* rough approx */
436 if (addedLength > 0) { /* otherwise, already included */
437 table[u].length += addedLength;
438 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
439 }
440 elt = table[u];
441 while ((u>1) && (table[u-1].savings < elt.savings))
442 table[u] = table[u-1], u--;
443 table[u] = elt;
444 return u;
445 } }
446
447 return 0;
448}
449
450
Yann Collet7d360282016-02-12 00:07:30 +0100451static void ZDICT_removeDictItem(dictItem* table, U32 id)
Yann Collete618f8e2016-01-28 00:29:58 +0100452{
453 /* convention : first element is nb of elts */
454 U32 max = table->pos;
455 U32 u;
456 if (!id) return; /* protection, should never happen */
Yann Collet9cadd082016-01-28 15:39:52 +0100457 for (u=id; u<max-1; u++)
Yann Collete618f8e2016-01-28 00:29:58 +0100458 table[u] = table[u+1];
459 table->pos--;
460}
461
462
Yann Collet7d360282016-02-12 00:07:30 +0100463static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
Yann Collete618f8e2016-01-28 00:29:58 +0100464{
465 /* merge if possible */
Yann Collet7d360282016-02-12 00:07:30 +0100466 U32 mergeId = ZDICT_checkMerge(table, elt, 0);
Yann Collete618f8e2016-01-28 00:29:58 +0100467 if (mergeId) {
468 U32 newMerge = 1;
469 while (newMerge) {
Yann Collet7d360282016-02-12 00:07:30 +0100470 newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
471 if (newMerge) ZDICT_removeDictItem(table, mergeId);
Yann Collete618f8e2016-01-28 00:29:58 +0100472 mergeId = newMerge;
473 }
474 return;
475 }
476
477 /* insert */
478 {
479 U32 current;
480 U32 nextElt = table->pos;
481 if (nextElt >= maxSize) nextElt = maxSize-1;
482 current = nextElt-1;
483 while (table[current].savings < elt.savings) {
484 table[current+1] = table[current];
485 current--;
486 }
487 table[current+1] = elt;
488 table->pos = nextElt+1;
489 }
490}
491
492
Yann Collet7d360282016-02-12 00:07:30 +0100493static U32 ZDICT_dictSize(const dictItem* dictList)
Yann Collete618f8e2016-01-28 00:29:58 +0100494{
495 U32 u, dictSize = 0;
496 for (u=1; u<dictList[0].pos; u++)
497 dictSize += dictList[u].length;
498 return dictSize;
499}
500
501
Yann Collet71eafdd2016-02-12 02:31:57 +0100502static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
Yann Collete618f8e2016-01-28 00:29:58 +0100503 const void* const buffer, const size_t bufferSize, /* buffer must end with noisy guard band */
Yann Collet7682e492016-01-31 23:45:35 +0100504 const size_t* fileSizes, unsigned nbFiles,
505 U32 shiftRatio, unsigned maxDictSize)
Yann Collete618f8e2016-01-28 00:29:58 +0100506{
Yann Collet7d360282016-02-12 00:07:30 +0100507 int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
508 int* const suffix = suffix0+1;
Yann Collete618f8e2016-01-28 00:29:58 +0100509 U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
510 BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
511 U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
512 U32 minRatio = nbFiles >> shiftRatio;
Yann Collet7d360282016-02-12 00:07:30 +0100513 int divSuftSortResult;
Yann Collet71eafdd2016-02-12 02:31:57 +0100514 size_t result = 0;
Yann Collete618f8e2016-01-28 00:29:58 +0100515
516 /* init */
517 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
Yann Collet71eafdd2016-02-12 02:31:57 +0100518 if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
519 result = ERROR(memory_allocation);
520 goto _cleanup;
521 }
Yann Collete618f8e2016-01-28 00:29:58 +0100522 if (minRatio < MINRATIO) minRatio = MINRATIO;
523 memset(doneMarks, 0, bufferSize+16);
524
525 /* sort */
Yann Collet7682e492016-01-31 23:45:35 +0100526 DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
Yann Collet7d360282016-02-12 00:07:30 +0100527 divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
Yann Collet71eafdd2016-02-12 02:31:57 +0100528 if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
Yann Collet7d360282016-02-12 00:07:30 +0100529 suffix[bufferSize] = (int)bufferSize; /* leads into noise */
530 suffix0[0] = (int)bufferSize; /* leads into noise */
Yann Collete618f8e2016-01-28 00:29:58 +0100531 {
532 /* build reverse suffix sort */
533 size_t pos;
534 for (pos=0; pos < bufferSize; pos++)
Yann Collet9cadd082016-01-28 15:39:52 +0100535 reverseSuffix[suffix[pos]] = (U32)pos;
Yann Collete618f8e2016-01-28 00:29:58 +0100536 /* build file pos */
537 filePos[0] = 0;
538 for (pos=1; pos<nbFiles; pos++)
Yann Collet9cadd082016-01-28 15:39:52 +0100539 filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
Yann Collete618f8e2016-01-28 00:29:58 +0100540 }
541
542 DISPLAYLEVEL(2, "finding patterns ... \n");
Yann Collet82516192016-01-29 16:48:10 +0100543 DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
Yann Collete618f8e2016-01-28 00:29:58 +0100544
545 {
546 U32 cursor; for (cursor=0; cursor < bufferSize; ) {
547 dictItem solution;
Yann Collete618f8e2016-01-28 00:29:58 +0100548 if (doneMarks[cursor]) { cursor++; continue; }
Yann Collet7d360282016-02-12 00:07:30 +0100549 solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
Yann Collete618f8e2016-01-28 00:29:58 +0100550 if (solution.length==0) { cursor++; continue; }
Yann Collet7d360282016-02-12 00:07:30 +0100551 ZDICT_insertDictItem(dictList, dictListSize, solution);
Yann Collete618f8e2016-01-28 00:29:58 +0100552 cursor += solution.length;
553 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
Yann Collet82516192016-01-29 16:48:10 +0100554 } }
Yann Collete618f8e2016-01-28 00:29:58 +0100555
556 /* limit dictionary size */
557 {
558 U32 max = dictList->pos; /* convention : nb of useful elts within dictList */
559 U32 currentSize = 0;
560 U32 n; for (n=1; n<max; n++) {
561 currentSize += dictList[n].length;
562 if (currentSize > maxDictSize) break;
563 }
564 dictList->pos = n;
565 }
566
Yann Collet71eafdd2016-02-12 02:31:57 +0100567_cleanup:
Yann Collete618f8e2016-01-28 00:29:58 +0100568 free(suffix0);
569 free(reverseSuffix);
570 free(doneMarks);
571 free(filePos);
Yann Collet71eafdd2016-02-12 02:31:57 +0100572 return result;
Yann Collete618f8e2016-01-28 00:29:58 +0100573}
574
575
Yann Collet7d360282016-02-12 00:07:30 +0100576static void ZDICT_fillNoise(void* buffer, size_t length)
Yann Collete618f8e2016-01-28 00:29:58 +0100577{
578 unsigned acc = PRIME1;
579 size_t p=0;;
580
581 for (p=0; p<length; p++) {
582 acc *= PRIME2;
583 ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
584 }
585}
586
587
588typedef struct
589{
590 ZSTD_CCtx* ref;
591 ZSTD_CCtx* zc;
592 void* workPlace; /* must be BLOCKSIZE allocated */
593} EStats_ress_t;
594
595
Yann Collet7d360282016-02-12 00:07:30 +0100596static void ZDICT_countEStats(EStats_ress_t esr,
Yann Collete618f8e2016-01-28 00:29:58 +0100597 U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount,
598 const void* src, size_t srcSize)
599{
600 const BYTE* bytePtr;
601 const U32* u32Ptr;
Yann Collet7d360282016-02-12 00:07:30 +0100602 seqStore_t seqStore;
Yann Collete618f8e2016-01-28 00:29:58 +0100603
604 if (srcSize > BLOCKSIZE) srcSize = BLOCKSIZE; /* protection vs large samples */
605 ZSTD_copyCCtx(esr.zc, esr.ref);
606 ZSTD_compressBlock(esr.zc, esr.workPlace, BLOCKSIZE, src, srcSize);
Yann Collet7d360282016-02-12 00:07:30 +0100607 seqStore = ZSTD_copySeqStore(esr.zc);
Yann Collete618f8e2016-01-28 00:29:58 +0100608
609 /* count stats */
Yann Collet7d360282016-02-12 00:07:30 +0100610 for(bytePtr = seqStore.litStart; bytePtr < seqStore.lit; bytePtr++)
Yann Collete618f8e2016-01-28 00:29:58 +0100611 countLit[*bytePtr]++;
Yann Collet7d360282016-02-12 00:07:30 +0100612 for(u32Ptr = seqStore.offsetStart; u32Ptr < seqStore.offset; u32Ptr++) {
Yann Collete618f8e2016-01-28 00:29:58 +0100613 BYTE offcode = (BYTE)ZSTD_highbit(*u32Ptr) + 1;
614 if (*u32Ptr==0) offcode=0;
615 offsetcodeCount[offcode]++;
616 }
Yann Collet7d360282016-02-12 00:07:30 +0100617 for(bytePtr = seqStore.matchLengthStart; bytePtr < seqStore.matchLength; bytePtr++)
Yann Collete618f8e2016-01-28 00:29:58 +0100618 matchlengthCount[*bytePtr]++;
Yann Collet7d360282016-02-12 00:07:30 +0100619 for(bytePtr = seqStore.litLengthStart; bytePtr < seqStore.litLength; bytePtr++)
Yann Collete618f8e2016-01-28 00:29:58 +0100620 litlengthCount[*bytePtr]++;
621}
622
623
Yann Collet7d360282016-02-12 00:07:30 +0100624#define OFFCODE_MAX 18 /* only applicable to first block */
625static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
Yann Colletf5229e02016-01-29 02:45:26 +0100626 unsigned compressionLevel,
Yann Collet7682e492016-01-31 23:45:35 +0100627 const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
Yann Collete618f8e2016-01-28 00:29:58 +0100628 const void* dictBuffer, size_t dictBufferSize)
629{
630 U32 countLit[256];
631 U32 offcodeCount[MaxOff+1];
632 HUF_CREATE_STATIC_CTABLE(hufTable, 255);
633 short offcodeNCount[MaxOff+1];
634 U32 matchLengthCount[MaxML+1];
635 short matchLengthNCount[MaxML+1];
636 U32 litlengthCount[MaxLL+1];
637 short litlengthNCount[MaxLL+1];
638 EStats_ress_t esr;
639 ZSTD_parameters params;
640 U32 u, huffLog = 12, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
641 size_t pos = 0, errorCode;
642 size_t eSize = 0;
643
644 /* init */
645 for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
646 for (u=0; u<=OFFCODE_MAX; u++) offcodeCount[u]=1;
647 for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
648 for (u=0; u<=MaxLL; u++) litlengthCount[u]=1;
649 esr.ref = ZSTD_createCCtx();
650 esr.zc = ZSTD_createCCtx();
651 esr.workPlace = malloc(BLOCKSIZE);
Yann Collet71eafdd2016-02-12 02:31:57 +0100652 if (!esr.ref || !esr.zc || !esr.workPlace) {
653 eSize = ERROR(memory_allocation);
654 DISPLAYLEVEL(1, "Not enough memory");
655 goto _cleanup;
656 }
Yann Colletf5229e02016-01-29 02:45:26 +0100657 if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
658 params = ZSTD_getParams(compressionLevel, dictBufferSize + 15 KB);
Yann Collete618f8e2016-01-28 00:29:58 +0100659 params.strategy = ZSTD_greedy;
660 ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params);
661
662 /* collect stats on all files */
663 for (u=0; u<nbFiles; u++) {
Yann Collet7d360282016-02-12 00:07:30 +0100664 ZDICT_countEStats(esr,
Yann Collete618f8e2016-01-28 00:29:58 +0100665 countLit, offcodeCount, matchLengthCount, litlengthCount,
666 (const char*)srcBuffer + pos, fileSizes[u]);
667 pos += fileSizes[u];
668 }
669
670 /* analyze */
671 errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
Yann Collet71eafdd2016-02-12 02:31:57 +0100672 if (HUF_isError(errorCode)) {
673 eSize = ERROR(GENERIC);
674 DISPLAYLEVEL(1, "HUF_buildCTable error");
675 goto _cleanup;
676 }
Yann Collete618f8e2016-01-28 00:29:58 +0100677 huffLog = (U32)errorCode;
678
679 total=0; for (u=0; u<=OFFCODE_MAX; u++) total+=offcodeCount[u];
680 errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, OFFCODE_MAX);
Yann Collet71eafdd2016-02-12 02:31:57 +0100681 if (FSE_isError(errorCode)) {
682 eSize = ERROR(GENERIC);
683 DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount");
684 goto _cleanup;
685 }
Yann Collete618f8e2016-01-28 00:29:58 +0100686 Offlog = (U32)errorCode;
687
688 total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
689 errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
Yann Collet71eafdd2016-02-12 02:31:57 +0100690 if (FSE_isError(errorCode)) {
691 eSize = ERROR(GENERIC);
692 DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount");
693 goto _cleanup;
694 }
Yann Collete618f8e2016-01-28 00:29:58 +0100695 mlLog = (U32)errorCode;
696
697 total=0; for (u=0; u<=MaxLL; u++) total+=litlengthCount[u];
698 errorCode = FSE_normalizeCount(litlengthNCount, llLog, litlengthCount, total, MaxLL);
Yann Collet71eafdd2016-02-12 02:31:57 +0100699 if (FSE_isError(errorCode)) {
700 eSize = ERROR(GENERIC);
701 DISPLAYLEVEL(1, "FSE_normalizeCount error with litlengthCount");
702 goto _cleanup;
703 }
Yann Collete618f8e2016-01-28 00:29:58 +0100704 llLog = (U32)errorCode;
705
706 /* write result to buffer */
707 errorCode = HUF_writeCTable(dstBuffer, maxDstSize, hufTable, 255, huffLog);
Yann Collet71eafdd2016-02-12 02:31:57 +0100708 if (HUF_isError(errorCode)) {
709 eSize = ERROR(GENERIC);
710 DISPLAYLEVEL(1, "HUF_writeCTable error");
711 goto _cleanup;
712 }
Yann Collete618f8e2016-01-28 00:29:58 +0100713 dstBuffer = (char*)dstBuffer + errorCode;
714 maxDstSize -= errorCode;
715 eSize += errorCode;
716
717 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
Yann Collet71eafdd2016-02-12 02:31:57 +0100718 if (FSE_isError(errorCode)) {
719 eSize = ERROR(GENERIC);
720 DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount");
721 goto _cleanup;
722 }
Yann Collete618f8e2016-01-28 00:29:58 +0100723 dstBuffer = (char*)dstBuffer + errorCode;
724 maxDstSize -= errorCode;
725 eSize += errorCode;
726
727 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, matchLengthNCount, MaxML, mlLog);
Yann Collet71eafdd2016-02-12 02:31:57 +0100728 if (FSE_isError(errorCode)) {
729 eSize = ERROR(GENERIC);
730 DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount");
731 goto _cleanup;
732 }
Yann Collete618f8e2016-01-28 00:29:58 +0100733 dstBuffer = (char*)dstBuffer + errorCode;
734 maxDstSize -= errorCode;
735 eSize += errorCode;
736
737 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, litlengthNCount, MaxLL, llLog);
Yann Collet71eafdd2016-02-12 02:31:57 +0100738 if (FSE_isError(errorCode)) {
739 eSize = ERROR(GENERIC);
740 DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount");
741 goto _cleanup;
742 }
Yann Collete618f8e2016-01-28 00:29:58 +0100743 dstBuffer = (char*)dstBuffer + errorCode;
744 maxDstSize -= errorCode;
745 eSize += errorCode;
746
Yann Collet71eafdd2016-02-12 02:31:57 +0100747_cleanup:
Yann Collete618f8e2016-01-28 00:29:58 +0100748 ZSTD_freeCCtx(esr.ref);
749 ZSTD_freeCCtx(esr.zc);
750 free(esr.workPlace);
751
752 return eSize;
753}
754
755
Yann Colletf5229e02016-01-29 02:45:26 +0100756#define DIB_FASTSEGMENTSIZE 64
Yann Collet71eafdd2016-02-12 02:31:57 +0100757/*! ZDICT_fastSampling() (based on an idea proposed by Giuseppe Ottaviano) :
758 Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`,
Yann Collet7d360282016-02-12 00:07:30 +0100759 up to `dictSize`.
760 Filling starts from the end of `dictBuffer`, down to maximum possible.
761 if `dictSize` is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
Yann Collet71eafdd2016-02-12 02:31:57 +0100762 @return : amount of data written into `dictBuffer`,
Yann Collet7d360282016-02-12 00:07:30 +0100763 or an error code
Yann Colletf5229e02016-01-29 02:45:26 +0100764*/
Yann Collet7d360282016-02-12 00:07:30 +0100765static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
Yann Colletf5229e02016-01-29 02:45:26 +0100766 const void* samplesBuffer, size_t samplesSize)
767{
768 char* dstPtr = (char*)dictBuffer + dictSize;
769 const char* srcPtr = (const char*)samplesBuffer;
770 size_t nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
771 size_t segNb, interSize;
772
773 if (nbSegments <= 2) return ERROR(srcSize_wrong);
774 if (samplesSize < dictSize) return ERROR(srcSize_wrong);
775
776 /* first and last segments are part of dictionary, in case they contain interesting header/footer */
777 dstPtr -= DIB_FASTSEGMENTSIZE;
778 memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
779 dstPtr -= DIB_FASTSEGMENTSIZE;
780 memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
781
782 /* regularly copy a segment */
783 interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
784 srcPtr += DIB_FASTSEGMENTSIZE;
785 for (segNb=2; segNb < nbSegments; segNb++) {
786 srcPtr += interSize;
787 dstPtr -= DIB_FASTSEGMENTSIZE;
788 memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
789 srcPtr += DIB_FASTSEGMENTSIZE;
790 }
791
792 return nbSegments * DIB_FASTSEGMENTSIZE;
793}
794
795
Yann Collet7d360282016-02-12 00:07:30 +0100796size_t ZDICT_trainFromBuffer_unsafe(
Yann Collet7682e492016-01-31 23:45:35 +0100797 void* dictBuffer, size_t maxDictSize,
798 const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples,
Yann Collet7d360282016-02-12 00:07:30 +0100799 ZDICT_params_t params)
Yann Collete618f8e2016-01-28 00:29:58 +0100800{
Yann Colletb35c4642016-02-01 00:03:10 +0100801 const U32 dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
Yann Collete618f8e2016-01-28 00:29:58 +0100802 dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
Yann Collet7682e492016-01-31 23:45:35 +0100803 unsigned selectivity = params.selectivityLevel;
804 unsigned compressionLevel = params.compressionLevel;
805 size_t targetDictSize = maxDictSize - g_provision_entropySize;
806 size_t sBuffSize;
807 size_t dictSize = 0;
808
809 /* checks */
810 if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) return ERROR(dstSize_tooSmall);
Yann Collete618f8e2016-01-28 00:29:58 +0100811
812 /* init */
Yann Collet7682e492016-01-31 23:45:35 +0100813 { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += sampleSizes[u]; }
Yann Collet7d360282016-02-12 00:07:30 +0100814 if (!dictList) return ERROR(memory_allocation);
815 ZDICT_initDictItem(dictList);
Yann Collet7682e492016-01-31 23:45:35 +0100816 if (selectivity==0) selectivity = g_selectivity_default;
817 if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
Yann Collete618f8e2016-01-28 00:29:58 +0100818
Yann Collet7d360282016-02-12 00:07:30 +0100819 /* build dictionary */
820 if (selectivity>1) { /* selectivity == 1 => fast mode */
821 ZDICT_trainBuffer(dictList, dictListSize,
Yann Collet7682e492016-01-31 23:45:35 +0100822 samplesBuffer, sBuffSize,
823 sampleSizes, nbSamples,
Yann Colletb35c4642016-02-01 00:03:10 +0100824 selectivity, (U32)targetDictSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100825
Yann Colletf5229e02016-01-29 02:45:26 +0100826 /* display best matches */
827 if (g_displayLevel>= 3) {
828 const U32 nb = 25;
829 U32 u;
Yann Collet7d360282016-02-12 00:07:30 +0100830 U32 dictContentSize = ZDICT_dictSize(dictList);
Yann Colletf5229e02016-01-29 02:45:26 +0100831 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
832 DISPLAYLEVEL(3, "list %u best segments \n", nb);
833 for (u=1; u<=nb; u++) {
834 U32 p = dictList[u].pos;
835 U32 l = dictList[u].length;
836 U32 d = MIN(40, l);
837 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
838 u, l, p, dictList[u].savings);
Yann Collet7d360282016-02-12 00:07:30 +0100839 ZDICT_printHex(3, (const char*)samplesBuffer+p, d);
Yann Colletf5229e02016-01-29 02:45:26 +0100840 DISPLAYLEVEL(3, "| \n");
841 } } }
Yann Collete618f8e2016-01-28 00:29:58 +0100842
843 /* create dictionary */
844 {
Yann Collet7d360282016-02-12 00:07:30 +0100845 U32 dictContentSize = ZDICT_dictSize(dictList);
Yann Collet7682e492016-01-31 23:45:35 +0100846 size_t hSize;
Yann Collete618f8e2016-01-28 00:29:58 +0100847 BYTE* ptr;
848 U32 u;
849
Yann Collete618f8e2016-01-28 00:29:58 +0100850 /* build dict content */
Yann Collet7682e492016-01-31 23:45:35 +0100851 ptr = (BYTE*)dictBuffer + maxDictSize;
Yann Collete618f8e2016-01-28 00:29:58 +0100852 for (u=1; u<dictList->pos; u++) {
853 U32 l = dictList[u].length;
854 ptr -= l;
Yann Collet7682e492016-01-31 23:45:35 +0100855 if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC); /* should not happen */
856 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
Yann Collete618f8e2016-01-28 00:29:58 +0100857 }
858
Yann Collet82516192016-01-29 16:48:10 +0100859 /* fast mode dict content */
Yann Collet7682e492016-01-31 23:45:35 +0100860 if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
861 DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
862 DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
Yann Collet7d360282016-02-12 00:07:30 +0100863 dictContentSize = (U32)ZDICT_fastSampling((char*)dictBuffer + g_provision_entropySize,
Yann Collet7682e492016-01-31 23:45:35 +0100864 targetDictSize, samplesBuffer, sBuffSize);
Yann Colletf5229e02016-01-29 02:45:26 +0100865 }
866
Yann Collet7682e492016-01-31 23:45:35 +0100867 /* dictionary header */
868 MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
Yann Collete618f8e2016-01-28 00:29:58 +0100869 hSize = 4;
Yann Collete618f8e2016-01-28 00:29:58 +0100870
871 /* entropic tables */
Yann Collet7682e492016-01-31 23:45:35 +0100872 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
Yann Collete618f8e2016-01-28 00:29:58 +0100873 DISPLAYLEVEL(2, "statistics ... \n");
Yann Collet7d360282016-02-12 00:07:30 +0100874 hSize += ZDICT_analyzeEntropy((char*)dictBuffer+4, maxDictSize-4,
Yann Colletf5229e02016-01-29 02:45:26 +0100875 compressionLevel,
Yann Collet7682e492016-01-31 23:45:35 +0100876 samplesBuffer, sampleSizes, nbSamples,
877 (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100878
Yann Collet7682e492016-01-31 23:45:35 +0100879 if (hSize + dictContentSize < maxDictSize)
880 memmove((char*)dictBuffer + hSize, (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
881 dictSize = MIN(maxDictSize, hSize+dictContentSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100882 }
883
884 /* clean up */
Yann Collete618f8e2016-01-28 00:29:58 +0100885 free(dictList);
Yann Collet7682e492016-01-31 23:45:35 +0100886 return dictSize;
Yann Collete618f8e2016-01-28 00:29:58 +0100887}
888
Yann Collet7682e492016-01-31 23:45:35 +0100889
Yann Collet7d360282016-02-12 00:07:30 +0100890size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
891 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
892 ZDICT_params_t params)
Yann Collet7682e492016-01-31 23:45:35 +0100893{
894 size_t sBuffSize;
895 void* newBuff;
896 size_t result;
897
Yann Collet7d360282016-02-12 00:07:30 +0100898 { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
Yann Collet7682e492016-01-31 23:45:35 +0100899 newBuff = malloc(sBuffSize + NOISELENGTH);
900 if (!newBuff) return ERROR(memory_allocation);
901
902 memcpy(newBuff, samplesBuffer, sBuffSize);
Yann Collet7d360282016-02-12 00:07:30 +0100903 ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
Yann Collet7682e492016-01-31 23:45:35 +0100904
Yann Collet7d360282016-02-12 00:07:30 +0100905 result = ZDICT_trainFromBuffer_unsafe(dictBuffer, dictBufferCapacity,
906 newBuff, samplesSizes, nbSamples,
Yann Collet7682e492016-01-31 23:45:35 +0100907 params);
908 free(newBuff);
909 return result;
910}
911
912
Yann Collet7d360282016-02-12 00:07:30 +0100913/* issue : samplesBuffer need to be followed by a noisy guard band.
914* work around : duplicate the buffer, and add the noise ? */
915size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
916 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
Yann Collet7682e492016-01-31 23:45:35 +0100917{
Yann Collet7d360282016-02-12 00:07:30 +0100918 ZDICT_params_t params;
919 memset(&params, 0, sizeof(params));
920 return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
921 samplesBuffer, samplesSizes, nbSamples,
922 params);
Yann Collet7682e492016-01-31 23:45:35 +0100923}
Yann Collet7d360282016-02-12 00:07:30 +0100924