blob: ede3b3d2ae4966af7f9972d909dc2657fc77e31f [file] [log] [blame]
Yann Collete618f8e2016-01-28 00:29:58 +01001/*
Yann Collet7d360282016-02-12 00:07:30 +01002 dictBuilder - dictionary builder for zstd
Yann Collete618f8e2016-01-28 00:29:58 +01003 Copyright (C) Yann Collet 2016
4
Yann Collet7d360282016-02-12 00:07:30 +01005 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
Yann Collete618f8e2016-01-28 00:29:58 +01006
Yann Collet7d360282016-02-12 00:07:30 +01007 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are
9 met:
Yann Collete618f8e2016-01-28 00:29:58 +010010
Yann Collet7d360282016-02-12 00:07:30 +010011 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13 * Redistributions in binary form must reproduce the above
14 copyright notice, this list of conditions and the following disclaimer
15 in the documentation and/or other materials provided with the
16 distribution.
Yann Collete618f8e2016-01-28 00:29:58 +010017
Yann Collet7d360282016-02-12 00:07:30 +010018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Yann Collete618f8e2016-01-28 00:29:58 +010029
30 You can contact the author at :
Yann Collet7d360282016-02-12 00:07:30 +010031 - Zstd source repository : https://www.zstd.net
Yann Collete618f8e2016-01-28 00:29:58 +010032*/
33
Yann Collet7d360282016-02-12 00:07:30 +010034/*-**************************************
Yann Collete618f8e2016-01-28 00:29:58 +010035* Compiler Options
36****************************************/
37/* Disable some Visual warning messages */
38#ifdef _MSC_VER
39# define _CRT_SECURE_NO_WARNINGS /* fopen */
40# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
41#endif
42
43/* Unix Large Files support (>4GB) */
44#define _FILE_OFFSET_BITS 64
45#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
46# define _LARGEFILE_SOURCE
47#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
48# define _LARGEFILE64_SOURCE
49#endif
50
Yann Collete618f8e2016-01-28 00:29:58 +010051
Yann Collet863ec402016-01-28 17:56:33 +010052/*-*************************************
Yann Collet7d360282016-02-12 00:07:30 +010053* Dependencies
Yann Collete618f8e2016-01-28 00:29:58 +010054***************************************/
Yann Collet7682e492016-01-31 23:45:35 +010055#include <stdlib.h> /* malloc, free */
56#include <string.h> /* memset */
57#include <stdio.h> /* fprintf, fopen, ftello64 */
58#include <sys/types.h> /* stat64 */
59#include <sys/stat.h> /* stat64 */
60#include <time.h> /* clock */
Yann Collete618f8e2016-01-28 00:29:58 +010061
Yann Collet7682e492016-01-31 23:45:35 +010062#include "mem.h" /* read */
Yann Colletf5229e02016-01-29 02:45:26 +010063#include "error_private.h"
Yann Collete618f8e2016-01-28 00:29:58 +010064#include "divsufsort.h"
Yann Collet7d360282016-02-12 00:07:30 +010065#include "dictBuilder_static.h"
66#include "fse.h"
Yann Collete618f8e2016-01-28 00:29:58 +010067#include "huff0_static.h"
Yann Collet7d360282016-02-12 00:07:30 +010068#include "zstd_internal.h"
Yann Collete618f8e2016-01-28 00:29:58 +010069
70
Yann Collet7682e492016-01-31 23:45:35 +010071/*-*************************************
Yann Collete618f8e2016-01-28 00:29:58 +010072* Compiler specifics
73***************************************/
74#if !defined(S_ISREG)
75# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
76#endif
77
Yann Collete618f8e2016-01-28 00:29:58 +010078
Yann Collet863ec402016-01-28 17:56:33 +010079/*-*************************************
Yann Collete618f8e2016-01-28 00:29:58 +010080* Constants
81***************************************/
82#define KB *(1 <<10)
83#define MB *(1 <<20)
84#define GB *(1U<<30)
85
86#define DICTLISTSIZE 10000
87#define MEMMULT 11
Yann Collet35f7de52016-01-31 02:51:03 +010088static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
Yann Collete618f8e2016-01-28 00:29:58 +010089
90#define NOISELENGTH 32
91#define PRIME1 2654435761U
92#define PRIME2 2246822519U
93
94#define MINRATIO 4
Yann Colletf5229e02016-01-29 02:45:26 +010095static const U32 g_compressionLevel_default = 5;
Yann Collet7682e492016-01-31 23:45:35 +010096static const U32 g_selectivity_default = 9;
97static const size_t g_provision_entropySize = 200;
98static const size_t g_min_fast_dictContent = 192;
Yann Collete618f8e2016-01-28 00:29:58 +010099
100
Yann Collet863ec402016-01-28 17:56:33 +0100101/*-*************************************
102* Console display
Yann Collete618f8e2016-01-28 00:29:58 +0100103***************************************/
104#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
105#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
Yann Colletb31de732016-01-28 10:30:02 +0100106static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
Yann Collet7d360282016-02-12 00:07:30 +0100107void ZDICT_setNotificationLevel(unsigned l) { g_displayLevel=l; }
Yann Collete618f8e2016-01-28 00:29:58 +0100108
109#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
Yann Collet7d360282016-02-12 00:07:30 +0100110 if (ZDICT_GetMilliSpan(g_time) > refreshRate) \
Yann Collete618f8e2016-01-28 00:29:58 +0100111 { g_time = clock(); DISPLAY(__VA_ARGS__); \
112 if (g_displayLevel>=4) fflush(stdout); } }
113static const unsigned refreshRate = 300;
114static clock_t g_time = 0;
115
Yann Collet7d360282016-02-12 00:07:30 +0100116void ZDICT_printHex(U32 dlevel, const void* ptr, size_t length)
Yann Collete618f8e2016-01-28 00:29:58 +0100117{
118 const BYTE* const b = (const BYTE*)ptr;
119 size_t u;
120 for (u=0; u<length; u++)
121 {
122 BYTE c = b[u];
123 if (c<32 || c>126) c = '.'; /* non-printable char */
124 DISPLAYLEVEL(dlevel, "%c", c);
125 }
126}
127
128
Yann Collet863ec402016-01-28 17:56:33 +0100129/*-*************************************
Yann Collete618f8e2016-01-28 00:29:58 +0100130* Exceptions
131***************************************/
132#ifndef DEBUG
133# define DEBUG 0
134#endif
135#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
136#define EXM_THROW(error, ...) \
137{ \
138 DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
139 DISPLAYLEVEL(1, "Error %i : ", error); \
140 DISPLAYLEVEL(1, __VA_ARGS__); \
141 DISPLAYLEVEL(1, "\n"); \
142 exit(error); \
143}
144
145
Yann Collet7d360282016-02-12 00:07:30 +0100146/*-********************************************************
Yann Collete618f8e2016-01-28 00:29:58 +0100147* Helper functions
148**********************************************************/
Yann Collet7d360282016-02-12 00:07:30 +0100149static unsigned ZDICT_GetMilliSpan(clock_t nPrevious)
Yann Collete618f8e2016-01-28 00:29:58 +0100150{
151 clock_t nCurrent = clock();
152 unsigned nSpan = (unsigned)(((nCurrent - nPrevious) * 1000) / CLOCKS_PER_SEC);
153 return nSpan;
154}
155
Yann Collet7d360282016-02-12 00:07:30 +0100156unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
Yann Collet7682e492016-01-31 23:45:35 +0100157
Yann Collet7d360282016-02-12 00:07:30 +0100158const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
Yann Collete618f8e2016-01-28 00:29:58 +0100159
160
161/*-********************************************************
162* Dictionary training functions
163**********************************************************/
Yann Collet7d360282016-02-12 00:07:30 +0100164static unsigned ZDICT_NbCommonBytes (register size_t val)
Yann Collete618f8e2016-01-28 00:29:58 +0100165{
166 if (MEM_isLittleEndian()) {
167 if (MEM_64bits()) {
168# if defined(_MSC_VER) && defined(_WIN64)
169 unsigned long r = 0;
170 _BitScanForward64( &r, (U64)val );
171 return (unsigned)(r>>3);
172# elif defined(__GNUC__) && (__GNUC__ >= 3)
173 return (__builtin_ctzll((U64)val) >> 3);
174# else
175 static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
176 return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
177# endif
178 } else { /* 32 bits */
179# if defined(_MSC_VER)
180 unsigned long r=0;
181 _BitScanForward( &r, (U32)val );
182 return (unsigned)(r>>3);
183# elif defined(__GNUC__) && (__GNUC__ >= 3)
184 return (__builtin_ctz((U32)val) >> 3);
185# else
186 static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
187 return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
188# endif
189 }
190 } else { /* Big Endian CPU */
191 if (MEM_64bits()) {
192# if defined(_MSC_VER) && defined(_WIN64)
193 unsigned long r = 0;
194 _BitScanReverse64( &r, val );
195 return (unsigned)(r>>3);
196# elif defined(__GNUC__) && (__GNUC__ >= 3)
197 return (__builtin_clzll(val) >> 3);
198# else
199 unsigned r;
200 const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
201 if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
202 if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
203 r += (!val);
204 return r;
205# endif
206 } else { /* 32 bits */
207# if defined(_MSC_VER)
208 unsigned long r = 0;
209 _BitScanReverse( &r, (unsigned long)val );
210 return (unsigned)(r>>3);
211# elif defined(__GNUC__) && (__GNUC__ >= 3)
212 return (__builtin_clz((U32)val) >> 3);
213# else
214 unsigned r;
215 if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
216 r += (!val);
217 return r;
218# endif
219 } }
220}
221
222
Yann Collet7d360282016-02-12 00:07:30 +0100223/*! ZDICT_count() :
Yann Collete618f8e2016-01-28 00:29:58 +0100224 Count the nb of common bytes between 2 pointers.
225 Note : this function presumes end of buffer followed by noisy guard band.
226*/
Yann Collet7d360282016-02-12 00:07:30 +0100227static size_t ZDICT_count(const void* pIn, const void* pMatch)
Yann Collete618f8e2016-01-28 00:29:58 +0100228{
229 const char* const pStart = (const char*)pIn;
230 for (;;) {
Yann Collet7d360282016-02-12 00:07:30 +0100231 size_t diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
Yann Collete618f8e2016-01-28 00:29:58 +0100232 if (!diff) { pIn = (const char*)pIn+sizeof(size_t); pMatch = (const char*)pMatch+sizeof(size_t); continue; }
Yann Collet7d360282016-02-12 00:07:30 +0100233 pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
Yann Collete618f8e2016-01-28 00:29:58 +0100234 return (size_t)((const char*)pIn - pStart);
235 }
236}
237
238
239typedef struct {
240 U32 pos;
241 U32 length;
242 U32 savings;
243} dictItem;
244
Yann Collet7d360282016-02-12 00:07:30 +0100245static void ZDICT_initDictItem(dictItem* d)
Yann Collete618f8e2016-01-28 00:29:58 +0100246{
247 d->pos = 1;
248 d->length = 0;
249 d->savings = (U32)(-1);
250}
251
252
253#define LLIMIT 64 /* heuristic determined experimentally */
254#define MINMATCHLENGTH 7 /* heuristic determined experimentally */
Yann Collet7d360282016-02-12 00:07:30 +0100255static dictItem ZDICT_analyzePos(
Yann Collete618f8e2016-01-28 00:29:58 +0100256 BYTE* doneMarks,
Yann Collet7d360282016-02-12 00:07:30 +0100257 const int* suffix, U32 start,
Yann Collete618f8e2016-01-28 00:29:58 +0100258 const void* buffer, U32 minRatio)
259{
260 U32 lengthList[LLIMIT] = {0};
261 U32 cumulLength[LLIMIT] = {0};
262 U32 savings[LLIMIT] = {0};
263 const BYTE* b = (const BYTE*)buffer;
264 size_t length;
265 size_t maxLength = LLIMIT;
266 size_t pos = suffix[start];
267 U32 end = start;
268 dictItem solution;
269
Yann Collete618f8e2016-01-28 00:29:58 +0100270 /* init */
271 memset(&solution, 0, sizeof(solution));
272 doneMarks[pos] = 1;
273
274 /* trivial repetition cases */
275 if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
276 ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
277 ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
278 /* skip and mark segment */
279 U16 u16 = MEM_read16(b+pos+4);
280 U32 u, e = 6;
281 while (MEM_read16(b+pos+e) == u16) e+=2 ;
282 if (b[pos+e] == b[pos+e-1]) e++;
283 for (u=1; u<e; u++)
284 doneMarks[pos+u] = 1;
285 return solution;
286 }
287
288 /* look forward */
289 do {
290 end++;
Yann Collet7d360282016-02-12 00:07:30 +0100291 length = ZDICT_count(b + pos, b + suffix[end]);
Yann Collete618f8e2016-01-28 00:29:58 +0100292 } while (length >=MINMATCHLENGTH);
293
294 /* look backward */
295 do {
Yann Collet7d360282016-02-12 00:07:30 +0100296 length = ZDICT_count(b + pos, b + *(suffix+start-1));
Yann Collete618f8e2016-01-28 00:29:58 +0100297 if (length >=MINMATCHLENGTH) start--;
298 } while(length >= MINMATCHLENGTH);
299
300 /* exit if not found a minimum nb of repetitions */
301 if (end-start < minRatio) {
302 U32 idx;
303 for(idx=start; idx<end; idx++)
304 doneMarks[suffix[idx]] = 1;
305 return solution;
306 }
307
308 {
309 int i;
310 U32 searchLength;
311 U32 refinedStart = start;
312 U32 refinedEnd = end;
313
314 DISPLAYLEVEL(4, "\n");
315 DISPLAYLEVEL(4, "found %3u matches of length >= %u at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
316 DISPLAYLEVEL(4, "\n");
317
318 for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
319 BYTE currentChar = 0;
320 U32 currentCount = 0;
321 U32 currentID = refinedStart;
322 U32 id;
323 U32 selectedCount = 0;
324 U32 selectedID = currentID;
325 for (id =refinedStart; id < refinedEnd; id++) {
326 if (b[ suffix[id] + searchLength] != currentChar) {
327 if (currentCount > selectedCount) {
328 selectedCount = currentCount;
329 selectedID = currentID;
330 }
331 currentID = id;
332 currentChar = b[ suffix[id] + searchLength];
333 currentCount = 0;
334 }
335 currentCount ++;
336 }
337 if (currentCount > selectedCount) { /* for last */
338 selectedCount = currentCount;
339 selectedID = currentID;
340 }
341
342 if (selectedCount < minRatio)
343 break;
Yann Collete618f8e2016-01-28 00:29:58 +0100344 refinedStart = selectedID;
345 refinedEnd = refinedStart + selectedCount;
346 }
347
348 /* evaluate gain based on new ref */
349 start = refinedStart;
350 pos = suffix[refinedStart];
351 end = start;
352 memset(lengthList, 0, sizeof(lengthList));
353
354 /* look forward */
355 do {
356 end++;
Yann Collet7d360282016-02-12 00:07:30 +0100357 length = ZDICT_count(b + pos, b + suffix[end]);
Yann Collete618f8e2016-01-28 00:29:58 +0100358 if (length >= LLIMIT) length = LLIMIT-1;
359 lengthList[length]++;
360 } while (length >=MINMATCHLENGTH);
361
362 /* look backward */
363 do {
Yann Collet7d360282016-02-12 00:07:30 +0100364 length = ZDICT_count(b + pos, b + suffix[start-1]);
Yann Collete618f8e2016-01-28 00:29:58 +0100365 if (length >= LLIMIT) length = LLIMIT-1;
366 lengthList[length]++;
367 if (length >=MINMATCHLENGTH) start--;
368 } while(length >= MINMATCHLENGTH);
369
370 /* largest useful length */
371 memset(cumulLength, 0, sizeof(cumulLength));
372 cumulLength[maxLength-1] = lengthList[maxLength-1];
Yann Collet9cadd082016-01-28 15:39:52 +0100373 for (i=(int)(maxLength-2); i>=0; i--)
Yann Collete618f8e2016-01-28 00:29:58 +0100374 cumulLength[i] = cumulLength[i+1] + lengthList[i];
375
376 for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
377 maxLength = i;
378
379 /* reduce maxLength in case of final into repetitive data */
380 {
Yann Collet9cadd082016-01-28 15:39:52 +0100381 U32 l = (U32)maxLength;
Yann Collete618f8e2016-01-28 00:29:58 +0100382 BYTE c = b[pos + maxLength-1];
383 while (b[pos+l-2]==c) l--;
384 maxLength = l;
385 }
386 if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */
387
388 /* calculate savings */
389 savings[5] = 0;
390 for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
391 savings[i] = savings[i-1] + (lengthList[i] * (i-3));
392
393 DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
394 (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
395
Yann Collet9cadd082016-01-28 15:39:52 +0100396 solution.pos = (U32)pos;
397 solution.length = (U32)maxLength;
Yann Collete618f8e2016-01-28 00:29:58 +0100398 solution.savings = savings[maxLength];
399
400 /* mark positions done */
401 {
402 U32 id;
403 U32 testedPos;
404 for (id=start; id<end; id++) {
405 U32 p, pEnd;
406 testedPos = suffix[id];
407 if (testedPos == pos)
408 length = solution.length;
409 else {
Yann Collet7d360282016-02-12 00:07:30 +0100410 length = ZDICT_count(b+pos, b+testedPos);
Yann Collete618f8e2016-01-28 00:29:58 +0100411 if (length > solution.length) length = solution.length;
412 }
Yann Collet9cadd082016-01-28 15:39:52 +0100413 pEnd = (U32)(testedPos + length);
Yann Collete618f8e2016-01-28 00:29:58 +0100414 for (p=testedPos; p<pEnd; p++)
415 doneMarks[p] = 1;
416 } } }
417
418 return solution;
419}
420
421
Yann Collet7d360282016-02-12 00:07:30 +0100422/*! ZDICT_checkMerge
Yann Collete618f8e2016-01-28 00:29:58 +0100423 check if dictItem can be merged, do it if possible
424 @return : id of destination elt, 0 if not merged
425*/
Yann Collet7d360282016-02-12 00:07:30 +0100426static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
Yann Collete618f8e2016-01-28 00:29:58 +0100427{
428 const U32 tableSize = table->pos;
429 const U32 max = elt.pos + (elt.length-1);
430
431 /* tail overlap */
432 U32 u; for (u=1; u<tableSize; u++) {
433 if (u==eltNbToSkip) continue;
434 if ((table[u].pos > elt.pos) && (table[u].pos < max)) { /* overlap */
435 /* append */
436 U32 addedLength = table[u].pos - elt.pos;
437 table[u].length += addedLength;
438 table[u].pos = elt.pos;
439 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
440 table[u].savings += elt.length / 8; /* rough approx */
441 elt = table[u];
442 while ((u>1) && (table[u-1].savings < elt.savings))
443 table[u] = table[u-1], u--;
444 table[u] = elt;
445 return u;
446 } }
447
448 /* front overlap */
449 for (u=1; u<tableSize; u++) {
450 if (u==eltNbToSkip) continue;
451 if ((table[u].pos + table[u].length > elt.pos) && (table[u].pos < elt.pos)) { /* overlap */
452 /* append */
453 int addedLength = (elt.pos + elt.length) - (table[u].pos + table[u].length);
454 table[u].savings += elt.length / 8; /* rough approx */
455 if (addedLength > 0) { /* otherwise, already included */
456 table[u].length += addedLength;
457 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
458 }
459 elt = table[u];
460 while ((u>1) && (table[u-1].savings < elt.savings))
461 table[u] = table[u-1], u--;
462 table[u] = elt;
463 return u;
464 } }
465
466 return 0;
467}
468
469
Yann Collet7d360282016-02-12 00:07:30 +0100470static void ZDICT_removeDictItem(dictItem* table, U32 id)
Yann Collete618f8e2016-01-28 00:29:58 +0100471{
472 /* convention : first element is nb of elts */
473 U32 max = table->pos;
474 U32 u;
475 if (!id) return; /* protection, should never happen */
Yann Collet9cadd082016-01-28 15:39:52 +0100476 for (u=id; u<max-1; u++)
Yann Collete618f8e2016-01-28 00:29:58 +0100477 table[u] = table[u+1];
478 table->pos--;
479}
480
481
Yann Collet7d360282016-02-12 00:07:30 +0100482static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
Yann Collete618f8e2016-01-28 00:29:58 +0100483{
484 /* merge if possible */
Yann Collet7d360282016-02-12 00:07:30 +0100485 U32 mergeId = ZDICT_checkMerge(table, elt, 0);
Yann Collete618f8e2016-01-28 00:29:58 +0100486 if (mergeId) {
487 U32 newMerge = 1;
488 while (newMerge) {
Yann Collet7d360282016-02-12 00:07:30 +0100489 newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
490 if (newMerge) ZDICT_removeDictItem(table, mergeId);
Yann Collete618f8e2016-01-28 00:29:58 +0100491 mergeId = newMerge;
492 }
493 return;
494 }
495
496 /* insert */
497 {
498 U32 current;
499 U32 nextElt = table->pos;
500 if (nextElt >= maxSize) nextElt = maxSize-1;
501 current = nextElt-1;
502 while (table[current].savings < elt.savings) {
503 table[current+1] = table[current];
504 current--;
505 }
506 table[current+1] = elt;
507 table->pos = nextElt+1;
508 }
509}
510
511
Yann Collet7d360282016-02-12 00:07:30 +0100512static U32 ZDICT_dictSize(const dictItem* dictList)
Yann Collete618f8e2016-01-28 00:29:58 +0100513{
514 U32 u, dictSize = 0;
515 for (u=1; u<dictList[0].pos; u++)
516 dictSize += dictList[u].length;
517 return dictSize;
518}
519
520
Yann Collet7d360282016-02-12 00:07:30 +0100521static void ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
Yann Collete618f8e2016-01-28 00:29:58 +0100522 const void* const buffer, const size_t bufferSize, /* buffer must end with noisy guard band */
Yann Collet7682e492016-01-31 23:45:35 +0100523 const size_t* fileSizes, unsigned nbFiles,
524 U32 shiftRatio, unsigned maxDictSize)
Yann Collete618f8e2016-01-28 00:29:58 +0100525{
Yann Collet7d360282016-02-12 00:07:30 +0100526 int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
527 int* const suffix = suffix0+1;
Yann Collete618f8e2016-01-28 00:29:58 +0100528 U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
529 BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
530 U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
531 U32 minRatio = nbFiles >> shiftRatio;
Yann Collet7d360282016-02-12 00:07:30 +0100532 int divSuftSortResult;
Yann Collete618f8e2016-01-28 00:29:58 +0100533
534 /* init */
535 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
536 if (!suffix0 || !reverseSuffix || !doneMarks || !filePos)
Yann Collet7d360282016-02-12 00:07:30 +0100537 EXM_THROW(1, "not enough memory for ZDICT_trainBuffer");
Yann Collete618f8e2016-01-28 00:29:58 +0100538 if (minRatio < MINRATIO) minRatio = MINRATIO;
539 memset(doneMarks, 0, bufferSize+16);
540
541 /* sort */
Yann Collet7682e492016-01-31 23:45:35 +0100542 DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
Yann Collet7d360282016-02-12 00:07:30 +0100543 divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
544 if (divSuftSortResult != 0) EXM_THROW(2, "sort failed");
545 suffix[bufferSize] = (int)bufferSize; /* leads into noise */
546 suffix0[0] = (int)bufferSize; /* leads into noise */
Yann Collete618f8e2016-01-28 00:29:58 +0100547 {
548 /* build reverse suffix sort */
549 size_t pos;
550 for (pos=0; pos < bufferSize; pos++)
Yann Collet9cadd082016-01-28 15:39:52 +0100551 reverseSuffix[suffix[pos]] = (U32)pos;
Yann Collete618f8e2016-01-28 00:29:58 +0100552 /* build file pos */
553 filePos[0] = 0;
554 for (pos=1; pos<nbFiles; pos++)
Yann Collet9cadd082016-01-28 15:39:52 +0100555 filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
Yann Collete618f8e2016-01-28 00:29:58 +0100556 }
557
558 DISPLAYLEVEL(2, "finding patterns ... \n");
Yann Collet82516192016-01-29 16:48:10 +0100559 DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
Yann Collete618f8e2016-01-28 00:29:58 +0100560
561 {
562 U32 cursor; for (cursor=0; cursor < bufferSize; ) {
563 dictItem solution;
Yann Collete618f8e2016-01-28 00:29:58 +0100564 if (doneMarks[cursor]) { cursor++; continue; }
Yann Collet7d360282016-02-12 00:07:30 +0100565 solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
Yann Collete618f8e2016-01-28 00:29:58 +0100566 if (solution.length==0) { cursor++; continue; }
Yann Collet7d360282016-02-12 00:07:30 +0100567 ZDICT_insertDictItem(dictList, dictListSize, solution);
Yann Collete618f8e2016-01-28 00:29:58 +0100568 cursor += solution.length;
569 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
Yann Collet82516192016-01-29 16:48:10 +0100570 } }
Yann Collete618f8e2016-01-28 00:29:58 +0100571
572 /* limit dictionary size */
573 {
574 U32 max = dictList->pos; /* convention : nb of useful elts within dictList */
575 U32 currentSize = 0;
576 U32 n; for (n=1; n<max; n++) {
577 currentSize += dictList[n].length;
578 if (currentSize > maxDictSize) break;
579 }
580 dictList->pos = n;
581 }
582
583 free(suffix0);
584 free(reverseSuffix);
585 free(doneMarks);
586 free(filePos);
587}
588
589
Yann Collet7d360282016-02-12 00:07:30 +0100590static void ZDICT_fillNoise(void* buffer, size_t length)
Yann Collete618f8e2016-01-28 00:29:58 +0100591{
592 unsigned acc = PRIME1;
593 size_t p=0;;
594
595 for (p=0; p<length; p++) {
596 acc *= PRIME2;
597 ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
598 }
599}
600
601
602typedef struct
603{
604 ZSTD_CCtx* ref;
605 ZSTD_CCtx* zc;
606 void* workPlace; /* must be BLOCKSIZE allocated */
607} EStats_ress_t;
608
609
Yann Collet7d360282016-02-12 00:07:30 +0100610static void ZDICT_countEStats(EStats_ress_t esr,
Yann Collete618f8e2016-01-28 00:29:58 +0100611 U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount,
612 const void* src, size_t srcSize)
613{
614 const BYTE* bytePtr;
615 const U32* u32Ptr;
Yann Collet7d360282016-02-12 00:07:30 +0100616 seqStore_t seqStore;
Yann Collete618f8e2016-01-28 00:29:58 +0100617
618 if (srcSize > BLOCKSIZE) srcSize = BLOCKSIZE; /* protection vs large samples */
619 ZSTD_copyCCtx(esr.zc, esr.ref);
620 ZSTD_compressBlock(esr.zc, esr.workPlace, BLOCKSIZE, src, srcSize);
Yann Collet7d360282016-02-12 00:07:30 +0100621 seqStore = ZSTD_copySeqStore(esr.zc);
Yann Collete618f8e2016-01-28 00:29:58 +0100622
623 /* count stats */
Yann Collet7d360282016-02-12 00:07:30 +0100624 for(bytePtr = seqStore.litStart; bytePtr < seqStore.lit; bytePtr++)
Yann Collete618f8e2016-01-28 00:29:58 +0100625 countLit[*bytePtr]++;
Yann Collet7d360282016-02-12 00:07:30 +0100626 for(u32Ptr = seqStore.offsetStart; u32Ptr < seqStore.offset; u32Ptr++) {
Yann Collete618f8e2016-01-28 00:29:58 +0100627 BYTE offcode = (BYTE)ZSTD_highbit(*u32Ptr) + 1;
628 if (*u32Ptr==0) offcode=0;
629 offsetcodeCount[offcode]++;
630 }
Yann Collet7d360282016-02-12 00:07:30 +0100631 for(bytePtr = seqStore.matchLengthStart; bytePtr < seqStore.matchLength; bytePtr++)
Yann Collete618f8e2016-01-28 00:29:58 +0100632 matchlengthCount[*bytePtr]++;
Yann Collet7d360282016-02-12 00:07:30 +0100633 for(bytePtr = seqStore.litLengthStart; bytePtr < seqStore.litLength; bytePtr++)
Yann Collete618f8e2016-01-28 00:29:58 +0100634 litlengthCount[*bytePtr]++;
635}
636
637
Yann Collet7d360282016-02-12 00:07:30 +0100638#define OFFCODE_MAX 18 /* only applicable to first block */
639static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
Yann Colletf5229e02016-01-29 02:45:26 +0100640 unsigned compressionLevel,
Yann Collet7682e492016-01-31 23:45:35 +0100641 const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
Yann Collete618f8e2016-01-28 00:29:58 +0100642 const void* dictBuffer, size_t dictBufferSize)
643{
644 U32 countLit[256];
645 U32 offcodeCount[MaxOff+1];
646 HUF_CREATE_STATIC_CTABLE(hufTable, 255);
647 short offcodeNCount[MaxOff+1];
648 U32 matchLengthCount[MaxML+1];
649 short matchLengthNCount[MaxML+1];
650 U32 litlengthCount[MaxLL+1];
651 short litlengthNCount[MaxLL+1];
652 EStats_ress_t esr;
653 ZSTD_parameters params;
654 U32 u, huffLog = 12, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
655 size_t pos = 0, errorCode;
656 size_t eSize = 0;
657
658 /* init */
659 for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
660 for (u=0; u<=OFFCODE_MAX; u++) offcodeCount[u]=1;
661 for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
662 for (u=0; u<=MaxLL; u++) litlengthCount[u]=1;
663 esr.ref = ZSTD_createCCtx();
664 esr.zc = ZSTD_createCCtx();
665 esr.workPlace = malloc(BLOCKSIZE);
666 if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory");
Yann Colletf5229e02016-01-29 02:45:26 +0100667 if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
668 params = ZSTD_getParams(compressionLevel, dictBufferSize + 15 KB);
Yann Collete618f8e2016-01-28 00:29:58 +0100669 params.strategy = ZSTD_greedy;
670 ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params);
671
672 /* collect stats on all files */
673 for (u=0; u<nbFiles; u++) {
Yann Collet7d360282016-02-12 00:07:30 +0100674 ZDICT_countEStats(esr,
Yann Collete618f8e2016-01-28 00:29:58 +0100675 countLit, offcodeCount, matchLengthCount, litlengthCount,
676 (const char*)srcBuffer + pos, fileSizes[u]);
677 pos += fileSizes[u];
678 }
679
680 /* analyze */
681 errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
682 if (HUF_isError(errorCode)) EXM_THROW(31, "HUF_buildCTable error");
683 huffLog = (U32)errorCode;
684
685 total=0; for (u=0; u<=OFFCODE_MAX; u++) total+=offcodeCount[u];
686 errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, OFFCODE_MAX);
687 if (FSE_isError(errorCode)) EXM_THROW(32, "FSE_normalizeCount error with offcodeCount");
688 Offlog = (U32)errorCode;
689
690 total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
691 errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
692 if (FSE_isError(errorCode)) EXM_THROW(33, "FSE_normalizeCount error with matchLengthCount");
693 mlLog = (U32)errorCode;
694
695 total=0; for (u=0; u<=MaxLL; u++) total+=litlengthCount[u];
696 errorCode = FSE_normalizeCount(litlengthNCount, llLog, litlengthCount, total, MaxLL);
697 if (FSE_isError(errorCode)) EXM_THROW(34, "FSE_normalizeCount error with litlengthCount");
698 llLog = (U32)errorCode;
699
700 /* write result to buffer */
701 errorCode = HUF_writeCTable(dstBuffer, maxDstSize, hufTable, 255, huffLog);
702 if (HUF_isError(errorCode)) EXM_THROW(41, "HUF_writeCTable error");
703 dstBuffer = (char*)dstBuffer + errorCode;
704 maxDstSize -= errorCode;
705 eSize += errorCode;
706
707 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
708 if (FSE_isError(errorCode)) EXM_THROW(42, "FSE_writeNCount error with offcodeNCount");
709 dstBuffer = (char*)dstBuffer + errorCode;
710 maxDstSize -= errorCode;
711 eSize += errorCode;
712
713 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, matchLengthNCount, MaxML, mlLog);
714 if (FSE_isError(errorCode)) EXM_THROW(43, "FSE_writeNCount error with matchLengthNCount");
715 dstBuffer = (char*)dstBuffer + errorCode;
716 maxDstSize -= errorCode;
717 eSize += errorCode;
718
719 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, litlengthNCount, MaxLL, llLog);
720 if (FSE_isError(errorCode)) EXM_THROW(43, "FSE_writeNCount error with litlengthNCount");
721 dstBuffer = (char*)dstBuffer + errorCode;
722 maxDstSize -= errorCode;
723 eSize += errorCode;
724
725 /* clean */
726 ZSTD_freeCCtx(esr.ref);
727 ZSTD_freeCCtx(esr.zc);
728 free(esr.workPlace);
729
730 return eSize;
731}
732
733
Yann Colletf5229e02016-01-29 02:45:26 +0100734#define DIB_FASTSEGMENTSIZE 64
Yann Collet7d360282016-02-12 00:07:30 +0100735/*! ZDICT_fastSampling (based on an idea by Giuseppe Ottaviano)
736 Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`
737 up to `dictSize`.
738 Filling starts from the end of `dictBuffer`, down to maximum possible.
739 if `dictSize` is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
740 @return : amount of data written into `dictBuffer`
741 or an error code
Yann Colletf5229e02016-01-29 02:45:26 +0100742*/
Yann Collet7d360282016-02-12 00:07:30 +0100743static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
Yann Colletf5229e02016-01-29 02:45:26 +0100744 const void* samplesBuffer, size_t samplesSize)
745{
746 char* dstPtr = (char*)dictBuffer + dictSize;
747 const char* srcPtr = (const char*)samplesBuffer;
748 size_t nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
749 size_t segNb, interSize;
750
751 if (nbSegments <= 2) return ERROR(srcSize_wrong);
752 if (samplesSize < dictSize) return ERROR(srcSize_wrong);
753
754 /* first and last segments are part of dictionary, in case they contain interesting header/footer */
755 dstPtr -= DIB_FASTSEGMENTSIZE;
756 memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
757 dstPtr -= DIB_FASTSEGMENTSIZE;
758 memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
759
760 /* regularly copy a segment */
761 interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
762 srcPtr += DIB_FASTSEGMENTSIZE;
763 for (segNb=2; segNb < nbSegments; segNb++) {
764 srcPtr += interSize;
765 dstPtr -= DIB_FASTSEGMENTSIZE;
766 memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
767 srcPtr += DIB_FASTSEGMENTSIZE;
768 }
769
770 return nbSegments * DIB_FASTSEGMENTSIZE;
771}
772
773
Yann Collet7d360282016-02-12 00:07:30 +0100774size_t ZDICT_trainFromBuffer_unsafe(
Yann Collet7682e492016-01-31 23:45:35 +0100775 void* dictBuffer, size_t maxDictSize,
776 const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples,
Yann Collet7d360282016-02-12 00:07:30 +0100777 ZDICT_params_t params)
Yann Collete618f8e2016-01-28 00:29:58 +0100778{
Yann Colletb35c4642016-02-01 00:03:10 +0100779 const U32 dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
Yann Collete618f8e2016-01-28 00:29:58 +0100780 dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
Yann Collet7682e492016-01-31 23:45:35 +0100781 unsigned selectivity = params.selectivityLevel;
782 unsigned compressionLevel = params.compressionLevel;
783 size_t targetDictSize = maxDictSize - g_provision_entropySize;
784 size_t sBuffSize;
785 size_t dictSize = 0;
786
787 /* checks */
788 if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) return ERROR(dstSize_tooSmall);
Yann Collete618f8e2016-01-28 00:29:58 +0100789
790 /* init */
Yann Collet7682e492016-01-31 23:45:35 +0100791 { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += sampleSizes[u]; }
Yann Collet7d360282016-02-12 00:07:30 +0100792 if (!dictList) return ERROR(memory_allocation);
793 ZDICT_initDictItem(dictList);
Yann Collet7682e492016-01-31 23:45:35 +0100794 if (selectivity==0) selectivity = g_selectivity_default;
795 if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
Yann Collete618f8e2016-01-28 00:29:58 +0100796
Yann Collet7d360282016-02-12 00:07:30 +0100797 /* build dictionary */
798 if (selectivity>1) { /* selectivity == 1 => fast mode */
799 ZDICT_trainBuffer(dictList, dictListSize,
Yann Collet7682e492016-01-31 23:45:35 +0100800 samplesBuffer, sBuffSize,
801 sampleSizes, nbSamples,
Yann Colletb35c4642016-02-01 00:03:10 +0100802 selectivity, (U32)targetDictSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100803
Yann Colletf5229e02016-01-29 02:45:26 +0100804 /* display best matches */
805 if (g_displayLevel>= 3) {
806 const U32 nb = 25;
807 U32 u;
Yann Collet7d360282016-02-12 00:07:30 +0100808 U32 dictContentSize = ZDICT_dictSize(dictList);
Yann Colletf5229e02016-01-29 02:45:26 +0100809 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
810 DISPLAYLEVEL(3, "list %u best segments \n", nb);
811 for (u=1; u<=nb; u++) {
812 U32 p = dictList[u].pos;
813 U32 l = dictList[u].length;
814 U32 d = MIN(40, l);
815 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
816 u, l, p, dictList[u].savings);
Yann Collet7d360282016-02-12 00:07:30 +0100817 ZDICT_printHex(3, (const char*)samplesBuffer+p, d);
Yann Colletf5229e02016-01-29 02:45:26 +0100818 DISPLAYLEVEL(3, "| \n");
819 } } }
Yann Collete618f8e2016-01-28 00:29:58 +0100820
821 /* create dictionary */
822 {
Yann Collet7d360282016-02-12 00:07:30 +0100823 U32 dictContentSize = ZDICT_dictSize(dictList);
Yann Collet7682e492016-01-31 23:45:35 +0100824 size_t hSize;
Yann Collete618f8e2016-01-28 00:29:58 +0100825 BYTE* ptr;
826 U32 u;
827
Yann Collete618f8e2016-01-28 00:29:58 +0100828 /* build dict content */
Yann Collet7682e492016-01-31 23:45:35 +0100829 ptr = (BYTE*)dictBuffer + maxDictSize;
Yann Collete618f8e2016-01-28 00:29:58 +0100830 for (u=1; u<dictList->pos; u++) {
831 U32 l = dictList[u].length;
832 ptr -= l;
Yann Collet7682e492016-01-31 23:45:35 +0100833 if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC); /* should not happen */
834 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
Yann Collete618f8e2016-01-28 00:29:58 +0100835 }
836
Yann Collet82516192016-01-29 16:48:10 +0100837 /* fast mode dict content */
Yann Collet7682e492016-01-31 23:45:35 +0100838 if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
839 DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
840 DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
Yann Collet7d360282016-02-12 00:07:30 +0100841 dictContentSize = (U32)ZDICT_fastSampling((char*)dictBuffer + g_provision_entropySize,
Yann Collet7682e492016-01-31 23:45:35 +0100842 targetDictSize, samplesBuffer, sBuffSize);
Yann Colletf5229e02016-01-29 02:45:26 +0100843 }
844
Yann Collet7682e492016-01-31 23:45:35 +0100845 /* dictionary header */
846 MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
Yann Collete618f8e2016-01-28 00:29:58 +0100847 hSize = 4;
Yann Collete618f8e2016-01-28 00:29:58 +0100848
849 /* entropic tables */
Yann Collet7682e492016-01-31 23:45:35 +0100850 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
Yann Collete618f8e2016-01-28 00:29:58 +0100851 DISPLAYLEVEL(2, "statistics ... \n");
Yann Collet7d360282016-02-12 00:07:30 +0100852 hSize += ZDICT_analyzeEntropy((char*)dictBuffer+4, maxDictSize-4,
Yann Colletf5229e02016-01-29 02:45:26 +0100853 compressionLevel,
Yann Collet7682e492016-01-31 23:45:35 +0100854 samplesBuffer, sampleSizes, nbSamples,
855 (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100856
Yann Collet7682e492016-01-31 23:45:35 +0100857 if (hSize + dictContentSize < maxDictSize)
858 memmove((char*)dictBuffer + hSize, (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
859 dictSize = MIN(maxDictSize, hSize+dictContentSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100860 }
861
862 /* clean up */
Yann Collete618f8e2016-01-28 00:29:58 +0100863 free(dictList);
Yann Collet7682e492016-01-31 23:45:35 +0100864 return dictSize;
Yann Collete618f8e2016-01-28 00:29:58 +0100865}
866
Yann Collet7682e492016-01-31 23:45:35 +0100867
Yann Collet7d360282016-02-12 00:07:30 +0100868size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
869 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
870 ZDICT_params_t params)
Yann Collet7682e492016-01-31 23:45:35 +0100871{
872 size_t sBuffSize;
873 void* newBuff;
874 size_t result;
875
Yann Collet7d360282016-02-12 00:07:30 +0100876 { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
Yann Collet7682e492016-01-31 23:45:35 +0100877 newBuff = malloc(sBuffSize + NOISELENGTH);
878 if (!newBuff) return ERROR(memory_allocation);
879
880 memcpy(newBuff, samplesBuffer, sBuffSize);
Yann Collet7d360282016-02-12 00:07:30 +0100881 ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
Yann Collet7682e492016-01-31 23:45:35 +0100882
Yann Collet7d360282016-02-12 00:07:30 +0100883 result = ZDICT_trainFromBuffer_unsafe(dictBuffer, dictBufferCapacity,
884 newBuff, samplesSizes, nbSamples,
Yann Collet7682e492016-01-31 23:45:35 +0100885 params);
886 free(newBuff);
887 return result;
888}
889
890
Yann Collet7d360282016-02-12 00:07:30 +0100891/* issue : samplesBuffer need to be followed by a noisy guard band.
892* work around : duplicate the buffer, and add the noise ? */
893size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
894 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
Yann Collet7682e492016-01-31 23:45:35 +0100895{
Yann Collet7d360282016-02-12 00:07:30 +0100896 ZDICT_params_t params;
897 memset(&params, 0, sizeof(params));
898 return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
899 samplesBuffer, samplesSizes, nbSamples,
900 params);
Yann Collet7682e492016-01-31 23:45:35 +0100901}
Yann Collet7d360282016-02-12 00:07:30 +0100902