blob: fe40bea464906bdfa9711fb7bc8cdeefe86f55f3 [file] [log] [blame]
Yann Collete618f8e2016-01-28 00:29:58 +01001/*
Yann Collet7d360282016-02-12 00:07:30 +01002 dictBuilder - dictionary builder for zstd
Yann Collete618f8e2016-01-28 00:29:58 +01003 Copyright (C) Yann Collet 2016
4
Yann Collet7d360282016-02-12 00:07:30 +01005 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
Yann Collete618f8e2016-01-28 00:29:58 +01006
Yann Collet7d360282016-02-12 00:07:30 +01007 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are
9 met:
Yann Collete618f8e2016-01-28 00:29:58 +010010
Yann Collet7d360282016-02-12 00:07:30 +010011 * Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13 * Redistributions in binary form must reproduce the above
14 copyright notice, this list of conditions and the following disclaimer
15 in the documentation and/or other materials provided with the
16 distribution.
Yann Collete618f8e2016-01-28 00:29:58 +010017
Yann Collet7d360282016-02-12 00:07:30 +010018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Yann Collete618f8e2016-01-28 00:29:58 +010029
30 You can contact the author at :
Yann Collet7d360282016-02-12 00:07:30 +010031 - Zstd source repository : https://www.zstd.net
Yann Collete618f8e2016-01-28 00:29:58 +010032*/
33
Yann Collet7d360282016-02-12 00:07:30 +010034/*-**************************************
Yann Collete618f8e2016-01-28 00:29:58 +010035* Compiler Options
36****************************************/
37/* Disable some Visual warning messages */
38#ifdef _MSC_VER
39# define _CRT_SECURE_NO_WARNINGS /* fopen */
40# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
41#endif
42
43/* Unix Large Files support (>4GB) */
44#define _FILE_OFFSET_BITS 64
45#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
46# define _LARGEFILE_SOURCE
47#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
48# define _LARGEFILE64_SOURCE
49#endif
50
Yann Collete618f8e2016-01-28 00:29:58 +010051
Yann Collet863ec402016-01-28 17:56:33 +010052/*-*************************************
Yann Collet7d360282016-02-12 00:07:30 +010053* Dependencies
Yann Collete618f8e2016-01-28 00:29:58 +010054***************************************/
Yann Collet7682e492016-01-31 23:45:35 +010055#include <stdlib.h> /* malloc, free */
56#include <string.h> /* memset */
57#include <stdio.h> /* fprintf, fopen, ftello64 */
58#include <sys/types.h> /* stat64 */
59#include <sys/stat.h> /* stat64 */
60#include <time.h> /* clock */
Yann Collete618f8e2016-01-28 00:29:58 +010061
Yann Collet7682e492016-01-31 23:45:35 +010062#include "mem.h" /* read */
Yann Colletf5229e02016-01-29 02:45:26 +010063#include "error_private.h"
Yann Collete618f8e2016-01-28 00:29:58 +010064#include "divsufsort.h"
Yann Collet7d360282016-02-12 00:07:30 +010065#include "dictBuilder_static.h"
66#include "fse.h"
Yann Collete618f8e2016-01-28 00:29:58 +010067#include "huff0_static.h"
Yann Collet7d360282016-02-12 00:07:30 +010068#include "zstd_internal.h"
Yann Collete618f8e2016-01-28 00:29:58 +010069
70
Yann Collet7682e492016-01-31 23:45:35 +010071/*-*************************************
Yann Collete618f8e2016-01-28 00:29:58 +010072* Compiler specifics
73***************************************/
74#if !defined(S_ISREG)
75# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
76#endif
77
Yann Collete618f8e2016-01-28 00:29:58 +010078
Yann Collet863ec402016-01-28 17:56:33 +010079/*-*************************************
Yann Collete618f8e2016-01-28 00:29:58 +010080* Constants
81***************************************/
82#define KB *(1 <<10)
83#define MB *(1 <<20)
84#define GB *(1U<<30)
85
86#define DICTLISTSIZE 10000
Yann Collete618f8e2016-01-28 00:29:58 +010087
88#define NOISELENGTH 32
89#define PRIME1 2654435761U
90#define PRIME2 2246822519U
91
92#define MINRATIO 4
Yann Colletf5229e02016-01-29 02:45:26 +010093static const U32 g_compressionLevel_default = 5;
Yann Collet7682e492016-01-31 23:45:35 +010094static const U32 g_selectivity_default = 9;
95static const size_t g_provision_entropySize = 200;
96static const size_t g_min_fast_dictContent = 192;
Yann Collete618f8e2016-01-28 00:29:58 +010097
98
Yann Collet863ec402016-01-28 17:56:33 +010099/*-*************************************
100* Console display
Yann Collete618f8e2016-01-28 00:29:58 +0100101***************************************/
102#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
103#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
Yann Colletb31de732016-01-28 10:30:02 +0100104static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
Yann Collet7d360282016-02-12 00:07:30 +0100105void ZDICT_setNotificationLevel(unsigned l) { g_displayLevel=l; }
Yann Collete618f8e2016-01-28 00:29:58 +0100106
107#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
Yann Collet7d360282016-02-12 00:07:30 +0100108 if (ZDICT_GetMilliSpan(g_time) > refreshRate) \
Yann Collete618f8e2016-01-28 00:29:58 +0100109 { g_time = clock(); DISPLAY(__VA_ARGS__); \
110 if (g_displayLevel>=4) fflush(stdout); } }
111static const unsigned refreshRate = 300;
112static clock_t g_time = 0;
113
Yann Collet7d360282016-02-12 00:07:30 +0100114void ZDICT_printHex(U32 dlevel, const void* ptr, size_t length)
Yann Collete618f8e2016-01-28 00:29:58 +0100115{
116 const BYTE* const b = (const BYTE*)ptr;
117 size_t u;
118 for (u=0; u<length; u++)
119 {
120 BYTE c = b[u];
121 if (c<32 || c>126) c = '.'; /* non-printable char */
122 DISPLAYLEVEL(dlevel, "%c", c);
123 }
124}
125
126
Yann Collet863ec402016-01-28 17:56:33 +0100127/*-*************************************
Yann Collete618f8e2016-01-28 00:29:58 +0100128* Exceptions
129***************************************/
130#ifndef DEBUG
131# define DEBUG 0
132#endif
133#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
134#define EXM_THROW(error, ...) \
135{ \
136 DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
137 DISPLAYLEVEL(1, "Error %i : ", error); \
138 DISPLAYLEVEL(1, __VA_ARGS__); \
139 DISPLAYLEVEL(1, "\n"); \
140 exit(error); \
141}
142
143
Yann Collet7d360282016-02-12 00:07:30 +0100144/*-********************************************************
Yann Collete618f8e2016-01-28 00:29:58 +0100145* Helper functions
146**********************************************************/
Yann Collet7d360282016-02-12 00:07:30 +0100147static unsigned ZDICT_GetMilliSpan(clock_t nPrevious)
Yann Collete618f8e2016-01-28 00:29:58 +0100148{
149 clock_t nCurrent = clock();
150 unsigned nSpan = (unsigned)(((nCurrent - nPrevious) * 1000) / CLOCKS_PER_SEC);
151 return nSpan;
152}
153
Yann Collet7d360282016-02-12 00:07:30 +0100154unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
Yann Collet7682e492016-01-31 23:45:35 +0100155
Yann Collet7d360282016-02-12 00:07:30 +0100156const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
Yann Collete618f8e2016-01-28 00:29:58 +0100157
158
159/*-********************************************************
160* Dictionary training functions
161**********************************************************/
Yann Collet7d360282016-02-12 00:07:30 +0100162static unsigned ZDICT_NbCommonBytes (register size_t val)
Yann Collete618f8e2016-01-28 00:29:58 +0100163{
164 if (MEM_isLittleEndian()) {
165 if (MEM_64bits()) {
166# if defined(_MSC_VER) && defined(_WIN64)
167 unsigned long r = 0;
168 _BitScanForward64( &r, (U64)val );
169 return (unsigned)(r>>3);
170# elif defined(__GNUC__) && (__GNUC__ >= 3)
171 return (__builtin_ctzll((U64)val) >> 3);
172# else
173 static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
174 return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
175# endif
176 } else { /* 32 bits */
177# if defined(_MSC_VER)
178 unsigned long r=0;
179 _BitScanForward( &r, (U32)val );
180 return (unsigned)(r>>3);
181# elif defined(__GNUC__) && (__GNUC__ >= 3)
182 return (__builtin_ctz((U32)val) >> 3);
183# else
184 static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
185 return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
186# endif
187 }
188 } else { /* Big Endian CPU */
189 if (MEM_64bits()) {
190# if defined(_MSC_VER) && defined(_WIN64)
191 unsigned long r = 0;
192 _BitScanReverse64( &r, val );
193 return (unsigned)(r>>3);
194# elif defined(__GNUC__) && (__GNUC__ >= 3)
195 return (__builtin_clzll(val) >> 3);
196# else
197 unsigned r;
198 const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
199 if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
200 if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
201 r += (!val);
202 return r;
203# endif
204 } else { /* 32 bits */
205# if defined(_MSC_VER)
206 unsigned long r = 0;
207 _BitScanReverse( &r, (unsigned long)val );
208 return (unsigned)(r>>3);
209# elif defined(__GNUC__) && (__GNUC__ >= 3)
210 return (__builtin_clz((U32)val) >> 3);
211# else
212 unsigned r;
213 if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
214 r += (!val);
215 return r;
216# endif
217 } }
218}
219
220
Yann Collet7d360282016-02-12 00:07:30 +0100221/*! ZDICT_count() :
Yann Collete618f8e2016-01-28 00:29:58 +0100222 Count the nb of common bytes between 2 pointers.
223 Note : this function presumes end of buffer followed by noisy guard band.
224*/
Yann Collet7d360282016-02-12 00:07:30 +0100225static size_t ZDICT_count(const void* pIn, const void* pMatch)
Yann Collete618f8e2016-01-28 00:29:58 +0100226{
227 const char* const pStart = (const char*)pIn;
228 for (;;) {
Yann Collet7d360282016-02-12 00:07:30 +0100229 size_t diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
Yann Collete618f8e2016-01-28 00:29:58 +0100230 if (!diff) { pIn = (const char*)pIn+sizeof(size_t); pMatch = (const char*)pMatch+sizeof(size_t); continue; }
Yann Collet7d360282016-02-12 00:07:30 +0100231 pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
Yann Collete618f8e2016-01-28 00:29:58 +0100232 return (size_t)((const char*)pIn - pStart);
233 }
234}
235
236
237typedef struct {
238 U32 pos;
239 U32 length;
240 U32 savings;
241} dictItem;
242
Yann Collet7d360282016-02-12 00:07:30 +0100243static void ZDICT_initDictItem(dictItem* d)
Yann Collete618f8e2016-01-28 00:29:58 +0100244{
245 d->pos = 1;
246 d->length = 0;
247 d->savings = (U32)(-1);
248}
249
250
251#define LLIMIT 64 /* heuristic determined experimentally */
252#define MINMATCHLENGTH 7 /* heuristic determined experimentally */
Yann Collet7d360282016-02-12 00:07:30 +0100253static dictItem ZDICT_analyzePos(
Yann Collete618f8e2016-01-28 00:29:58 +0100254 BYTE* doneMarks,
Yann Collet7d360282016-02-12 00:07:30 +0100255 const int* suffix, U32 start,
Yann Collete618f8e2016-01-28 00:29:58 +0100256 const void* buffer, U32 minRatio)
257{
258 U32 lengthList[LLIMIT] = {0};
259 U32 cumulLength[LLIMIT] = {0};
260 U32 savings[LLIMIT] = {0};
261 const BYTE* b = (const BYTE*)buffer;
262 size_t length;
263 size_t maxLength = LLIMIT;
264 size_t pos = suffix[start];
265 U32 end = start;
266 dictItem solution;
267
Yann Collete618f8e2016-01-28 00:29:58 +0100268 /* init */
269 memset(&solution, 0, sizeof(solution));
270 doneMarks[pos] = 1;
271
272 /* trivial repetition cases */
273 if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
274 ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
275 ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
276 /* skip and mark segment */
277 U16 u16 = MEM_read16(b+pos+4);
278 U32 u, e = 6;
279 while (MEM_read16(b+pos+e) == u16) e+=2 ;
280 if (b[pos+e] == b[pos+e-1]) e++;
281 for (u=1; u<e; u++)
282 doneMarks[pos+u] = 1;
283 return solution;
284 }
285
286 /* look forward */
287 do {
288 end++;
Yann Collet7d360282016-02-12 00:07:30 +0100289 length = ZDICT_count(b + pos, b + suffix[end]);
Yann Collete618f8e2016-01-28 00:29:58 +0100290 } while (length >=MINMATCHLENGTH);
291
292 /* look backward */
293 do {
Yann Collet7d360282016-02-12 00:07:30 +0100294 length = ZDICT_count(b + pos, b + *(suffix+start-1));
Yann Collete618f8e2016-01-28 00:29:58 +0100295 if (length >=MINMATCHLENGTH) start--;
296 } while(length >= MINMATCHLENGTH);
297
298 /* exit if not found a minimum nb of repetitions */
299 if (end-start < minRatio) {
300 U32 idx;
301 for(idx=start; idx<end; idx++)
302 doneMarks[suffix[idx]] = 1;
303 return solution;
304 }
305
306 {
307 int i;
308 U32 searchLength;
309 U32 refinedStart = start;
310 U32 refinedEnd = end;
311
312 DISPLAYLEVEL(4, "\n");
313 DISPLAYLEVEL(4, "found %3u matches of length >= %u at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
314 DISPLAYLEVEL(4, "\n");
315
316 for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
317 BYTE currentChar = 0;
318 U32 currentCount = 0;
319 U32 currentID = refinedStart;
320 U32 id;
321 U32 selectedCount = 0;
322 U32 selectedID = currentID;
323 for (id =refinedStart; id < refinedEnd; id++) {
324 if (b[ suffix[id] + searchLength] != currentChar) {
325 if (currentCount > selectedCount) {
326 selectedCount = currentCount;
327 selectedID = currentID;
328 }
329 currentID = id;
330 currentChar = b[ suffix[id] + searchLength];
331 currentCount = 0;
332 }
333 currentCount ++;
334 }
335 if (currentCount > selectedCount) { /* for last */
336 selectedCount = currentCount;
337 selectedID = currentID;
338 }
339
340 if (selectedCount < minRatio)
341 break;
Yann Collete618f8e2016-01-28 00:29:58 +0100342 refinedStart = selectedID;
343 refinedEnd = refinedStart + selectedCount;
344 }
345
346 /* evaluate gain based on new ref */
347 start = refinedStart;
348 pos = suffix[refinedStart];
349 end = start;
350 memset(lengthList, 0, sizeof(lengthList));
351
352 /* look forward */
353 do {
354 end++;
Yann Collet7d360282016-02-12 00:07:30 +0100355 length = ZDICT_count(b + pos, b + suffix[end]);
Yann Collete618f8e2016-01-28 00:29:58 +0100356 if (length >= LLIMIT) length = LLIMIT-1;
357 lengthList[length]++;
358 } while (length >=MINMATCHLENGTH);
359
360 /* look backward */
361 do {
Yann Collet7d360282016-02-12 00:07:30 +0100362 length = ZDICT_count(b + pos, b + suffix[start-1]);
Yann Collete618f8e2016-01-28 00:29:58 +0100363 if (length >= LLIMIT) length = LLIMIT-1;
364 lengthList[length]++;
365 if (length >=MINMATCHLENGTH) start--;
366 } while(length >= MINMATCHLENGTH);
367
368 /* largest useful length */
369 memset(cumulLength, 0, sizeof(cumulLength));
370 cumulLength[maxLength-1] = lengthList[maxLength-1];
Yann Collet9cadd082016-01-28 15:39:52 +0100371 for (i=(int)(maxLength-2); i>=0; i--)
Yann Collete618f8e2016-01-28 00:29:58 +0100372 cumulLength[i] = cumulLength[i+1] + lengthList[i];
373
374 for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
375 maxLength = i;
376
377 /* reduce maxLength in case of final into repetitive data */
378 {
Yann Collet9cadd082016-01-28 15:39:52 +0100379 U32 l = (U32)maxLength;
Yann Collete618f8e2016-01-28 00:29:58 +0100380 BYTE c = b[pos + maxLength-1];
381 while (b[pos+l-2]==c) l--;
382 maxLength = l;
383 }
384 if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */
385
386 /* calculate savings */
387 savings[5] = 0;
388 for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
389 savings[i] = savings[i-1] + (lengthList[i] * (i-3));
390
391 DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
392 (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
393
Yann Collet9cadd082016-01-28 15:39:52 +0100394 solution.pos = (U32)pos;
395 solution.length = (U32)maxLength;
Yann Collete618f8e2016-01-28 00:29:58 +0100396 solution.savings = savings[maxLength];
397
398 /* mark positions done */
399 {
400 U32 id;
401 U32 testedPos;
402 for (id=start; id<end; id++) {
403 U32 p, pEnd;
404 testedPos = suffix[id];
405 if (testedPos == pos)
406 length = solution.length;
407 else {
Yann Collet7d360282016-02-12 00:07:30 +0100408 length = ZDICT_count(b+pos, b+testedPos);
Yann Collete618f8e2016-01-28 00:29:58 +0100409 if (length > solution.length) length = solution.length;
410 }
Yann Collet9cadd082016-01-28 15:39:52 +0100411 pEnd = (U32)(testedPos + length);
Yann Collete618f8e2016-01-28 00:29:58 +0100412 for (p=testedPos; p<pEnd; p++)
413 doneMarks[p] = 1;
414 } } }
415
416 return solution;
417}
418
419
Yann Collet7d360282016-02-12 00:07:30 +0100420/*! ZDICT_checkMerge
Yann Collete618f8e2016-01-28 00:29:58 +0100421 check if dictItem can be merged, do it if possible
422 @return : id of destination elt, 0 if not merged
423*/
Yann Collet7d360282016-02-12 00:07:30 +0100424static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
Yann Collete618f8e2016-01-28 00:29:58 +0100425{
426 const U32 tableSize = table->pos;
427 const U32 max = elt.pos + (elt.length-1);
428
429 /* tail overlap */
430 U32 u; for (u=1; u<tableSize; u++) {
431 if (u==eltNbToSkip) continue;
432 if ((table[u].pos > elt.pos) && (table[u].pos < max)) { /* overlap */
433 /* append */
434 U32 addedLength = table[u].pos - elt.pos;
435 table[u].length += addedLength;
436 table[u].pos = elt.pos;
437 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
438 table[u].savings += elt.length / 8; /* rough approx */
439 elt = table[u];
440 while ((u>1) && (table[u-1].savings < elt.savings))
441 table[u] = table[u-1], u--;
442 table[u] = elt;
443 return u;
444 } }
445
446 /* front overlap */
447 for (u=1; u<tableSize; u++) {
448 if (u==eltNbToSkip) continue;
449 if ((table[u].pos + table[u].length > elt.pos) && (table[u].pos < elt.pos)) { /* overlap */
450 /* append */
451 int addedLength = (elt.pos + elt.length) - (table[u].pos + table[u].length);
452 table[u].savings += elt.length / 8; /* rough approx */
453 if (addedLength > 0) { /* otherwise, already included */
454 table[u].length += addedLength;
455 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
456 }
457 elt = table[u];
458 while ((u>1) && (table[u-1].savings < elt.savings))
459 table[u] = table[u-1], u--;
460 table[u] = elt;
461 return u;
462 } }
463
464 return 0;
465}
466
467
Yann Collet7d360282016-02-12 00:07:30 +0100468static void ZDICT_removeDictItem(dictItem* table, U32 id)
Yann Collete618f8e2016-01-28 00:29:58 +0100469{
470 /* convention : first element is nb of elts */
471 U32 max = table->pos;
472 U32 u;
473 if (!id) return; /* protection, should never happen */
Yann Collet9cadd082016-01-28 15:39:52 +0100474 for (u=id; u<max-1; u++)
Yann Collete618f8e2016-01-28 00:29:58 +0100475 table[u] = table[u+1];
476 table->pos--;
477}
478
479
Yann Collet7d360282016-02-12 00:07:30 +0100480static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
Yann Collete618f8e2016-01-28 00:29:58 +0100481{
482 /* merge if possible */
Yann Collet7d360282016-02-12 00:07:30 +0100483 U32 mergeId = ZDICT_checkMerge(table, elt, 0);
Yann Collete618f8e2016-01-28 00:29:58 +0100484 if (mergeId) {
485 U32 newMerge = 1;
486 while (newMerge) {
Yann Collet7d360282016-02-12 00:07:30 +0100487 newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
488 if (newMerge) ZDICT_removeDictItem(table, mergeId);
Yann Collete618f8e2016-01-28 00:29:58 +0100489 mergeId = newMerge;
490 }
491 return;
492 }
493
494 /* insert */
495 {
496 U32 current;
497 U32 nextElt = table->pos;
498 if (nextElt >= maxSize) nextElt = maxSize-1;
499 current = nextElt-1;
500 while (table[current].savings < elt.savings) {
501 table[current+1] = table[current];
502 current--;
503 }
504 table[current+1] = elt;
505 table->pos = nextElt+1;
506 }
507}
508
509
Yann Collet7d360282016-02-12 00:07:30 +0100510static U32 ZDICT_dictSize(const dictItem* dictList)
Yann Collete618f8e2016-01-28 00:29:58 +0100511{
512 U32 u, dictSize = 0;
513 for (u=1; u<dictList[0].pos; u++)
514 dictSize += dictList[u].length;
515 return dictSize;
516}
517
518
Yann Collet7d360282016-02-12 00:07:30 +0100519static void ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
Yann Collete618f8e2016-01-28 00:29:58 +0100520 const void* const buffer, const size_t bufferSize, /* buffer must end with noisy guard band */
Yann Collet7682e492016-01-31 23:45:35 +0100521 const size_t* fileSizes, unsigned nbFiles,
522 U32 shiftRatio, unsigned maxDictSize)
Yann Collete618f8e2016-01-28 00:29:58 +0100523{
Yann Collet7d360282016-02-12 00:07:30 +0100524 int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
525 int* const suffix = suffix0+1;
Yann Collete618f8e2016-01-28 00:29:58 +0100526 U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
527 BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
528 U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
529 U32 minRatio = nbFiles >> shiftRatio;
Yann Collet7d360282016-02-12 00:07:30 +0100530 int divSuftSortResult;
Yann Collete618f8e2016-01-28 00:29:58 +0100531
532 /* init */
533 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
534 if (!suffix0 || !reverseSuffix || !doneMarks || !filePos)
Yann Collet7d360282016-02-12 00:07:30 +0100535 EXM_THROW(1, "not enough memory for ZDICT_trainBuffer");
Yann Collete618f8e2016-01-28 00:29:58 +0100536 if (minRatio < MINRATIO) minRatio = MINRATIO;
537 memset(doneMarks, 0, bufferSize+16);
538
539 /* sort */
Yann Collet7682e492016-01-31 23:45:35 +0100540 DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
Yann Collet7d360282016-02-12 00:07:30 +0100541 divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
542 if (divSuftSortResult != 0) EXM_THROW(2, "sort failed");
543 suffix[bufferSize] = (int)bufferSize; /* leads into noise */
544 suffix0[0] = (int)bufferSize; /* leads into noise */
Yann Collete618f8e2016-01-28 00:29:58 +0100545 {
546 /* build reverse suffix sort */
547 size_t pos;
548 for (pos=0; pos < bufferSize; pos++)
Yann Collet9cadd082016-01-28 15:39:52 +0100549 reverseSuffix[suffix[pos]] = (U32)pos;
Yann Collete618f8e2016-01-28 00:29:58 +0100550 /* build file pos */
551 filePos[0] = 0;
552 for (pos=1; pos<nbFiles; pos++)
Yann Collet9cadd082016-01-28 15:39:52 +0100553 filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
Yann Collete618f8e2016-01-28 00:29:58 +0100554 }
555
556 DISPLAYLEVEL(2, "finding patterns ... \n");
Yann Collet82516192016-01-29 16:48:10 +0100557 DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
Yann Collete618f8e2016-01-28 00:29:58 +0100558
559 {
560 U32 cursor; for (cursor=0; cursor < bufferSize; ) {
561 dictItem solution;
Yann Collete618f8e2016-01-28 00:29:58 +0100562 if (doneMarks[cursor]) { cursor++; continue; }
Yann Collet7d360282016-02-12 00:07:30 +0100563 solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
Yann Collete618f8e2016-01-28 00:29:58 +0100564 if (solution.length==0) { cursor++; continue; }
Yann Collet7d360282016-02-12 00:07:30 +0100565 ZDICT_insertDictItem(dictList, dictListSize, solution);
Yann Collete618f8e2016-01-28 00:29:58 +0100566 cursor += solution.length;
567 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
Yann Collet82516192016-01-29 16:48:10 +0100568 } }
Yann Collete618f8e2016-01-28 00:29:58 +0100569
570 /* limit dictionary size */
571 {
572 U32 max = dictList->pos; /* convention : nb of useful elts within dictList */
573 U32 currentSize = 0;
574 U32 n; for (n=1; n<max; n++) {
575 currentSize += dictList[n].length;
576 if (currentSize > maxDictSize) break;
577 }
578 dictList->pos = n;
579 }
580
581 free(suffix0);
582 free(reverseSuffix);
583 free(doneMarks);
584 free(filePos);
585}
586
587
Yann Collet7d360282016-02-12 00:07:30 +0100588static void ZDICT_fillNoise(void* buffer, size_t length)
Yann Collete618f8e2016-01-28 00:29:58 +0100589{
590 unsigned acc = PRIME1;
591 size_t p=0;;
592
593 for (p=0; p<length; p++) {
594 acc *= PRIME2;
595 ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
596 }
597}
598
599
600typedef struct
601{
602 ZSTD_CCtx* ref;
603 ZSTD_CCtx* zc;
604 void* workPlace; /* must be BLOCKSIZE allocated */
605} EStats_ress_t;
606
607
Yann Collet7d360282016-02-12 00:07:30 +0100608static void ZDICT_countEStats(EStats_ress_t esr,
Yann Collete618f8e2016-01-28 00:29:58 +0100609 U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount,
610 const void* src, size_t srcSize)
611{
612 const BYTE* bytePtr;
613 const U32* u32Ptr;
Yann Collet7d360282016-02-12 00:07:30 +0100614 seqStore_t seqStore;
Yann Collete618f8e2016-01-28 00:29:58 +0100615
616 if (srcSize > BLOCKSIZE) srcSize = BLOCKSIZE; /* protection vs large samples */
617 ZSTD_copyCCtx(esr.zc, esr.ref);
618 ZSTD_compressBlock(esr.zc, esr.workPlace, BLOCKSIZE, src, srcSize);
Yann Collet7d360282016-02-12 00:07:30 +0100619 seqStore = ZSTD_copySeqStore(esr.zc);
Yann Collete618f8e2016-01-28 00:29:58 +0100620
621 /* count stats */
Yann Collet7d360282016-02-12 00:07:30 +0100622 for(bytePtr = seqStore.litStart; bytePtr < seqStore.lit; bytePtr++)
Yann Collete618f8e2016-01-28 00:29:58 +0100623 countLit[*bytePtr]++;
Yann Collet7d360282016-02-12 00:07:30 +0100624 for(u32Ptr = seqStore.offsetStart; u32Ptr < seqStore.offset; u32Ptr++) {
Yann Collete618f8e2016-01-28 00:29:58 +0100625 BYTE offcode = (BYTE)ZSTD_highbit(*u32Ptr) + 1;
626 if (*u32Ptr==0) offcode=0;
627 offsetcodeCount[offcode]++;
628 }
Yann Collet7d360282016-02-12 00:07:30 +0100629 for(bytePtr = seqStore.matchLengthStart; bytePtr < seqStore.matchLength; bytePtr++)
Yann Collete618f8e2016-01-28 00:29:58 +0100630 matchlengthCount[*bytePtr]++;
Yann Collet7d360282016-02-12 00:07:30 +0100631 for(bytePtr = seqStore.litLengthStart; bytePtr < seqStore.litLength; bytePtr++)
Yann Collete618f8e2016-01-28 00:29:58 +0100632 litlengthCount[*bytePtr]++;
633}
634
635
Yann Collet7d360282016-02-12 00:07:30 +0100636#define OFFCODE_MAX 18 /* only applicable to first block */
637static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
Yann Colletf5229e02016-01-29 02:45:26 +0100638 unsigned compressionLevel,
Yann Collet7682e492016-01-31 23:45:35 +0100639 const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
Yann Collete618f8e2016-01-28 00:29:58 +0100640 const void* dictBuffer, size_t dictBufferSize)
641{
642 U32 countLit[256];
643 U32 offcodeCount[MaxOff+1];
644 HUF_CREATE_STATIC_CTABLE(hufTable, 255);
645 short offcodeNCount[MaxOff+1];
646 U32 matchLengthCount[MaxML+1];
647 short matchLengthNCount[MaxML+1];
648 U32 litlengthCount[MaxLL+1];
649 short litlengthNCount[MaxLL+1];
650 EStats_ress_t esr;
651 ZSTD_parameters params;
652 U32 u, huffLog = 12, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
653 size_t pos = 0, errorCode;
654 size_t eSize = 0;
655
656 /* init */
657 for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
658 for (u=0; u<=OFFCODE_MAX; u++) offcodeCount[u]=1;
659 for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
660 for (u=0; u<=MaxLL; u++) litlengthCount[u]=1;
661 esr.ref = ZSTD_createCCtx();
662 esr.zc = ZSTD_createCCtx();
663 esr.workPlace = malloc(BLOCKSIZE);
664 if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory");
Yann Colletf5229e02016-01-29 02:45:26 +0100665 if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
666 params = ZSTD_getParams(compressionLevel, dictBufferSize + 15 KB);
Yann Collete618f8e2016-01-28 00:29:58 +0100667 params.strategy = ZSTD_greedy;
668 ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params);
669
670 /* collect stats on all files */
671 for (u=0; u<nbFiles; u++) {
Yann Collet7d360282016-02-12 00:07:30 +0100672 ZDICT_countEStats(esr,
Yann Collete618f8e2016-01-28 00:29:58 +0100673 countLit, offcodeCount, matchLengthCount, litlengthCount,
674 (const char*)srcBuffer + pos, fileSizes[u]);
675 pos += fileSizes[u];
676 }
677
678 /* analyze */
679 errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
680 if (HUF_isError(errorCode)) EXM_THROW(31, "HUF_buildCTable error");
681 huffLog = (U32)errorCode;
682
683 total=0; for (u=0; u<=OFFCODE_MAX; u++) total+=offcodeCount[u];
684 errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, OFFCODE_MAX);
685 if (FSE_isError(errorCode)) EXM_THROW(32, "FSE_normalizeCount error with offcodeCount");
686 Offlog = (U32)errorCode;
687
688 total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
689 errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
690 if (FSE_isError(errorCode)) EXM_THROW(33, "FSE_normalizeCount error with matchLengthCount");
691 mlLog = (U32)errorCode;
692
693 total=0; for (u=0; u<=MaxLL; u++) total+=litlengthCount[u];
694 errorCode = FSE_normalizeCount(litlengthNCount, llLog, litlengthCount, total, MaxLL);
695 if (FSE_isError(errorCode)) EXM_THROW(34, "FSE_normalizeCount error with litlengthCount");
696 llLog = (U32)errorCode;
697
698 /* write result to buffer */
699 errorCode = HUF_writeCTable(dstBuffer, maxDstSize, hufTable, 255, huffLog);
700 if (HUF_isError(errorCode)) EXM_THROW(41, "HUF_writeCTable error");
701 dstBuffer = (char*)dstBuffer + errorCode;
702 maxDstSize -= errorCode;
703 eSize += errorCode;
704
705 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
706 if (FSE_isError(errorCode)) EXM_THROW(42, "FSE_writeNCount error with offcodeNCount");
707 dstBuffer = (char*)dstBuffer + errorCode;
708 maxDstSize -= errorCode;
709 eSize += errorCode;
710
711 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, matchLengthNCount, MaxML, mlLog);
712 if (FSE_isError(errorCode)) EXM_THROW(43, "FSE_writeNCount error with matchLengthNCount");
713 dstBuffer = (char*)dstBuffer + errorCode;
714 maxDstSize -= errorCode;
715 eSize += errorCode;
716
717 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, litlengthNCount, MaxLL, llLog);
718 if (FSE_isError(errorCode)) EXM_THROW(43, "FSE_writeNCount error with litlengthNCount");
719 dstBuffer = (char*)dstBuffer + errorCode;
720 maxDstSize -= errorCode;
721 eSize += errorCode;
722
723 /* clean */
724 ZSTD_freeCCtx(esr.ref);
725 ZSTD_freeCCtx(esr.zc);
726 free(esr.workPlace);
727
728 return eSize;
729}
730
731
Yann Colletf5229e02016-01-29 02:45:26 +0100732#define DIB_FASTSEGMENTSIZE 64
Yann Collet7d360282016-02-12 00:07:30 +0100733/*! ZDICT_fastSampling (based on an idea by Giuseppe Ottaviano)
734 Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`
735 up to `dictSize`.
736 Filling starts from the end of `dictBuffer`, down to maximum possible.
737 if `dictSize` is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
738 @return : amount of data written into `dictBuffer`
739 or an error code
Yann Colletf5229e02016-01-29 02:45:26 +0100740*/
Yann Collet7d360282016-02-12 00:07:30 +0100741static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
Yann Colletf5229e02016-01-29 02:45:26 +0100742 const void* samplesBuffer, size_t samplesSize)
743{
744 char* dstPtr = (char*)dictBuffer + dictSize;
745 const char* srcPtr = (const char*)samplesBuffer;
746 size_t nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
747 size_t segNb, interSize;
748
749 if (nbSegments <= 2) return ERROR(srcSize_wrong);
750 if (samplesSize < dictSize) return ERROR(srcSize_wrong);
751
752 /* first and last segments are part of dictionary, in case they contain interesting header/footer */
753 dstPtr -= DIB_FASTSEGMENTSIZE;
754 memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
755 dstPtr -= DIB_FASTSEGMENTSIZE;
756 memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
757
758 /* regularly copy a segment */
759 interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
760 srcPtr += DIB_FASTSEGMENTSIZE;
761 for (segNb=2; segNb < nbSegments; segNb++) {
762 srcPtr += interSize;
763 dstPtr -= DIB_FASTSEGMENTSIZE;
764 memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
765 srcPtr += DIB_FASTSEGMENTSIZE;
766 }
767
768 return nbSegments * DIB_FASTSEGMENTSIZE;
769}
770
771
Yann Collet7d360282016-02-12 00:07:30 +0100772size_t ZDICT_trainFromBuffer_unsafe(
Yann Collet7682e492016-01-31 23:45:35 +0100773 void* dictBuffer, size_t maxDictSize,
774 const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples,
Yann Collet7d360282016-02-12 00:07:30 +0100775 ZDICT_params_t params)
Yann Collete618f8e2016-01-28 00:29:58 +0100776{
Yann Colletb35c4642016-02-01 00:03:10 +0100777 const U32 dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
Yann Collete618f8e2016-01-28 00:29:58 +0100778 dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
Yann Collet7682e492016-01-31 23:45:35 +0100779 unsigned selectivity = params.selectivityLevel;
780 unsigned compressionLevel = params.compressionLevel;
781 size_t targetDictSize = maxDictSize - g_provision_entropySize;
782 size_t sBuffSize;
783 size_t dictSize = 0;
784
785 /* checks */
786 if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) return ERROR(dstSize_tooSmall);
Yann Collete618f8e2016-01-28 00:29:58 +0100787
788 /* init */
Yann Collet7682e492016-01-31 23:45:35 +0100789 { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += sampleSizes[u]; }
Yann Collet7d360282016-02-12 00:07:30 +0100790 if (!dictList) return ERROR(memory_allocation);
791 ZDICT_initDictItem(dictList);
Yann Collet7682e492016-01-31 23:45:35 +0100792 if (selectivity==0) selectivity = g_selectivity_default;
793 if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
Yann Collete618f8e2016-01-28 00:29:58 +0100794
Yann Collet7d360282016-02-12 00:07:30 +0100795 /* build dictionary */
796 if (selectivity>1) { /* selectivity == 1 => fast mode */
797 ZDICT_trainBuffer(dictList, dictListSize,
Yann Collet7682e492016-01-31 23:45:35 +0100798 samplesBuffer, sBuffSize,
799 sampleSizes, nbSamples,
Yann Colletb35c4642016-02-01 00:03:10 +0100800 selectivity, (U32)targetDictSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100801
Yann Colletf5229e02016-01-29 02:45:26 +0100802 /* display best matches */
803 if (g_displayLevel>= 3) {
804 const U32 nb = 25;
805 U32 u;
Yann Collet7d360282016-02-12 00:07:30 +0100806 U32 dictContentSize = ZDICT_dictSize(dictList);
Yann Colletf5229e02016-01-29 02:45:26 +0100807 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
808 DISPLAYLEVEL(3, "list %u best segments \n", nb);
809 for (u=1; u<=nb; u++) {
810 U32 p = dictList[u].pos;
811 U32 l = dictList[u].length;
812 U32 d = MIN(40, l);
813 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
814 u, l, p, dictList[u].savings);
Yann Collet7d360282016-02-12 00:07:30 +0100815 ZDICT_printHex(3, (const char*)samplesBuffer+p, d);
Yann Colletf5229e02016-01-29 02:45:26 +0100816 DISPLAYLEVEL(3, "| \n");
817 } } }
Yann Collete618f8e2016-01-28 00:29:58 +0100818
819 /* create dictionary */
820 {
Yann Collet7d360282016-02-12 00:07:30 +0100821 U32 dictContentSize = ZDICT_dictSize(dictList);
Yann Collet7682e492016-01-31 23:45:35 +0100822 size_t hSize;
Yann Collete618f8e2016-01-28 00:29:58 +0100823 BYTE* ptr;
824 U32 u;
825
Yann Collete618f8e2016-01-28 00:29:58 +0100826 /* build dict content */
Yann Collet7682e492016-01-31 23:45:35 +0100827 ptr = (BYTE*)dictBuffer + maxDictSize;
Yann Collete618f8e2016-01-28 00:29:58 +0100828 for (u=1; u<dictList->pos; u++) {
829 U32 l = dictList[u].length;
830 ptr -= l;
Yann Collet7682e492016-01-31 23:45:35 +0100831 if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC); /* should not happen */
832 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
Yann Collete618f8e2016-01-28 00:29:58 +0100833 }
834
Yann Collet82516192016-01-29 16:48:10 +0100835 /* fast mode dict content */
Yann Collet7682e492016-01-31 23:45:35 +0100836 if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
837 DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
838 DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
Yann Collet7d360282016-02-12 00:07:30 +0100839 dictContentSize = (U32)ZDICT_fastSampling((char*)dictBuffer + g_provision_entropySize,
Yann Collet7682e492016-01-31 23:45:35 +0100840 targetDictSize, samplesBuffer, sBuffSize);
Yann Colletf5229e02016-01-29 02:45:26 +0100841 }
842
Yann Collet7682e492016-01-31 23:45:35 +0100843 /* dictionary header */
844 MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
Yann Collete618f8e2016-01-28 00:29:58 +0100845 hSize = 4;
Yann Collete618f8e2016-01-28 00:29:58 +0100846
847 /* entropic tables */
Yann Collet7682e492016-01-31 23:45:35 +0100848 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
Yann Collete618f8e2016-01-28 00:29:58 +0100849 DISPLAYLEVEL(2, "statistics ... \n");
Yann Collet7d360282016-02-12 00:07:30 +0100850 hSize += ZDICT_analyzeEntropy((char*)dictBuffer+4, maxDictSize-4,
Yann Colletf5229e02016-01-29 02:45:26 +0100851 compressionLevel,
Yann Collet7682e492016-01-31 23:45:35 +0100852 samplesBuffer, sampleSizes, nbSamples,
853 (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100854
Yann Collet7682e492016-01-31 23:45:35 +0100855 if (hSize + dictContentSize < maxDictSize)
856 memmove((char*)dictBuffer + hSize, (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
857 dictSize = MIN(maxDictSize, hSize+dictContentSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100858 }
859
860 /* clean up */
Yann Collete618f8e2016-01-28 00:29:58 +0100861 free(dictList);
Yann Collet7682e492016-01-31 23:45:35 +0100862 return dictSize;
Yann Collete618f8e2016-01-28 00:29:58 +0100863}
864
Yann Collet7682e492016-01-31 23:45:35 +0100865
Yann Collet7d360282016-02-12 00:07:30 +0100866size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
867 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
868 ZDICT_params_t params)
Yann Collet7682e492016-01-31 23:45:35 +0100869{
870 size_t sBuffSize;
871 void* newBuff;
872 size_t result;
873
Yann Collet7d360282016-02-12 00:07:30 +0100874 { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
Yann Collet7682e492016-01-31 23:45:35 +0100875 newBuff = malloc(sBuffSize + NOISELENGTH);
876 if (!newBuff) return ERROR(memory_allocation);
877
878 memcpy(newBuff, samplesBuffer, sBuffSize);
Yann Collet7d360282016-02-12 00:07:30 +0100879 ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
Yann Collet7682e492016-01-31 23:45:35 +0100880
Yann Collet7d360282016-02-12 00:07:30 +0100881 result = ZDICT_trainFromBuffer_unsafe(dictBuffer, dictBufferCapacity,
882 newBuff, samplesSizes, nbSamples,
Yann Collet7682e492016-01-31 23:45:35 +0100883 params);
884 free(newBuff);
885 return result;
886}
887
888
Yann Collet7d360282016-02-12 00:07:30 +0100889/* issue : samplesBuffer need to be followed by a noisy guard band.
890* work around : duplicate the buffer, and add the noise ? */
891size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
892 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
Yann Collet7682e492016-01-31 23:45:35 +0100893{
Yann Collet7d360282016-02-12 00:07:30 +0100894 ZDICT_params_t params;
895 memset(&params, 0, sizeof(params));
896 return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
897 samplesBuffer, samplesSizes, nbSamples,
898 params);
Yann Collet7682e492016-01-31 23:45:35 +0100899}
Yann Collet7d360282016-02-12 00:07:30 +0100900