blob: fed9b18dfbce68c863a3f7d91472b9e6002f64b2 [file] [log] [blame]
Yann Collete618f8e2016-01-28 00:29:58 +01001/*
Yann Collet863ec402016-01-28 17:56:33 +01002 dictBuilder - dictionary builder for LZ algorithms
Yann Collete618f8e2016-01-28 00:29:58 +01003 Copyright (C) Yann Collet 2016
4
5 GPL v2 License
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation, Inc.,
19 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20
21 You can contact the author at :
22 - zstd source repository : https://github.com/Cyan4973/zstd
Yann Collete618f8e2016-01-28 00:29:58 +010023*/
24
25/* **************************************
26* Compiler Options
27****************************************/
28/* Disable some Visual warning messages */
29#ifdef _MSC_VER
30# define _CRT_SECURE_NO_WARNINGS /* fopen */
31# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
32#endif
33
34/* Unix Large Files support (>4GB) */
35#define _FILE_OFFSET_BITS 64
36#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
37# define _LARGEFILE_SOURCE
38#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
39# define _LARGEFILE64_SOURCE
40#endif
41
Yann Collete618f8e2016-01-28 00:29:58 +010042
Yann Collet863ec402016-01-28 17:56:33 +010043/*-*************************************
Yann Collete618f8e2016-01-28 00:29:58 +010044* Includes
45***************************************/
46#include <stdlib.h> /* malloc, free */
47#include <string.h> /* memset */
48#include <stdio.h> /* fprintf, fopen, ftello64 */
49#include <sys/types.h> /* stat64 */
50#include <sys/stat.h> /* stat64 */
51#include <time.h> /* clock */
52
53#include "mem.h" /* read */
Yann Colletf5229e02016-01-29 02:45:26 +010054#include "error_private.h"
Yann Collete618f8e2016-01-28 00:29:58 +010055#include "divsufsort.h"
56#include "dictBuilder.h"
57#include "zstd_compress.c"
58#include "huff0_static.h"
59
60
61/* *************************************
62* Compiler specifics
63***************************************/
64#if !defined(S_ISREG)
65# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
66#endif
67
68#ifdef _MSC_VER
69#define snprintf sprintf_s
70#endif
71
72
Yann Collet863ec402016-01-28 17:56:33 +010073/*-*************************************
Yann Collete618f8e2016-01-28 00:29:58 +010074* Constants
75***************************************/
76#define KB *(1 <<10)
77#define MB *(1 <<20)
78#define GB *(1U<<30)
79
80#define DICTLISTSIZE 10000
81#define MEMMULT 11
Yann Collet35f7de52016-01-31 02:51:03 +010082static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
Yann Collete618f8e2016-01-28 00:29:58 +010083
84#define NOISELENGTH 32
85#define PRIME1 2654435761U
86#define PRIME2 2246822519U
87
88#define MINRATIO 4
Yann Colletf5229e02016-01-29 02:45:26 +010089static const U32 g_compressionLevel_default = 5;
Yann Collete618f8e2016-01-28 00:29:58 +010090
91
Yann Collet863ec402016-01-28 17:56:33 +010092/*-*************************************
93* Console display
Yann Collete618f8e2016-01-28 00:29:58 +010094***************************************/
95#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
96#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
Yann Colletb31de732016-01-28 10:30:02 +010097static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
98void DiB_setNotificationLevel(unsigned l) { g_displayLevel=l; }
Yann Collete618f8e2016-01-28 00:29:58 +010099
/* DISPLAYUPDATE() :
   rate-limited variant of DISPLAYLEVEL : the message is emitted only when
   g_displayLevel >= l and more than `refreshRate` ms elapsed since the last update.
   DISPLAY() writes to stderr, hence stderr is the stream flushed at level >= 4. */
#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
            if (DiB_GetMilliSpan(g_time) > refreshRate)  \
            { g_time = clock(); DISPLAY(__VA_ARGS__); \
            if (g_displayLevel>=4) fflush(stderr); } }
104static const unsigned refreshRate = 300;
105static clock_t g_time = 0;
106
107void DiB_printHex(U32 dlevel, const void* ptr, size_t length)
108{
109 const BYTE* const b = (const BYTE*)ptr;
110 size_t u;
111 for (u=0; u<length; u++)
112 {
113 BYTE c = b[u];
114 if (c<32 || c>126) c = '.'; /* non-printable char */
115 DISPLAYLEVEL(dlevel, "%c", c);
116 }
117}
118
119
Yann Collet863ec402016-01-28 17:56:33 +0100120/*-*************************************
Yann Collete618f8e2016-01-28 00:29:58 +0100121* Exceptions
122***************************************/
123#ifndef DEBUG
124# define DEBUG 0
125#endif
126#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
127#define EXM_THROW(error, ...) \
128{ \
129 DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
130 DISPLAYLEVEL(1, "Error %i : ", error); \
131 DISPLAYLEVEL(1, __VA_ARGS__); \
132 DISPLAYLEVEL(1, "\n"); \
133 exit(error); \
134}
135
136
137/* ********************************************************
138* Helper functions
139**********************************************************/
140unsigned DiB_versionNumber (void) { return DiB_VERSION_NUMBER; }
141
/* DiB_GetMilliSpan() :
   @return : time elapsed between `nPrevious` and now (as reported by clock()),
             converted to milliseconds */
static unsigned DiB_GetMilliSpan(clock_t nPrevious)
{
    const clock_t elapsedTicks = clock() - nPrevious;
    return (unsigned)((elapsedTicks * 1000) / CLOCKS_PER_SEC);
}
148
149
150/* ********************************************************
151* File related operations
152**********************************************************/
/* DiB_getFileSize() :
   @return : size of `infilename` in bytes,
             or 0 when the stat call fails or the path is not a regular file */
static unsigned long long DiB_getFileSize(const char* infilename)
{
#if defined(_MSC_VER)
    struct _stat64 info;
    const int failed = _stat64(infilename, &info);
#else
    struct stat info;
    const int failed = stat(infilename, &info);
#endif
    if (failed) return 0;                       /* No good... */
    if (!S_ISREG(info.st_mode)) return 0;       /* not a regular file */
    return (unsigned long long)info.st_size;
}
166
167
/* DiB_getTotalFileSize() :
   @return : sum of DiB_getFileSize() over the `nbFiles` names of `fileNamesTable` */
static unsigned long long DiB_getTotalFileSize(const char** fileNamesTable, unsigned nbFiles)
{
    unsigned long long sum = 0;
    unsigned fileIdx = 0;

    while (fileIdx < nbFiles) {
        sum += DiB_getFileSize(fileNamesTable[fileIdx]);
        fileIdx++;
    }
    return sum;
}
176
177
/* DiB_loadFiles() :
   reads the listed files back to back into `buffer` (capacity `bufferSize`).
   fileSizes[n] receives the nb of bytes loaded for file n;
   a file which does not fit into the remaining space is loaded as 0 bytes.
   Exits through EXM_THROW when a file cannot be opened (10) or fully read (11). */
static void DiB_loadFiles(void* buffer, size_t bufferSize,
                          size_t* fileSizes,
                          const char** fileNamesTable, unsigned nbFiles)
{
    char* const dst = (char*)buffer;
    size_t filled = 0;
    unsigned fileIdx = 0;

    while (fileIdx < nbFiles) {
        const char* const name = fileNamesTable[fileIdx];
        unsigned long long toLoad = DiB_getFileSize(name);
        size_t loaded;
        FILE* const in = fopen(name, "rb");
        if (in==NULL) EXM_THROW(10, "impossible to open file %s", name);
        DISPLAYLEVEL(2, "Loading %s... \r", name);
        if (toLoad > bufferSize-filled) toLoad = 0;   /* stop there, not enough memory to load all files */
        loaded = fread(dst+filled, 1, (size_t)toLoad, in);
        if (loaded != (size_t)toLoad) EXM_THROW(11, "could not read %s", name);
        filled += loaded;
        fileSizes[fileIdx] = (size_t)toLoad;
        fclose(in);
        fileIdx++;
    }
}
200
201
202/*-********************************************************
203* Dictionary training functions
204**********************************************************/
/* DiB_read_ARCH() :
   reads sizeof(size_t) bytes from `p` (no alignment requirement, goes through memcpy)
   @return : these bytes as a size_t, in native byte order */
static size_t DiB_read_ARCH(const void* p)
{
    size_t word;
    memcpy(&word, p, sizeof(word));
    return word;
}
206
207static unsigned DiB_NbCommonBytes (register size_t val)
208{
209 if (MEM_isLittleEndian()) {
210 if (MEM_64bits()) {
211# if defined(_MSC_VER) && defined(_WIN64)
212 unsigned long r = 0;
213 _BitScanForward64( &r, (U64)val );
214 return (unsigned)(r>>3);
215# elif defined(__GNUC__) && (__GNUC__ >= 3)
216 return (__builtin_ctzll((U64)val) >> 3);
217# else
218 static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
219 return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
220# endif
221 } else { /* 32 bits */
222# if defined(_MSC_VER)
223 unsigned long r=0;
224 _BitScanForward( &r, (U32)val );
225 return (unsigned)(r>>3);
226# elif defined(__GNUC__) && (__GNUC__ >= 3)
227 return (__builtin_ctz((U32)val) >> 3);
228# else
229 static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
230 return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
231# endif
232 }
233 } else { /* Big Endian CPU */
234 if (MEM_64bits()) {
235# if defined(_MSC_VER) && defined(_WIN64)
236 unsigned long r = 0;
237 _BitScanReverse64( &r, val );
238 return (unsigned)(r>>3);
239# elif defined(__GNUC__) && (__GNUC__ >= 3)
240 return (__builtin_clzll(val) >> 3);
241# else
242 unsigned r;
243 const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
244 if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
245 if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
246 r += (!val);
247 return r;
248# endif
249 } else { /* 32 bits */
250# if defined(_MSC_VER)
251 unsigned long r = 0;
252 _BitScanReverse( &r, (unsigned long)val );
253 return (unsigned)(r>>3);
254# elif defined(__GNUC__) && (__GNUC__ >= 3)
255 return (__builtin_clz((U32)val) >> 3);
256# else
257 unsigned r;
258 if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
259 r += (!val);
260 return r;
261# endif
262 } }
263}
264
265
/*! DiB_count() :
    @return : nb of identical bytes at the start of `pIn` and `pMatch`,
              comparing one size_t word at a time.
    Note : there is no length argument; the scan only stops at the first difference,
           so the buffer is presumed to be followed by a noisy guard band.
*/
static size_t DiB_count(const void* pIn, const void* pMatch)
{
    const char* const base = (const char*)pIn;
    const char* left = base;
    const char* right = (const char*)pMatch;
    size_t mismatch = DiB_read_ARCH(right) ^ DiB_read_ARCH(left);

    while (mismatch == 0) {   /* whole word identical : move to next one */
        left  += sizeof(size_t);
        right += sizeof(size_t);
        mismatch = DiB_read_ARCH(right) ^ DiB_read_ARCH(left);
    }
    return (size_t)(left - base) + DiB_NbCommonBytes(mismatch);
}
280
281
282typedef struct {
283 U32 pos;
284 U32 length;
285 U32 savings;
286} dictItem;
287
288void DiB_initDictItem(dictItem* d)
289{
290 d->pos = 1;
291 d->length = 0;
292 d->savings = (U32)(-1);
293}
294
295
296#define LLIMIT 64 /* heuristic determined experimentally */
297#define MINMATCHLENGTH 7 /* heuristic determined experimentally */
298static dictItem DiB_analyzePos(
299 BYTE* doneMarks,
300 const saidx_t* suffix, U32 start,
301 const void* buffer, U32 minRatio)
302{
303 U32 lengthList[LLIMIT] = {0};
304 U32 cumulLength[LLIMIT] = {0};
305 U32 savings[LLIMIT] = {0};
306 const BYTE* b = (const BYTE*)buffer;
307 size_t length;
308 size_t maxLength = LLIMIT;
309 size_t pos = suffix[start];
310 U32 end = start;
311 dictItem solution;
312
Yann Collete618f8e2016-01-28 00:29:58 +0100313 /* init */
314 memset(&solution, 0, sizeof(solution));
315 doneMarks[pos] = 1;
316
317 /* trivial repetition cases */
318 if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
319 ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
320 ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
321 /* skip and mark segment */
322 U16 u16 = MEM_read16(b+pos+4);
323 U32 u, e = 6;
324 while (MEM_read16(b+pos+e) == u16) e+=2 ;
325 if (b[pos+e] == b[pos+e-1]) e++;
326 for (u=1; u<e; u++)
327 doneMarks[pos+u] = 1;
328 return solution;
329 }
330
331 /* look forward */
332 do {
333 end++;
334 length = DiB_count(b + pos, b + suffix[end]);
335 } while (length >=MINMATCHLENGTH);
336
337 /* look backward */
338 do {
339 length = DiB_count(b + pos, b + *(suffix+start-1));
340 if (length >=MINMATCHLENGTH) start--;
341 } while(length >= MINMATCHLENGTH);
342
343 /* exit if not found a minimum nb of repetitions */
344 if (end-start < minRatio) {
345 U32 idx;
346 for(idx=start; idx<end; idx++)
347 doneMarks[suffix[idx]] = 1;
348 return solution;
349 }
350
351 {
352 int i;
353 U32 searchLength;
354 U32 refinedStart = start;
355 U32 refinedEnd = end;
356
357 DISPLAYLEVEL(4, "\n");
358 DISPLAYLEVEL(4, "found %3u matches of length >= %u at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
359 DISPLAYLEVEL(4, "\n");
360
361 for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
362 BYTE currentChar = 0;
363 U32 currentCount = 0;
364 U32 currentID = refinedStart;
365 U32 id;
366 U32 selectedCount = 0;
367 U32 selectedID = currentID;
368 for (id =refinedStart; id < refinedEnd; id++) {
369 if (b[ suffix[id] + searchLength] != currentChar) {
370 if (currentCount > selectedCount) {
371 selectedCount = currentCount;
372 selectedID = currentID;
373 }
374 currentID = id;
375 currentChar = b[ suffix[id] + searchLength];
376 currentCount = 0;
377 }
378 currentCount ++;
379 }
380 if (currentCount > selectedCount) { /* for last */
381 selectedCount = currentCount;
382 selectedID = currentID;
383 }
384
385 if (selectedCount < minRatio)
386 break;
Yann Collete618f8e2016-01-28 00:29:58 +0100387 refinedStart = selectedID;
388 refinedEnd = refinedStart + selectedCount;
389 }
390
391 /* evaluate gain based on new ref */
392 start = refinedStart;
393 pos = suffix[refinedStart];
394 end = start;
395 memset(lengthList, 0, sizeof(lengthList));
396
397 /* look forward */
398 do {
399 end++;
400 length = DiB_count(b + pos, b + suffix[end]);
401 if (length >= LLIMIT) length = LLIMIT-1;
402 lengthList[length]++;
403 } while (length >=MINMATCHLENGTH);
404
405 /* look backward */
406 do {
407 length = DiB_count(b + pos, b + suffix[start-1]);
408 if (length >= LLIMIT) length = LLIMIT-1;
409 lengthList[length]++;
410 if (length >=MINMATCHLENGTH) start--;
411 } while(length >= MINMATCHLENGTH);
412
413 /* largest useful length */
414 memset(cumulLength, 0, sizeof(cumulLength));
415 cumulLength[maxLength-1] = lengthList[maxLength-1];
Yann Collet9cadd082016-01-28 15:39:52 +0100416 for (i=(int)(maxLength-2); i>=0; i--)
Yann Collete618f8e2016-01-28 00:29:58 +0100417 cumulLength[i] = cumulLength[i+1] + lengthList[i];
418
419 for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
420 maxLength = i;
421
422 /* reduce maxLength in case of final into repetitive data */
423 {
Yann Collet9cadd082016-01-28 15:39:52 +0100424 U32 l = (U32)maxLength;
Yann Collete618f8e2016-01-28 00:29:58 +0100425 BYTE c = b[pos + maxLength-1];
426 while (b[pos+l-2]==c) l--;
427 maxLength = l;
428 }
429 if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */
430
431 /* calculate savings */
432 savings[5] = 0;
433 for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
434 savings[i] = savings[i-1] + (lengthList[i] * (i-3));
435
436 DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
437 (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
438
Yann Collet9cadd082016-01-28 15:39:52 +0100439 solution.pos = (U32)pos;
440 solution.length = (U32)maxLength;
Yann Collete618f8e2016-01-28 00:29:58 +0100441 solution.savings = savings[maxLength];
442
443 /* mark positions done */
444 {
445 U32 id;
446 U32 testedPos;
447 for (id=start; id<end; id++) {
448 U32 p, pEnd;
449 testedPos = suffix[id];
450 if (testedPos == pos)
451 length = solution.length;
452 else {
453 length = DiB_count(b+pos, b+testedPos);
454 if (length > solution.length) length = solution.length;
455 }
Yann Collet9cadd082016-01-28 15:39:52 +0100456 pEnd = (U32)(testedPos + length);
Yann Collete618f8e2016-01-28 00:29:58 +0100457 for (p=testedPos; p<pEnd; p++)
458 doneMarks[p] = 1;
459 } } }
460
461 return solution;
462}
463
464
465/*! DiB_checkMerge
466 check if dictItem can be merged, do it if possible
467 @return : id of destination elt, 0 if not merged
468*/
469static U32 DiB_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
470{
471 const U32 tableSize = table->pos;
472 const U32 max = elt.pos + (elt.length-1);
473
474 /* tail overlap */
475 U32 u; for (u=1; u<tableSize; u++) {
476 if (u==eltNbToSkip) continue;
477 if ((table[u].pos > elt.pos) && (table[u].pos < max)) { /* overlap */
478 /* append */
479 U32 addedLength = table[u].pos - elt.pos;
480 table[u].length += addedLength;
481 table[u].pos = elt.pos;
482 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
483 table[u].savings += elt.length / 8; /* rough approx */
484 elt = table[u];
485 while ((u>1) && (table[u-1].savings < elt.savings))
486 table[u] = table[u-1], u--;
487 table[u] = elt;
488 return u;
489 } }
490
491 /* front overlap */
492 for (u=1; u<tableSize; u++) {
493 if (u==eltNbToSkip) continue;
494 if ((table[u].pos + table[u].length > elt.pos) && (table[u].pos < elt.pos)) { /* overlap */
495 /* append */
496 int addedLength = (elt.pos + elt.length) - (table[u].pos + table[u].length);
497 table[u].savings += elt.length / 8; /* rough approx */
498 if (addedLength > 0) { /* otherwise, already included */
499 table[u].length += addedLength;
500 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
501 }
502 elt = table[u];
503 while ((u>1) && (table[u-1].savings < elt.savings))
504 table[u] = table[u-1], u--;
505 table[u] = elt;
506 return u;
507 } }
508
509 return 0;
510}
511
512
513static void DiB_removeDictItem(dictItem* table, U32 id)
514{
515 /* convention : first element is nb of elts */
516 U32 max = table->pos;
517 U32 u;
518 if (!id) return; /* protection, should never happen */
Yann Collet9cadd082016-01-28 15:39:52 +0100519 for (u=id; u<max-1; u++)
Yann Collete618f8e2016-01-28 00:29:58 +0100520 table[u] = table[u+1];
521 table->pos--;
522}
523
524
525static void DiB_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
526{
527 /* merge if possible */
528 U32 mergeId = DiB_checkMerge(table, elt, 0);
529 if (mergeId) {
530 U32 newMerge = 1;
531 while (newMerge) {
532 newMerge = DiB_checkMerge(table, table[mergeId], mergeId);
533 if (newMerge) DiB_removeDictItem(table, mergeId);
534 mergeId = newMerge;
535 }
536 return;
537 }
538
539 /* insert */
540 {
541 U32 current;
542 U32 nextElt = table->pos;
543 if (nextElt >= maxSize) nextElt = maxSize-1;
544 current = nextElt-1;
545 while (table[current].savings < elt.savings) {
546 table[current+1] = table[current];
547 current--;
548 }
549 table[current+1] = elt;
550 table->pos = nextElt+1;
551 }
552}
553
554
555static U32 DiB_dictSize(const dictItem* dictList)
556{
557 U32 u, dictSize = 0;
558 for (u=1; u<dictList[0].pos; u++)
559 dictSize += dictList[u].length;
560 return dictSize;
561}
562
563
564static void DiB_trainBuffer(dictItem* dictList, U32 dictListSize,
565 const void* const buffer, const size_t bufferSize, /* buffer must end with noisy guard band */
566 const char* displayName,
Yann Collet9cadd082016-01-28 15:39:52 +0100567 const size_t* fileSizes, unsigned nbFiles, unsigned maxDictSize,
Yann Collete618f8e2016-01-28 00:29:58 +0100568 U32 shiftRatio)
569{
570 saidx_t* const suffix0 = (saidx_t*)malloc((bufferSize+2)*sizeof(*suffix0));
571 saidx_t* const suffix = suffix0+1;
572 U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
573 BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
574 U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
575 U32 minRatio = nbFiles >> shiftRatio;
576 saint_t errorCode;
577
578 /* init */
579 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
580 if (!suffix0 || !reverseSuffix || !doneMarks || !filePos)
581 EXM_THROW(1, "not enough memory for DiB_trainBuffer");
582 if (minRatio < MINRATIO) minRatio = MINRATIO;
583 memset(doneMarks, 0, bufferSize+16);
584
585 /* sort */
586 DISPLAYLEVEL(2, "sorting %s ...\n", displayName);
Yann Collet9cadd082016-01-28 15:39:52 +0100587 errorCode = divsufsort((const sauchar_t*)buffer, suffix, (saidx_t)bufferSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100588 if (errorCode != 0) EXM_THROW(2, "sort failed");
Yann Collet9cadd082016-01-28 15:39:52 +0100589 suffix[bufferSize] = (saidx_t)bufferSize; /* leads into noise */
590 suffix0[0] = (saidx_t)bufferSize; /* leads into noise */
Yann Collete618f8e2016-01-28 00:29:58 +0100591 {
592 /* build reverse suffix sort */
593 size_t pos;
594 for (pos=0; pos < bufferSize; pos++)
Yann Collet9cadd082016-01-28 15:39:52 +0100595 reverseSuffix[suffix[pos]] = (U32)pos;
Yann Collete618f8e2016-01-28 00:29:58 +0100596 /* build file pos */
597 filePos[0] = 0;
598 for (pos=1; pos<nbFiles; pos++)
Yann Collet9cadd082016-01-28 15:39:52 +0100599 filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
Yann Collete618f8e2016-01-28 00:29:58 +0100600 }
601
602 DISPLAYLEVEL(2, "finding patterns ... \n");
Yann Collet82516192016-01-29 16:48:10 +0100603 DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
Yann Collete618f8e2016-01-28 00:29:58 +0100604
605 {
606 U32 cursor; for (cursor=0; cursor < bufferSize; ) {
607 dictItem solution;
Yann Collete618f8e2016-01-28 00:29:58 +0100608 if (doneMarks[cursor]) { cursor++; continue; }
609 solution = DiB_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
610 if (solution.length==0) { cursor++; continue; }
611 DiB_insertDictItem(dictList, dictListSize, solution);
612 cursor += solution.length;
613 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
Yann Collet82516192016-01-29 16:48:10 +0100614 } }
Yann Collete618f8e2016-01-28 00:29:58 +0100615
616 /* limit dictionary size */
617 {
618 U32 max = dictList->pos; /* convention : nb of useful elts within dictList */
619 U32 currentSize = 0;
620 U32 n; for (n=1; n<max; n++) {
621 currentSize += dictList[n].length;
622 if (currentSize > maxDictSize) break;
623 }
624 dictList->pos = n;
625 }
626
627 free(suffix0);
628 free(reverseSuffix);
629 free(doneMarks);
630 free(filePos);
631}
632
633
634static size_t DiB_findMaxMem(unsigned long long requiredMem)
635{
636 size_t step = 8 MB;
637 void* testmem = NULL;
638
639 requiredMem = (((requiredMem >> 23) + 1) << 23);
640 requiredMem += 2 * step;
641 if (requiredMem > maxMemory) requiredMem = maxMemory;
642
643 while (!testmem) {
644 requiredMem -= step;
645 testmem = malloc((size_t)requiredMem);
646 }
647
648 free(testmem);
649 return (size_t)(requiredMem - step);
650}
651
652
653static void DiB_fillNoise(void* buffer, size_t length)
654{
655 unsigned acc = PRIME1;
656 size_t p=0;;
657
658 for (p=0; p<length; p++) {
659 acc *= PRIME2;
660 ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
661 }
662}
663
664
665typedef struct
666{
667 ZSTD_CCtx* ref;
668 ZSTD_CCtx* zc;
669 void* workPlace; /* must be BLOCKSIZE allocated */
670} EStats_ress_t;
671
672
673static void DiB_countEStats(EStats_ress_t esr,
674 U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount,
675 const void* src, size_t srcSize)
676{
677 const BYTE* bytePtr;
678 const U32* u32Ptr;
679
680 if (srcSize > BLOCKSIZE) srcSize = BLOCKSIZE; /* protection vs large samples */
681 ZSTD_copyCCtx(esr.zc, esr.ref);
682 ZSTD_compressBlock(esr.zc, esr.workPlace, BLOCKSIZE, src, srcSize);
683
684 /* count stats */
685 for(bytePtr = esr.zc->seqStore.litStart; bytePtr < esr.zc->seqStore.lit; bytePtr++)
686 countLit[*bytePtr]++;
687 for(u32Ptr = esr.zc->seqStore.offsetStart; u32Ptr < esr.zc->seqStore.offset; u32Ptr++) {
688 BYTE offcode = (BYTE)ZSTD_highbit(*u32Ptr) + 1;
689 if (*u32Ptr==0) offcode=0;
690 offsetcodeCount[offcode]++;
691 }
692 for(bytePtr = esr.zc->seqStore.matchLengthStart; bytePtr < esr.zc->seqStore.matchLength; bytePtr++)
693 matchlengthCount[*bytePtr]++;
694 for(bytePtr = esr.zc->seqStore.litLengthStart; bytePtr < esr.zc->seqStore.litLength; bytePtr++)
695 litlengthCount[*bytePtr]++;
696}
697
698
699#define OFFCODE_MAX 18
700static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
Yann Colletf5229e02016-01-29 02:45:26 +0100701 unsigned compressionLevel,
Yann Collete618f8e2016-01-28 00:29:58 +0100702 const void* srcBuffer, size_t* fileSizes, unsigned nbFiles,
703 const void* dictBuffer, size_t dictBufferSize)
704{
705 U32 countLit[256];
706 U32 offcodeCount[MaxOff+1];
707 HUF_CREATE_STATIC_CTABLE(hufTable, 255);
708 short offcodeNCount[MaxOff+1];
709 U32 matchLengthCount[MaxML+1];
710 short matchLengthNCount[MaxML+1];
711 U32 litlengthCount[MaxLL+1];
712 short litlengthNCount[MaxLL+1];
713 EStats_ress_t esr;
714 ZSTD_parameters params;
715 U32 u, huffLog = 12, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
716 size_t pos = 0, errorCode;
717 size_t eSize = 0;
718
719 /* init */
720 for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
721 for (u=0; u<=OFFCODE_MAX; u++) offcodeCount[u]=1;
722 for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
723 for (u=0; u<=MaxLL; u++) litlengthCount[u]=1;
724 esr.ref = ZSTD_createCCtx();
725 esr.zc = ZSTD_createCCtx();
726 esr.workPlace = malloc(BLOCKSIZE);
727 if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory");
Yann Colletf5229e02016-01-29 02:45:26 +0100728 if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
729 params = ZSTD_getParams(compressionLevel, dictBufferSize + 15 KB);
Yann Collete618f8e2016-01-28 00:29:58 +0100730 params.strategy = ZSTD_greedy;
731 ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params);
732
733 /* collect stats on all files */
734 for (u=0; u<nbFiles; u++) {
735 DiB_countEStats(esr,
736 countLit, offcodeCount, matchLengthCount, litlengthCount,
737 (const char*)srcBuffer + pos, fileSizes[u]);
738 pos += fileSizes[u];
739 }
740
741 /* analyze */
742 errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
743 if (HUF_isError(errorCode)) EXM_THROW(31, "HUF_buildCTable error");
744 huffLog = (U32)errorCode;
745
746 total=0; for (u=0; u<=OFFCODE_MAX; u++) total+=offcodeCount[u];
747 errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, OFFCODE_MAX);
748 if (FSE_isError(errorCode)) EXM_THROW(32, "FSE_normalizeCount error with offcodeCount");
749 Offlog = (U32)errorCode;
750
751 total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
752 errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
753 if (FSE_isError(errorCode)) EXM_THROW(33, "FSE_normalizeCount error with matchLengthCount");
754 mlLog = (U32)errorCode;
755
756 total=0; for (u=0; u<=MaxLL; u++) total+=litlengthCount[u];
757 errorCode = FSE_normalizeCount(litlengthNCount, llLog, litlengthCount, total, MaxLL);
758 if (FSE_isError(errorCode)) EXM_THROW(34, "FSE_normalizeCount error with litlengthCount");
759 llLog = (U32)errorCode;
760
761 /* write result to buffer */
762 errorCode = HUF_writeCTable(dstBuffer, maxDstSize, hufTable, 255, huffLog);
763 if (HUF_isError(errorCode)) EXM_THROW(41, "HUF_writeCTable error");
764 dstBuffer = (char*)dstBuffer + errorCode;
765 maxDstSize -= errorCode;
766 eSize += errorCode;
767
768 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
769 if (FSE_isError(errorCode)) EXM_THROW(42, "FSE_writeNCount error with offcodeNCount");
770 dstBuffer = (char*)dstBuffer + errorCode;
771 maxDstSize -= errorCode;
772 eSize += errorCode;
773
774 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, matchLengthNCount, MaxML, mlLog);
775 if (FSE_isError(errorCode)) EXM_THROW(43, "FSE_writeNCount error with matchLengthNCount");
776 dstBuffer = (char*)dstBuffer + errorCode;
777 maxDstSize -= errorCode;
778 eSize += errorCode;
779
780 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, litlengthNCount, MaxLL, llLog);
781 if (FSE_isError(errorCode)) EXM_THROW(43, "FSE_writeNCount error with litlengthNCount");
782 dstBuffer = (char*)dstBuffer + errorCode;
783 maxDstSize -= errorCode;
784 eSize += errorCode;
785
786 /* clean */
787 ZSTD_freeCCtx(esr.ref);
788 ZSTD_freeCCtx(esr.zc);
789 free(esr.workPlace);
790
791 return eSize;
792}
793
794
/* DiB_saveDict() :
   creates (or overwrites) `dictFileName`, and writes `buff1` then `buff2` into it.
   Exits through EXM_THROW when the file cannot be opened (3),
   when a write is incomplete (4), or when closing the file fails (5). */
static void DiB_saveDict(const char* dictFileName,
                         const void* buff1, size_t buff1Size,
                         const void* buff2, size_t buff2Size)
{
    FILE* const out = fopen(dictFileName, "wb");
    if (out==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);

    if (fwrite(buff1, 1, buff1Size, out) != buff1Size) EXM_THROW(4, "%s : write error", dictFileName);

    if (fwrite(buff2, 1, buff2Size, out) != buff2Size) EXM_THROW(4, "%s : write error", dictFileName);

    if ((size_t)fclose(out) != 0) EXM_THROW(5, "%s : flush error", dictFileName);
}
814
815
Yann Colletf5229e02016-01-29 02:45:26 +0100816#define DIB_FASTSEGMENTSIZE 64
817/*! DiB_fastSampling (based on an idea by Giuseppe Ottaviano)
818 Fill @dictBuffer with stripes of size DIB_FASTSEGMENTSIZE from @samplesBuffer
819 up to @dictSize.
820 Filling starts from the end of @dictBuffer, down to maximum possible.
821 if @dictSize is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of @dictBuffer won't be used.
822 @return : amount of data written into @dictBuffer
823 or an error Code (if @dictSize or @samplesSize too small)
824*/
825static size_t DiB_fastSampling(void* dictBuffer, size_t dictSize,
826 const void* samplesBuffer, size_t samplesSize)
827{
828 char* dstPtr = (char*)dictBuffer + dictSize;
829 const char* srcPtr = (const char*)samplesBuffer;
830 size_t nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
831 size_t segNb, interSize;
832
833 if (nbSegments <= 2) return ERROR(srcSize_wrong);
834 if (samplesSize < dictSize) return ERROR(srcSize_wrong);
835
836 /* first and last segments are part of dictionary, in case they contain interesting header/footer */
837 dstPtr -= DIB_FASTSEGMENTSIZE;
838 memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
839 dstPtr -= DIB_FASTSEGMENTSIZE;
840 memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
841
842 /* regularly copy a segment */
843 interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
844 srcPtr += DIB_FASTSEGMENTSIZE;
845 for (segNb=2; segNb < nbSegments; segNb++) {
846 srcPtr += interSize;
847 dstPtr -= DIB_FASTSEGMENTSIZE;
848 memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
849 srcPtr += DIB_FASTSEGMENTSIZE;
850 }
851
852 return nbSegments * DIB_FASTSEGMENTSIZE;
853}
854
855
856int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize,
857 unsigned shiftRatio, unsigned compressionLevel,
858 const char** fileNamesTable, unsigned nbFiles)
Yann Collete618f8e2016-01-28 00:29:58 +0100859{
860 void* srcBuffer;
861 size_t benchedSize;
862 size_t* fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
863 unsigned long long totalSizeToLoad = DiB_getTotalFileSize(fileNamesTable, nbFiles);
Yann Collet82516192016-01-29 16:48:10 +0100864 const U32 dictListSize = MAX( MAX(DICTLISTSIZE, nbFiles), maxDictSize/16);
Yann Collete618f8e2016-01-28 00:29:58 +0100865 dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
866 char mfName[20] = {0};
867 const char* displayName = NULL;
868
869 /* init */
870 benchedSize = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
871 if ((unsigned long long)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
872 if (benchedSize < totalSizeToLoad)
Yann Collet82516192016-01-29 16:48:10 +0100873 DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
Yann Collete618f8e2016-01-28 00:29:58 +0100874
875 /* Memory allocation & restrictions */
876 srcBuffer = malloc(benchedSize+NOISELENGTH); /* + noise */
877 if ((!fileSizes) || (!srcBuffer) || (!dictList)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
878 DiB_initDictItem(dictList);
879
880 /* Load input buffer */
881 DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
Yann Colletf5229e02016-01-29 02:45:26 +0100882 DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
Yann Collete618f8e2016-01-28 00:29:58 +0100883
Yann Collet82516192016-01-29 16:48:10 +0100884 /* analyze sequences (non-fast mode) */
Yann Colletf5229e02016-01-29 02:45:26 +0100885 if (shiftRatio>0)
886 {
Yann Colletf5229e02016-01-29 02:45:26 +0100887 snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
888 if (nbFiles > 1) displayName = mfName;
889 else displayName = fileNamesTable[0];
Yann Collete618f8e2016-01-28 00:29:58 +0100890
Yann Colletf5229e02016-01-29 02:45:26 +0100891 DiB_trainBuffer(dictList, dictListSize,
892 srcBuffer, benchedSize,
893 displayName,
894 fileSizes, nbFiles, maxDictSize,
895 shiftRatio);
Yann Collete618f8e2016-01-28 00:29:58 +0100896
Yann Colletf5229e02016-01-29 02:45:26 +0100897 /* display best matches */
898 if (g_displayLevel>= 3) {
899 const U32 nb = 25;
900 U32 u;
901 U32 dictContentSize = DiB_dictSize(dictList);
902 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
903 DISPLAYLEVEL(3, "list %u best segments \n", nb);
904 for (u=1; u<=nb; u++) {
905 U32 p = dictList[u].pos;
906 U32 l = dictList[u].length;
907 U32 d = MIN(40, l);
908 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
909 u, l, p, dictList[u].savings);
910 DiB_printHex(3, (char*)srcBuffer+p, d);
911 DISPLAYLEVEL(3, "| \n");
912 } } }
Yann Collete618f8e2016-01-28 00:29:58 +0100913
914 /* create dictionary */
915 {
916 void* dictContent;
917 U32 dictContentSize = DiB_dictSize(dictList);
918 void* dictHeader;
Yann Colletf5229e02016-01-29 02:45:26 +0100919 size_t dictHeaderSize, hSize, addedContentLength;
Yann Collete618f8e2016-01-28 00:29:58 +0100920 BYTE* ptr;
921 U32 u;
922
923 /* build dict */
924 #define EBSIZE (2 KB)
925 dictHeaderSize = EBSIZE;
926 dictHeader = malloc(dictHeaderSize);
Yann Colletf5229e02016-01-29 02:45:26 +0100927 dictContent = malloc(maxDictSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100928 if (!dictHeader || !dictContent) EXM_THROW(2, "not enough memory");
929
930 /* build dict content */
Yann Colletf5229e02016-01-29 02:45:26 +0100931 ptr = (BYTE*)dictContent + maxDictSize;
Yann Collete618f8e2016-01-28 00:29:58 +0100932 for (u=1; u<dictList->pos; u++) {
933 U32 l = dictList[u].length;
934 ptr -= l;
935 memcpy(ptr, (char*)srcBuffer+dictList[u].pos, l);
936 }
937
Yann Collet82516192016-01-29 16:48:10 +0100938 /* fast mode dict content */
939 if (shiftRatio==0) { /* note could also be used to complete a dictionary, but not necessarily better */
Yann Colletf5229e02016-01-29 02:45:26 +0100940 addedContentLength = ptr-(BYTE*)dictContent;
941 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
942 DISPLAYLEVEL(2, "Adding %u KB from fast sampling \n", (U32)(addedContentLength>>10));
943 addedContentLength = DiB_fastSampling(dictContent, addedContentLength, srcBuffer, benchedSize);
944 if (!ERR_isError(addedContentLength))
945 ptr -= addedContentLength, dictContentSize += addedContentLength;
946 }
947
Yann Collete618f8e2016-01-28 00:29:58 +0100948 /* dictionary header */
949 MEM_writeLE32(dictHeader, ZSTD_DICT_MAGIC);
950 hSize = 4;
951 dictHeaderSize -= 4;
952
953 /* entropic tables */
954 DISPLAYLEVEL(2, "statistics ... \n");
955 hSize += DiB_analyzeEntropy((char*)dictHeader+4, dictHeaderSize,
Yann Colletf5229e02016-01-29 02:45:26 +0100956 compressionLevel,
957 srcBuffer, fileSizes, nbFiles,
958 ptr, dictContentSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100959
960 /* save dict */
961 {
962 size_t dictSize = hSize + dictContentSize;
963 DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
Yann Colletf5229e02016-01-29 02:45:26 +0100964 DiB_saveDict(dictFileName, dictHeader, hSize, ptr, dictContentSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100965 //DiB_saveDict(dictFileName, NULL, 0, dictContent, dictContentSize); // content only
966 }
967 /* clean */
968 free(dictHeader);
969 free(dictContent);
970 }
971
972 /* clean up */
973 free(srcBuffer);
974 free(fileSizes);
975 free(dictList);
976 return 0;
977}
978