blob: b41259677169388d4c5cea2c6760322a206d1311 [file] [log] [blame]
Yann Collete618f8e2016-01-28 00:29:58 +01001/*
2 dictBuilder.c
3 Copyright (C) Yann Collet 2016
4
5 GPL v2 License
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation, Inc.,
19 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20
21 You can contact the author at :
22 - zstd source repository : https://github.com/Cyan4973/zstd
23 - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
24*/
25
26/* **************************************
27* Compiler Options
28****************************************/
29/* Disable some Visual warning messages */
30#ifdef _MSC_VER
31# define _CRT_SECURE_NO_WARNINGS /* fopen */
32# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
33#endif
34
35/* Unix Large Files support (>4GB) */
36#define _FILE_OFFSET_BITS 64
37#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
38# define _LARGEFILE_SOURCE
39#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
40# define _LARGEFILE64_SOURCE
41#endif
42
43/* S_ISREG & gettimeofday() are not supported by MSVC */
44#if defined(_MSC_VER) || defined(_WIN32)
45# define BMK_LEGACY_TIMER 1
46#endif
47
48
49/* *************************************
50* Includes
51***************************************/
52#include <stdlib.h> /* malloc, free */
53#include <string.h> /* memset */
54#include <stdio.h> /* fprintf, fopen, ftello64 */
55#include <sys/types.h> /* stat64 */
56#include <sys/stat.h> /* stat64 */
57#include <time.h> /* clock */
58
59#include "mem.h" /* read */
60#include "divsufsort.h"
61#include "dictBuilder.h"
62#include "zstd_compress.c"
63#include "huff0_static.h"
64
65
66/* *************************************
67* Compiler specifics
68***************************************/
69#if !defined(S_ISREG)
70# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
71#endif
72
73#ifdef _MSC_VER
74#define snprintf sprintf_s
75#endif
76
77
78/* *************************************
79* Constants
80***************************************/
81#define KB *(1 <<10)
82#define MB *(1 <<20)
83#define GB *(1U<<30)
84
85#define DICTLISTSIZE 10000
86#define MEMMULT 11
87static const size_t maxMemory = (sizeof(size_t)==4) ? (2 GB - 64 MB) : (size_t)(3 GB) * MEMMULT;
88
89#define NOISELENGTH 32
90#define PRIME1 2654435761U
91#define PRIME2 2246822519U
92
93#define MINRATIO 4
94
95
96/* *************************************
97* console display
98***************************************/
99#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
100#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
Yann Colletb31de732016-01-28 10:30:02 +0100101static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
102void DiB_setNotificationLevel(unsigned l) { g_displayLevel=l; }
Yann Collete618f8e2016-01-28 00:29:58 +0100103
104#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
105 if (DiB_GetMilliSpan(g_time) > refreshRate) \
106 { g_time = clock(); DISPLAY(__VA_ARGS__); \
107 if (g_displayLevel>=4) fflush(stdout); } }
108static const unsigned refreshRate = 300;
109static clock_t g_time = 0;
110
111void DiB_printHex(U32 dlevel, const void* ptr, size_t length)
112{
113 const BYTE* const b = (const BYTE*)ptr;
114 size_t u;
115 for (u=0; u<length; u++)
116 {
117 BYTE c = b[u];
118 if (c<32 || c>126) c = '.'; /* non-printable char */
119 DISPLAYLEVEL(dlevel, "%c", c);
120 }
121}
122
123
124/* *************************************
125* Exceptions
126***************************************/
127#ifndef DEBUG
128# define DEBUG 0
129#endif
130#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
131#define EXM_THROW(error, ...) \
132{ \
133 DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
134 DISPLAYLEVEL(1, "Error %i : ", error); \
135 DISPLAYLEVEL(1, __VA_ARGS__); \
136 DISPLAYLEVEL(1, "\n"); \
137 exit(error); \
138}
139
140
141/* ********************************************************
142* Helper functions
143**********************************************************/
144unsigned DiB_versionNumber (void) { return DiB_VERSION_NUMBER; }
145
146static unsigned DiB_GetMilliSpan(clock_t nPrevious)
147{
148 clock_t nCurrent = clock();
149 unsigned nSpan = (unsigned)(((nCurrent - nPrevious) * 1000) / CLOCKS_PER_SEC);
150 return nSpan;
151}
152
153
154/* ********************************************************
155* File related operations
156**********************************************************/
157static unsigned long long DiB_getFileSize(const char* infilename)
158{
159 int r;
160#if defined(_MSC_VER)
161 struct _stat64 statbuf;
162 r = _stat64(infilename, &statbuf);
163#else
164 struct stat statbuf;
165 r = stat(infilename, &statbuf);
166#endif
167 if (r || !S_ISREG(statbuf.st_mode)) return 0; /* No good... */
168 return (unsigned long long)statbuf.st_size;
169}
170
171
172static unsigned long long DiB_getTotalFileSize(const char** fileNamesTable, unsigned nbFiles)
173{
174 unsigned long long total = 0;
175 unsigned n;
176 for (n=0; n<nbFiles; n++)
177 total += DiB_getFileSize(fileNamesTable[n]);
178 return total;
179}
180
181
182static void DiB_loadFiles(void* buffer, size_t bufferSize,
183 size_t* fileSizes,
184 const char** fileNamesTable, unsigned nbFiles)
185{
186 char* buff = (char*)buffer;
187 size_t pos = 0;
188 unsigned n;
189
190 for (n=0; n<nbFiles; n++) {
191 size_t readSize;
192 unsigned long long fileSize = DiB_getFileSize(fileNamesTable[n]);
193 FILE* f = fopen(fileNamesTable[n], "rb");
194 if (f==NULL) EXM_THROW(10, "impossible to open file %s", fileNamesTable[n]);
195 DISPLAYLEVEL(2, "Loading %s... \r", fileNamesTable[n]);
196 if (fileSize > bufferSize-pos) fileSize = 0; /* stop there, not enough memory to load all files */
197 readSize = fread(buff+pos, 1, (size_t)fileSize, f);
198 if (readSize != (size_t)fileSize) EXM_THROW(11, "could not read %s", fileNamesTable[n]);
199 pos += readSize;
200 fileSizes[n] = (size_t)fileSize;
201 fclose(f);
202 }
203}
204
205
206/*-********************************************************
207* Dictionary training functions
208**********************************************************/
209static size_t DiB_read_ARCH(const void* p) { size_t r; memcpy(&r, p, sizeof(r)); return r; }
210
211static unsigned DiB_NbCommonBytes (register size_t val)
212{
213 if (MEM_isLittleEndian()) {
214 if (MEM_64bits()) {
215# if defined(_MSC_VER) && defined(_WIN64)
216 unsigned long r = 0;
217 _BitScanForward64( &r, (U64)val );
218 return (unsigned)(r>>3);
219# elif defined(__GNUC__) && (__GNUC__ >= 3)
220 return (__builtin_ctzll((U64)val) >> 3);
221# else
222 static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
223 return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
224# endif
225 } else { /* 32 bits */
226# if defined(_MSC_VER)
227 unsigned long r=0;
228 _BitScanForward( &r, (U32)val );
229 return (unsigned)(r>>3);
230# elif defined(__GNUC__) && (__GNUC__ >= 3)
231 return (__builtin_ctz((U32)val) >> 3);
232# else
233 static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
234 return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
235# endif
236 }
237 } else { /* Big Endian CPU */
238 if (MEM_64bits()) {
239# if defined(_MSC_VER) && defined(_WIN64)
240 unsigned long r = 0;
241 _BitScanReverse64( &r, val );
242 return (unsigned)(r>>3);
243# elif defined(__GNUC__) && (__GNUC__ >= 3)
244 return (__builtin_clzll(val) >> 3);
245# else
246 unsigned r;
247 const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
248 if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
249 if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
250 r += (!val);
251 return r;
252# endif
253 } else { /* 32 bits */
254# if defined(_MSC_VER)
255 unsigned long r = 0;
256 _BitScanReverse( &r, (unsigned long)val );
257 return (unsigned)(r>>3);
258# elif defined(__GNUC__) && (__GNUC__ >= 3)
259 return (__builtin_clz((U32)val) >> 3);
260# else
261 unsigned r;
262 if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
263 r += (!val);
264 return r;
265# endif
266 } }
267}
268
269
270/*! DiB_count() :
271 Count the nb of common bytes between 2 pointers.
272 Note : this function presumes end of buffer followed by noisy guard band.
273*/
274static size_t DiB_count(const void* pIn, const void* pMatch)
275{
276 const char* const pStart = (const char*)pIn;
277 for (;;) {
278 size_t diff = DiB_read_ARCH(pMatch) ^ DiB_read_ARCH(pIn);
279 if (!diff) { pIn = (const char*)pIn+sizeof(size_t); pMatch = (const char*)pMatch+sizeof(size_t); continue; }
280 pIn = (const char*)pIn+DiB_NbCommonBytes(diff);
281 return (size_t)((const char*)pIn - pStart);
282 }
283}
284
285
286typedef struct {
287 U32 pos;
288 U32 length;
289 U32 savings;
290} dictItem;
291
292void DiB_initDictItem(dictItem* d)
293{
294 d->pos = 1;
295 d->length = 0;
296 d->savings = (U32)(-1);
297}
298
299
300#define LLIMIT 64 /* heuristic determined experimentally */
301#define MINMATCHLENGTH 7 /* heuristic determined experimentally */
302static dictItem DiB_analyzePos(
303 BYTE* doneMarks,
304 const saidx_t* suffix, U32 start,
305 const void* buffer, U32 minRatio)
306{
307 U32 lengthList[LLIMIT] = {0};
308 U32 cumulLength[LLIMIT] = {0};
309 U32 savings[LLIMIT] = {0};
310 const BYTE* b = (const BYTE*)buffer;
311 size_t length;
312 size_t maxLength = LLIMIT;
313 size_t pos = suffix[start];
314 U32 end = start;
315 dictItem solution;
316
317
318 /* init */
319 memset(&solution, 0, sizeof(solution));
320 doneMarks[pos] = 1;
321
322 /* trivial repetition cases */
323 if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
324 ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
325 ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
326 /* skip and mark segment */
327 U16 u16 = MEM_read16(b+pos+4);
328 U32 u, e = 6;
329 while (MEM_read16(b+pos+e) == u16) e+=2 ;
330 if (b[pos+e] == b[pos+e-1]) e++;
331 for (u=1; u<e; u++)
332 doneMarks[pos+u] = 1;
333 return solution;
334 }
335
336 /* look forward */
337 do {
338 end++;
339 length = DiB_count(b + pos, b + suffix[end]);
340 } while (length >=MINMATCHLENGTH);
341
342 /* look backward */
343 do {
344 length = DiB_count(b + pos, b + *(suffix+start-1));
345 if (length >=MINMATCHLENGTH) start--;
346 } while(length >= MINMATCHLENGTH);
347
348 /* exit if not found a minimum nb of repetitions */
349 if (end-start < minRatio) {
350 U32 idx;
351 for(idx=start; idx<end; idx++)
352 doneMarks[suffix[idx]] = 1;
353 return solution;
354 }
355
356 {
357 int i;
358 U32 searchLength;
359 U32 refinedStart = start;
360 U32 refinedEnd = end;
361
362 DISPLAYLEVEL(4, "\n");
363 DISPLAYLEVEL(4, "found %3u matches of length >= %u at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
364 DISPLAYLEVEL(4, "\n");
365
366 for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
367 BYTE currentChar = 0;
368 U32 currentCount = 0;
369 U32 currentID = refinedStart;
370 U32 id;
371 U32 selectedCount = 0;
372 U32 selectedID = currentID;
373 for (id =refinedStart; id < refinedEnd; id++) {
374 if (b[ suffix[id] + searchLength] != currentChar) {
375 if (currentCount > selectedCount) {
376 selectedCount = currentCount;
377 selectedID = currentID;
378 }
379 currentID = id;
380 currentChar = b[ suffix[id] + searchLength];
381 currentCount = 0;
382 }
383 currentCount ++;
384 }
385 if (currentCount > selectedCount) { /* for last */
386 selectedCount = currentCount;
387 selectedID = currentID;
388 }
389
390 if (selectedCount < minRatio)
391 break;
392 //DISPLAYLEVEL(4, "best char at length %u: %02X (seen %u times) (pos %u) \n", searchLength+1, selectedChar, selectedCount, selectedRef);
393 refinedStart = selectedID;
394 refinedEnd = refinedStart + selectedCount;
395 }
396
397 /* evaluate gain based on new ref */
398 start = refinedStart;
399 pos = suffix[refinedStart];
400 end = start;
401 memset(lengthList, 0, sizeof(lengthList));
402
403 /* look forward */
404 do {
405 end++;
406 length = DiB_count(b + pos, b + suffix[end]);
407 if (length >= LLIMIT) length = LLIMIT-1;
408 lengthList[length]++;
409 } while (length >=MINMATCHLENGTH);
410
411 /* look backward */
412 do {
413 length = DiB_count(b + pos, b + suffix[start-1]);
414 if (length >= LLIMIT) length = LLIMIT-1;
415 lengthList[length]++;
416 if (length >=MINMATCHLENGTH) start--;
417 } while(length >= MINMATCHLENGTH);
418
419 /* largest useful length */
420 memset(cumulLength, 0, sizeof(cumulLength));
421 cumulLength[maxLength-1] = lengthList[maxLength-1];
Yann Collet9cadd082016-01-28 15:39:52 +0100422 for (i=(int)(maxLength-2); i>=0; i--)
Yann Collete618f8e2016-01-28 00:29:58 +0100423 cumulLength[i] = cumulLength[i+1] + lengthList[i];
424
425 for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
426 maxLength = i;
427
428 /* reduce maxLength in case of final into repetitive data */
429 {
Yann Collet9cadd082016-01-28 15:39:52 +0100430 U32 l = (U32)maxLength;
Yann Collete618f8e2016-01-28 00:29:58 +0100431 BYTE c = b[pos + maxLength-1];
432 while (b[pos+l-2]==c) l--;
433 maxLength = l;
434 }
435 if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */
436
437 /* calculate savings */
438 savings[5] = 0;
439 for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
440 savings[i] = savings[i-1] + (lengthList[i] * (i-3));
441
442 DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
443 (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
444
Yann Collet9cadd082016-01-28 15:39:52 +0100445 solution.pos = (U32)pos;
446 solution.length = (U32)maxLength;
Yann Collete618f8e2016-01-28 00:29:58 +0100447 solution.savings = savings[maxLength];
448
449 /* mark positions done */
450 {
451 U32 id;
452 U32 testedPos;
453 for (id=start; id<end; id++) {
454 U32 p, pEnd;
455 testedPos = suffix[id];
456 if (testedPos == pos)
457 length = solution.length;
458 else {
459 length = DiB_count(b+pos, b+testedPos);
460 if (length > solution.length) length = solution.length;
461 }
Yann Collet9cadd082016-01-28 15:39:52 +0100462 pEnd = (U32)(testedPos + length);
Yann Collete618f8e2016-01-28 00:29:58 +0100463 for (p=testedPos; p<pEnd; p++)
464 doneMarks[p] = 1;
465 } } }
466
467 return solution;
468}
469
470
471/*! DiB_checkMerge
472 check if dictItem can be merged, do it if possible
473 @return : id of destination elt, 0 if not merged
474*/
475static U32 DiB_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
476{
477 const U32 tableSize = table->pos;
478 const U32 max = elt.pos + (elt.length-1);
479
480 /* tail overlap */
481 U32 u; for (u=1; u<tableSize; u++) {
482 if (u==eltNbToSkip) continue;
483 if ((table[u].pos > elt.pos) && (table[u].pos < max)) { /* overlap */
484 /* append */
485 U32 addedLength = table[u].pos - elt.pos;
486 table[u].length += addedLength;
487 table[u].pos = elt.pos;
488 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
489 table[u].savings += elt.length / 8; /* rough approx */
490 elt = table[u];
491 while ((u>1) && (table[u-1].savings < elt.savings))
492 table[u] = table[u-1], u--;
493 table[u] = elt;
494 return u;
495 } }
496
497 /* front overlap */
498 for (u=1; u<tableSize; u++) {
499 if (u==eltNbToSkip) continue;
500 if ((table[u].pos + table[u].length > elt.pos) && (table[u].pos < elt.pos)) { /* overlap */
501 /* append */
502 int addedLength = (elt.pos + elt.length) - (table[u].pos + table[u].length);
503 table[u].savings += elt.length / 8; /* rough approx */
504 if (addedLength > 0) { /* otherwise, already included */
505 table[u].length += addedLength;
506 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
507 }
508 elt = table[u];
509 while ((u>1) && (table[u-1].savings < elt.savings))
510 table[u] = table[u-1], u--;
511 table[u] = elt;
512 return u;
513 } }
514
515 return 0;
516}
517
518
519static void DiB_removeDictItem(dictItem* table, U32 id)
520{
521 /* convention : first element is nb of elts */
522 U32 max = table->pos;
523 U32 u;
524 if (!id) return; /* protection, should never happen */
Yann Collet9cadd082016-01-28 15:39:52 +0100525 for (u=id; u<max-1; u++)
Yann Collete618f8e2016-01-28 00:29:58 +0100526 table[u] = table[u+1];
527 table->pos--;
528}
529
530
531static void DiB_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
532{
533 /* merge if possible */
534 U32 mergeId = DiB_checkMerge(table, elt, 0);
535 if (mergeId) {
536 U32 newMerge = 1;
537 while (newMerge) {
538 newMerge = DiB_checkMerge(table, table[mergeId], mergeId);
539 if (newMerge) DiB_removeDictItem(table, mergeId);
540 mergeId = newMerge;
541 }
542 return;
543 }
544
545 /* insert */
546 {
547 U32 current;
548 U32 nextElt = table->pos;
549 if (nextElt >= maxSize) nextElt = maxSize-1;
550 current = nextElt-1;
551 while (table[current].savings < elt.savings) {
552 table[current+1] = table[current];
553 current--;
554 }
555 table[current+1] = elt;
556 table->pos = nextElt+1;
557 }
558}
559
560
561static U32 DiB_dictSize(const dictItem* dictList)
562{
563 U32 u, dictSize = 0;
564 for (u=1; u<dictList[0].pos; u++)
565 dictSize += dictList[u].length;
566 return dictSize;
567}
568
569
570static void DiB_trainBuffer(dictItem* dictList, U32 dictListSize,
571 const void* const buffer, const size_t bufferSize, /* buffer must end with noisy guard band */
572 const char* displayName,
Yann Collet9cadd082016-01-28 15:39:52 +0100573 const size_t* fileSizes, unsigned nbFiles, unsigned maxDictSize,
Yann Collete618f8e2016-01-28 00:29:58 +0100574 U32 shiftRatio)
575{
576 saidx_t* const suffix0 = (saidx_t*)malloc((bufferSize+2)*sizeof(*suffix0));
577 saidx_t* const suffix = suffix0+1;
578 U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
579 BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
580 U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
581 U32 minRatio = nbFiles >> shiftRatio;
582 saint_t errorCode;
583
584 /* init */
585 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
586 if (!suffix0 || !reverseSuffix || !doneMarks || !filePos)
587 EXM_THROW(1, "not enough memory for DiB_trainBuffer");
588 if (minRatio < MINRATIO) minRatio = MINRATIO;
589 memset(doneMarks, 0, bufferSize+16);
590
591 /* sort */
592 DISPLAYLEVEL(2, "sorting %s ...\n", displayName);
Yann Collet9cadd082016-01-28 15:39:52 +0100593 errorCode = divsufsort((const sauchar_t*)buffer, suffix, (saidx_t)bufferSize);
Yann Collete618f8e2016-01-28 00:29:58 +0100594 if (errorCode != 0) EXM_THROW(2, "sort failed");
Yann Collet9cadd082016-01-28 15:39:52 +0100595 suffix[bufferSize] = (saidx_t)bufferSize; /* leads into noise */
596 suffix0[0] = (saidx_t)bufferSize; /* leads into noise */
Yann Collete618f8e2016-01-28 00:29:58 +0100597 {
598 /* build reverse suffix sort */
599 size_t pos;
600 for (pos=0; pos < bufferSize; pos++)
Yann Collet9cadd082016-01-28 15:39:52 +0100601 reverseSuffix[suffix[pos]] = (U32)pos;
Yann Collete618f8e2016-01-28 00:29:58 +0100602 /* build file pos */
603 filePos[0] = 0;
604 for (pos=1; pos<nbFiles; pos++)
Yann Collet9cadd082016-01-28 15:39:52 +0100605 filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
Yann Collete618f8e2016-01-28 00:29:58 +0100606 }
607
608 DISPLAYLEVEL(2, "finding patterns ... \n");
609 DISPLAYLEVEL(4, "minimum ratio : %u \n", minRatio);
610
611 {
612 U32 cursor; for (cursor=0; cursor < bufferSize; ) {
613 dictItem solution;
614
615 if (doneMarks[cursor]) { cursor++; continue; }
616 solution = DiB_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
617 if (solution.length==0) { cursor++; continue; }
618 DiB_insertDictItem(dictList, dictListSize, solution);
619 cursor += solution.length;
620 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
621 }
622
623#if 0
624 /* 2nd scan */
625 for (cursor=0; cursor < bufferSize; cursor++ )
626 {
627 dictItem solution;
628
629 if (doneMarks[cursor]) continue;
630 solution = DiB_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
631 if (solution.length==0) continue;
632 DiB_insertDictItem(dictList, dictListSize, solution);
633 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
634 }
635#endif
636 }
637
638 /* limit dictionary size */
639 {
640 U32 max = dictList->pos; /* convention : nb of useful elts within dictList */
641 U32 currentSize = 0;
642 U32 n; for (n=1; n<max; n++) {
643 currentSize += dictList[n].length;
644 if (currentSize > maxDictSize) break;
645 }
646 dictList->pos = n;
647 }
648
649 free(suffix0);
650 free(reverseSuffix);
651 free(doneMarks);
652 free(filePos);
653}
654
655
656static size_t DiB_findMaxMem(unsigned long long requiredMem)
657{
658 size_t step = 8 MB;
659 void* testmem = NULL;
660
661 requiredMem = (((requiredMem >> 23) + 1) << 23);
662 requiredMem += 2 * step;
663 if (requiredMem > maxMemory) requiredMem = maxMemory;
664
665 while (!testmem) {
666 requiredMem -= step;
667 testmem = malloc((size_t)requiredMem);
668 }
669
670 free(testmem);
671 return (size_t)(requiredMem - step);
672}
673
674
675static void DiB_fillNoise(void* buffer, size_t length)
676{
677 unsigned acc = PRIME1;
678 size_t p=0;;
679
680 for (p=0; p<length; p++) {
681 acc *= PRIME2;
682 ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
683 }
684}
685
686
687typedef struct
688{
689 ZSTD_CCtx* ref;
690 ZSTD_CCtx* zc;
691 void* workPlace; /* must be BLOCKSIZE allocated */
692} EStats_ress_t;
693
694
695static void DiB_countEStats(EStats_ress_t esr,
696 U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount,
697 const void* src, size_t srcSize)
698{
699 const BYTE* bytePtr;
700 const U32* u32Ptr;
701
702 if (srcSize > BLOCKSIZE) srcSize = BLOCKSIZE; /* protection vs large samples */
703 ZSTD_copyCCtx(esr.zc, esr.ref);
704 ZSTD_compressBlock(esr.zc, esr.workPlace, BLOCKSIZE, src, srcSize);
705
706 /* count stats */
707 for(bytePtr = esr.zc->seqStore.litStart; bytePtr < esr.zc->seqStore.lit; bytePtr++)
708 countLit[*bytePtr]++;
709 for(u32Ptr = esr.zc->seqStore.offsetStart; u32Ptr < esr.zc->seqStore.offset; u32Ptr++) {
710 BYTE offcode = (BYTE)ZSTD_highbit(*u32Ptr) + 1;
711 if (*u32Ptr==0) offcode=0;
712 offsetcodeCount[offcode]++;
713 }
714 for(bytePtr = esr.zc->seqStore.matchLengthStart; bytePtr < esr.zc->seqStore.matchLength; bytePtr++)
715 matchlengthCount[*bytePtr]++;
716 for(bytePtr = esr.zc->seqStore.litLengthStart; bytePtr < esr.zc->seqStore.litLength; bytePtr++)
717 litlengthCount[*bytePtr]++;
718}
719
720
721#define OFFCODE_MAX 18
722static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
723 const void* srcBuffer, size_t* fileSizes, unsigned nbFiles,
724 const void* dictBuffer, size_t dictBufferSize)
725{
726 U32 countLit[256];
727 U32 offcodeCount[MaxOff+1];
728 HUF_CREATE_STATIC_CTABLE(hufTable, 255);
729 short offcodeNCount[MaxOff+1];
730 U32 matchLengthCount[MaxML+1];
731 short matchLengthNCount[MaxML+1];
732 U32 litlengthCount[MaxLL+1];
733 short litlengthNCount[MaxLL+1];
734 EStats_ress_t esr;
735 ZSTD_parameters params;
736 U32 u, huffLog = 12, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
737 size_t pos = 0, errorCode;
738 size_t eSize = 0;
739
740 /* init */
741 for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
742 for (u=0; u<=OFFCODE_MAX; u++) offcodeCount[u]=1;
743 for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
744 for (u=0; u<=MaxLL; u++) litlengthCount[u]=1;
745 esr.ref = ZSTD_createCCtx();
746 esr.zc = ZSTD_createCCtx();
747 esr.workPlace = malloc(BLOCKSIZE);
748 if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory");
749 params = ZSTD_getParams(5, dictBufferSize + 15 KB);
750 params.strategy = ZSTD_greedy;
751 ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params);
752
753 /* collect stats on all files */
754 for (u=0; u<nbFiles; u++) {
755 DiB_countEStats(esr,
756 countLit, offcodeCount, matchLengthCount, litlengthCount,
757 (const char*)srcBuffer + pos, fileSizes[u]);
758 pos += fileSizes[u];
759 }
760
761 /* analyze */
762 errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
763 if (HUF_isError(errorCode)) EXM_THROW(31, "HUF_buildCTable error");
764 huffLog = (U32)errorCode;
765
766 total=0; for (u=0; u<=OFFCODE_MAX; u++) total+=offcodeCount[u];
767 errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, OFFCODE_MAX);
768 if (FSE_isError(errorCode)) EXM_THROW(32, "FSE_normalizeCount error with offcodeCount");
769 Offlog = (U32)errorCode;
770
771 total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
772 errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
773 if (FSE_isError(errorCode)) EXM_THROW(33, "FSE_normalizeCount error with matchLengthCount");
774 mlLog = (U32)errorCode;
775
776 total=0; for (u=0; u<=MaxLL; u++) total+=litlengthCount[u];
777 errorCode = FSE_normalizeCount(litlengthNCount, llLog, litlengthCount, total, MaxLL);
778 if (FSE_isError(errorCode)) EXM_THROW(34, "FSE_normalizeCount error with litlengthCount");
779 llLog = (U32)errorCode;
780
781 /* write result to buffer */
782 errorCode = HUF_writeCTable(dstBuffer, maxDstSize, hufTable, 255, huffLog);
783 if (HUF_isError(errorCode)) EXM_THROW(41, "HUF_writeCTable error");
784 dstBuffer = (char*)dstBuffer + errorCode;
785 maxDstSize -= errorCode;
786 eSize += errorCode;
787
788 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
789 if (FSE_isError(errorCode)) EXM_THROW(42, "FSE_writeNCount error with offcodeNCount");
790 dstBuffer = (char*)dstBuffer + errorCode;
791 maxDstSize -= errorCode;
792 eSize += errorCode;
793
794 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, matchLengthNCount, MaxML, mlLog);
795 if (FSE_isError(errorCode)) EXM_THROW(43, "FSE_writeNCount error with matchLengthNCount");
796 dstBuffer = (char*)dstBuffer + errorCode;
797 maxDstSize -= errorCode;
798 eSize += errorCode;
799
800 errorCode = FSE_writeNCount(dstBuffer, maxDstSize, litlengthNCount, MaxLL, llLog);
801 if (FSE_isError(errorCode)) EXM_THROW(43, "FSE_writeNCount error with litlengthNCount");
802 dstBuffer = (char*)dstBuffer + errorCode;
803 maxDstSize -= errorCode;
804 eSize += errorCode;
805
806 /* clean */
807 ZSTD_freeCCtx(esr.ref);
808 ZSTD_freeCCtx(esr.zc);
809 free(esr.workPlace);
810
811 return eSize;
812}
813
814
815static void DiB_saveDict(const char* dictFileName,
816 const void* buff1, size_t buff1Size,
817 const void* buff2, size_t buff2Size)
818{
819 FILE* f;
820 size_t n;
821
822 f = fopen(dictFileName, "wb");
823 if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
824
825 n = fwrite(buff1, 1, buff1Size, f);
826 if (n!=buff1Size) EXM_THROW(4, "%s : write error", dictFileName)
827
828 n = fwrite(buff2, 1, buff2Size, f);
829 if (n!=buff2Size) EXM_THROW(4, "%s : write error", dictFileName)
830
831 n = (size_t)fclose(f);
832 if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName)
833}
834
835
Yann Colletb31de732016-01-28 10:30:02 +0100836int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned shiftRatio,
837 const char** fileNamesTable, unsigned nbFiles)
Yann Collete618f8e2016-01-28 00:29:58 +0100838{
839 void* srcBuffer;
840 size_t benchedSize;
841 size_t* fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
842 unsigned long long totalSizeToLoad = DiB_getTotalFileSize(fileNamesTable, nbFiles);
843 const U32 dictListSize = DICTLISTSIZE;
844 dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
845 char mfName[20] = {0};
846 const char* displayName = NULL;
847
848 /* init */
849 benchedSize = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
850 if ((unsigned long long)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
851 if (benchedSize < totalSizeToLoad)
852 DISPLAY("Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
853
854 /* Memory allocation & restrictions */
855 srcBuffer = malloc(benchedSize+NOISELENGTH); /* + noise */
856 if ((!fileSizes) || (!srcBuffer) || (!dictList)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
857 DiB_initDictItem(dictList);
858
859 /* Load input buffer */
860 DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
861 DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* for end of buffer condition */
862
863 /* Train */
864 snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
865 if (nbFiles > 1) displayName = mfName;
866 else displayName = fileNamesTable[0];
867
868 DiB_trainBuffer(dictList, dictListSize,
869 srcBuffer, benchedSize,
870 displayName,
871 fileSizes, nbFiles, maxDictSize,
872 shiftRatio);
873
874 /* display best matches */
875 if (g_displayLevel>= 3) {
876 const U32 nb = 25;
877 U32 u;
878 U32 dictContentSize = DiB_dictSize(dictList);
879 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
880 DISPLAYLEVEL(3, "list %u best segments \n", nb);
881 for (u=1; u<=nb; u++) {
882 U32 p = dictList[u].pos;
883 U32 l = dictList[u].length;
884 U32 d = MIN(40, l);
885 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
886 u, l, p, dictList[u].savings);
887 DiB_printHex(3, (char*)srcBuffer+p, d);
888 DISPLAYLEVEL(3, "| \n");
889 } }
890
891 /* create dictionary */
892 {
893 void* dictContent;
894 U32 dictContentSize = DiB_dictSize(dictList);
895 void* dictHeader;
896 size_t dictHeaderSize, hSize;
897 BYTE* ptr;
898 U32 u;
899
900 /* build dict */
901 #define EBSIZE (2 KB)
902 dictHeaderSize = EBSIZE;
903 dictHeader = malloc(dictHeaderSize);
904 dictContent = malloc(dictContentSize);
905 if (!dictHeader || !dictContent) EXM_THROW(2, "not enough memory");
906
907 /* build dict content */
908 ptr = (BYTE*)dictContent + dictContentSize;
909
910 for (u=1; u<dictList->pos; u++) {
911 U32 l = dictList[u].length;
912 ptr -= l;
913 memcpy(ptr, (char*)srcBuffer+dictList[u].pos, l);
914 }
915
916 /* dictionary header */
917 MEM_writeLE32(dictHeader, ZSTD_DICT_MAGIC);
918 hSize = 4;
919 dictHeaderSize -= 4;
920
921 /* entropic tables */
922 DISPLAYLEVEL(2, "statistics ... \n");
923 hSize += DiB_analyzeEntropy((char*)dictHeader+4, dictHeaderSize,
924 srcBuffer, fileSizes, nbFiles,
925 dictContent, dictContentSize);
926
927 /* save dict */
928 {
929 size_t dictSize = hSize + dictContentSize;
930 DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
931 DiB_saveDict(dictFileName, dictHeader, hSize, dictContent, dictContentSize);
932 //DiB_saveDict(dictFileName, NULL, 0, dictContent, dictContentSize); // content only
933 }
934 /* clean */
935 free(dictHeader);
936 free(dictContent);
937 }
938
939 /* clean up */
940 free(srcBuffer);
941 free(fileSizes);
942 free(dictList);
943 return 0;
944}
945