blob: 04860dbbfa6726f43e9ab8792bf580c8a20874b3 [file] [log] [blame]
Yann Collet32fb4072017-08-18 16:52:05 -07001/*
Nick Terrella4943082021-03-29 14:23:36 -07002 * Copyright (c) Yann Collet, Facebook, Inc.
Yann Collet4ded9e52016-08-30 10:04:33 -07003 * All rights reserved.
4 *
Yann Collet32fb4072017-08-18 16:52:05 -07005 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
Yann Collet3128e032017-09-08 00:09:23 -07008 * You may select, at your option, one of the above-listed licenses.
Yann Collet4ded9e52016-08-30 10:04:33 -07009 */
Yann Collet71eafdd2016-02-12 02:31:57 +010010
Yann Collet71eafdd2016-02-12 02:31:57 +010011
Yann Collet71eafdd2016-02-12 02:31:57 +010012
Przemyslaw Skibinski2f6ccee2016-12-21 13:23:34 +010013/* **************************************
14* Compiler Warnings
15****************************************/
16#ifdef _MSC_VER
Yann Collet77c137b2017-09-14 15:12:57 -070017# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
Przemyslaw Skibinski2f6ccee2016-12-21 13:23:34 +010018#endif
19
20
Yann Collet71eafdd2016-02-12 02:31:57 +010021/*-*************************************
22* Includes
23***************************************/
Przemyslaw Skibinski7a8a03c2016-12-21 15:08:44 +010024#include "platform.h" /* Large Files support */
Przemyslaw Skibinski2f6ccee2016-12-21 13:23:34 +010025#include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */
Yann Collet71eafdd2016-02-12 02:31:57 +010026#include <stdlib.h> /* malloc, free */
27#include <string.h> /* memset */
28#include <stdio.h> /* fprintf, fopen, ftello64 */
Yann Colleta3d03a32016-07-06 16:27:17 +020029#include <errno.h> /* errno */
Yann Collet42a02ab2018-08-15 14:35:38 -070030#include <assert.h>
Yann Collet71eafdd2016-02-12 02:31:57 +010031
Yann Collet59a71162019-04-10 12:37:03 -070032#include "timefn.h" /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
W. Felix Handte7dcca6b2020-05-01 16:20:40 -040033#include "../lib/common/mem.h" /* read */
34#include "../lib/common/error_private.h"
inikep23a08892016-04-22 12:43:18 +020035#include "dibio.h"
Yann Collet71eafdd2016-02-12 02:31:57 +010036
37
38/*-*************************************
39* Constants
40***************************************/
/* Size-unit suffixes : written right after a number, e.g. `128 KB` expands to `128 *(1 <<10)` */
#define KB *(1 <<10)
#define MB *(1 <<20)
#define GB *(1U<<30)

/* largest amount of bytes loaded for a single sample */
#define SAMPLESIZE_MAX (128 KB)
/* rough estimations : memory cost to analyze 1 byte of sample, per trainer */
#define MEMMULT 11              /* legacy trainer */
#define COVER_MEMMULT 9         /* cover trainer */
#define FASTCOVER_MEMMULT 1     /* fastCover trainer */
/* upper bound applied to the allocation probe in DiB_findMaxMem() ;
 * the value depends on the width of size_t */
static const size_t g_maxMemory =
        (sizeof(size_t) == 4) ? (2 GB - 64 MB)
                              : ((size_t)(512 MB) << sizeof(size_t));

/* size of the guard band reserved after the loaded samples */
#define NOISELENGTH 32
/* training dataset limited to 2GB */
#define MAX_SAMPLES_SIZE (2 GB)
54
55/*-*************************************
56* Console display
57***************************************/
58#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
Yann Collet086b9592017-09-14 16:45:10 -070059#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
Yann Collet71eafdd2016-02-12 02:31:57 +010060
Nick Terrell9a2f6f42017-11-29 19:11:12 -080061static const U64 g_refreshRate = SEC_TO_MICRO / 6;
62static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
Yann Colletf6ca09b2016-05-09 04:44:45 +020063
Nick Terrell9a2f6f42017-11-29 19:11:12 -080064#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
65 if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
66 { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
67 if (displayLevel>=4) fflush(stderr); } } }
Yann Collet71eafdd2016-02-12 02:31:57 +010068
69/*-*************************************
70* Exceptions
71***************************************/
/* DEBUG : set to non-zero at build time to also print where an error was raised */
#ifndef DEBUG
#  define DEBUG 0
#endif
/* DEBUGOUTPUT() : message printed only when DEBUG is non-zero */
#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
/* EXM_THROW() : print "Error <error> : <formatted message>" then terminate the
 * process with exit(error). Expands to a brace-enclosed block. */
#define EXM_THROW(error, ...)                                                 \
{                                                                             \
    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__);     \
    DISPLAY("Error %i : ", error);                                            \
    DISPLAY(__VA_ARGS__);                                                     \
    DISPLAY("\n");                                                            \
    exit(error);                                                              \
}
84
85
86/* ********************************************************
87* Helper functions
88**********************************************************/
/* MIN() : smaller of two values.
 * Note : the selected argument is evaluated twice, so avoid side effects. */
#undef MIN
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
Yann Colletbcb5f772016-07-06 15:41:03 +020091
stanjo7452598d52021-10-04 17:47:52 -070092/**
93 Returns the size of a file.
94 If error returns -1.
95*/
96static S64 DiB_getFileSize (const char * fileName)
97{
98 U64 const fileSize = UTIL_getFileSize(fileName);
99 return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
100}
Yann Collet71eafdd2016-02-12 02:31:57 +0100101
102/* ********************************************************
103* File related operations
104**********************************************************/
Yann Collet290aaa72016-05-30 21:18:52 +0200105/** DiB_loadFiles() :
Yann Colletc68d17f2017-09-15 15:31:31 -0700106 * load samples from files listed in fileNamesTable into buffer.
107 * works even if buffer is too small to load all samples.
108 * Also provides the size of each sample into sampleSizes table
109 * which must be sized correctly, using DiB_fileStats().
110 * @return : nb of samples effectively loaded into `buffer`
111 * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
112 * sampleSizes is filled with the size of each sample.
113 */
stanjo7452598d52021-10-04 17:47:52 -0700114static int DiB_loadFiles(
115 void* buffer, size_t* bufferSizePtr,
116 size_t* sampleSizes, int sstSize,
117 const char** fileNamesTable, int nbFiles,
118 size_t targetChunkSize, int displayLevel )
Yann Collet71eafdd2016-02-12 02:31:57 +0100119{
Yann Collet290aaa72016-05-30 21:18:52 +0200120 char* const buff = (char*)buffer;
stanjo7452598d52021-10-04 17:47:52 -0700121 size_t totalDataLoaded = 0;
122 int nbSamplesLoaded = 0;
123 int fileIndex = 0;
124 FILE * f = NULL;
Yann Collet71eafdd2016-02-12 02:31:57 +0100125
stanjo7452598d52021-10-04 17:47:52 -0700126 assert(targetChunkSize <= SAMPLESIZE_MAX);
127
128 while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
129 size_t fileDataLoaded;
130 S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
131 if (fileSize <= 0) /* skip if zero-size or file error */
132 continue;
133
134 f = fopen( fileNamesTable[fileIndex], "rb");
135 if (f == NULL)
136 EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
137 DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[fileIndex]);
138
139 /* Load the first chunk of data from the file */
140 fileDataLoaded = targetChunkSize > 0 ?
141 (size_t)MIN(fileSize, (S64)targetChunkSize) :
142 (size_t)MIN(fileSize, SAMPLESIZE_MAX );
143 if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
144 break;
145 if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
146 EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
147 sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
148 totalDataLoaded += fileDataLoaded;
149
150 /* If file-chunking is enabled, load the rest of the file as more samples */
151 if (targetChunkSize > 0) {
152 while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
153 size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
154 if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
Yann Colletc68d17f2017-09-15 15:31:31 -0700155 break;
stanjo7452598d52021-10-04 17:47:52 -0700156
157 if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
158 EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
159 sampleSizes[nbSamplesLoaded++] = chunkSize;
160 totalDataLoaded += chunkSize;
161 fileDataLoaded += chunkSize;
162 }
163 }
164 fileIndex += 1;
165 fclose(f); f = NULL;
Yann Collet086b9592017-09-14 16:45:10 -0700166 }
stanjo7452598d52021-10-04 17:47:52 -0700167 if (f != NULL)
168 fclose(f);
169
Nick Terrelldf8415c2016-12-31 21:08:24 -0800170 DISPLAYLEVEL(2, "\r%79s\r", "");
stanjo7452598d52021-10-04 17:47:52 -0700171 DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
172 (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded );
173 *bufferSizePtr = totalDataLoaded;
174 return nbSamplesLoaded;
Yann Collet71eafdd2016-02-12 02:31:57 +0100175}
176
Nick Terrelldf8415c2016-12-31 21:08:24 -0800177#define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
178static U32 DiB_rand(U32* src)
179{
180 static const U32 prime1 = 2654435761U;
181 static const U32 prime2 = 2246822519U;
182 U32 rand32 = *src;
183 rand32 *= prime1;
184 rand32 ^= prime2;
185 rand32 = DiB_rotl32(rand32, 13);
186 *src = rand32;
187 return rand32 >> 5;
188}
189
Yann Collet77c137b2017-09-14 15:12:57 -0700190/* DiB_shuffle() :
191 * shuffle a table of file names in a semi-random way
192 * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
193 * it will load random elements from it, instead of just the first ones. */
Nick Terrelldf8415c2016-12-31 21:08:24 -0800194static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
Yann Collet77c137b2017-09-14 15:12:57 -0700195 U32 seed = 0xFD2FB528;
196 unsigned i;
Yann Collet42a02ab2018-08-15 14:35:38 -0700197 assert(nbFiles >= 1);
Yann Collet77c137b2017-09-14 15:12:57 -0700198 for (i = nbFiles - 1; i > 0; --i) {
199 unsigned const j = DiB_rand(&seed) % (i + 1);
200 const char* const tmp = fileNamesTable[j];
201 fileNamesTable[j] = fileNamesTable[i];
202 fileNamesTable[i] = tmp;
203 }
Nick Terrelldf8415c2016-12-31 21:08:24 -0800204}
205
Yann Collet71eafdd2016-02-12 02:31:57 +0100206
207/*-********************************************************
208* Dictionary training functions
209**********************************************************/
210static size_t DiB_findMaxMem(unsigned long long requiredMem)
211{
Yann Collet290aaa72016-05-30 21:18:52 +0200212 size_t const step = 8 MB;
Yann Collet71eafdd2016-02-12 02:31:57 +0100213 void* testmem = NULL;
214
215 requiredMem = (((requiredMem >> 23) + 1) << 23);
Yann Colletbcb5f772016-07-06 15:41:03 +0200216 requiredMem += step;
Yann Collet77c137b2017-09-14 15:12:57 -0700217 if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
Yann Collet71eafdd2016-02-12 02:31:57 +0100218
219 while (!testmem) {
Yann Collet71eafdd2016-02-12 02:31:57 +0100220 testmem = malloc((size_t)requiredMem);
Yann Colletbcb5f772016-07-06 15:41:03 +0200221 requiredMem -= step;
Yann Collet71eafdd2016-02-12 02:31:57 +0100222 }
223
224 free(testmem);
Yann Colletbcb5f772016-07-06 15:41:03 +0200225 return (size_t)requiredMem;
Yann Collet71eafdd2016-02-12 02:31:57 +0100226}
227
228
/* DiB_fillNoise() :
 * Fill the first `length` bytes of `buffer` with a fixed pseudo-random sequence :
 * an accumulator is multiplied by a constant at each step,
 * and bits 21 to 28 of the result provide the next byte.
 * The sequence is always the same. Nothing is written when length == 0. */
static void DiB_fillNoise(void* buffer, size_t length)
{
    unsigned const seed = 2654435761U;
    unsigned const multiplier = 2246822519U;
    unsigned char* const out = (unsigned char*)buffer;
    unsigned state = seed;
    size_t pos = 0;

    while (pos < length) {
        state *= multiplier;
        out[pos] = (unsigned char)(state >> 21);
        pos++;
    }
}
241
242
/* DiB_saveDict() :
 * Write `buffSize` bytes from `buff` into a file named `dictFileName`,
 * opened in "wb" mode.
 * Any failure terminates the process through EXM_THROW() :
 * code 3 when the file cannot be opened, 4 on incomplete write,
 * 5 when closing the file reports an error. */
static void DiB_saveDict(const char* dictFileName,
                         const void* buff, size_t buffSize)
{
    FILE* const out = fopen(dictFileName, "wb");
    if (out == NULL) {
        EXM_THROW(3, "cannot open %s ", dictFileName);
    }

    if (fwrite(buff, 1, buffSize, out) != buffSize) {
        EXM_THROW(4, "%s : write error", dictFileName);
    }

    /* closing flushes pending data : a failure here means the file is incomplete */
    if (fclose(out) != 0) {
        EXM_THROW(5, "%s : flush error", dictFileName);
    }
}
255
Yann Collet086b9592017-09-14 16:45:10 -0700256typedef struct {
stanjo7452598d52021-10-04 17:47:52 -0700257 S64 totalSizeToLoad;
258 int nbSamples;
259 int oneSampleTooLarge;
Yann Collet086b9592017-09-14 16:45:10 -0700260} fileStats;
261
Yann Colletc68d17f2017-09-15 15:31:31 -0700262/*! DiB_fileStats() :
263 * Given a list of files, and a chunkSize (0 == no chunk, whole files)
264 * provides the amount of data to be loaded and the resulting nb of samples.
265 * This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
266 */
stanjo7452598d52021-10-04 17:47:52 -0700267static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
Yann Collet1496c3d2016-12-18 11:58:23 +0100268{
Yann Collet086b9592017-09-14 16:45:10 -0700269 fileStats fs;
stanjo7452598d52021-10-04 17:47:52 -0700270 int n;
Yann Collet086b9592017-09-14 16:45:10 -0700271 memset(&fs, 0, sizeof(fs));
stanjo7452598d52021-10-04 17:47:52 -0700272
Dimitris Apostolouebbd6752021-11-13 10:04:04 +0200273 // We assume that if chunking is requested, the chunk size is < SAMPLESIZE_MAX
stanjo7452598d52021-10-04 17:47:52 -0700274 assert( chunkSize <= SAMPLESIZE_MAX );
275
Yann Collet1496c3d2016-12-18 11:58:23 +0100276 for (n=0; n<nbFiles; n++) {
stanjo7452598d52021-10-04 17:47:52 -0700277 S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
278 // TODO: is there a minimum sample size? What if the file is 1-byte?
279 if (fileSize == 0) {
280 DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
281 continue;
282 }
283
284 /* the case where we are breaking up files in sample chunks */
285 if (chunkSize > 0)
286 {
287 // TODO: is there a minimum sample size? Can we have a 1-byte sample?
288 fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
289 fs.totalSizeToLoad += fileSize;
290 }
291 else {
292 /* the case where one file is one sample */
293 if (fileSize > SAMPLESIZE_MAX) {
294 /* flag excessively large sample files */
295 fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);
296
297 /* Limit to the first SAMPLESIZE_MAX (128kB) of the file */
298 DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB",
299 fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
300 }
301 fs.nbSamples += 1;
302 fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
303 }
Yann Collet1496c3d2016-12-18 11:58:23 +0100304 }
stanjo7452598d52021-10-04 17:47:52 -0700305 DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
Yann Collet086b9592017-09-14 16:45:10 -0700306 return fs;
Yann Collet1496c3d2016-12-18 11:58:23 +0100307}
308
stanjo7452598d52021-10-04 17:47:52 -0700309int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
310 const char** fileNamesTable, int nbFiles, size_t chunkSize,
Jennifer Liu9d6ed9d2018-08-23 12:06:20 -0700311 ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
Elliot Gorokhovsky71c0c072021-12-10 16:19:40 -0500312 ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
Yann Collet71eafdd2016-02-12 02:31:57 +0100313{
stanjo7452598d52021-10-04 17:47:52 -0700314 fileStats fs;
315 size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
316 int nbSamplesLoaded; /* nb of samples effectively loaded in srcBuffer */
317 size_t loadedSize; /* total data loaded in srcBuffer for all samples */
318 void* srcBuffer /* contiguous buffer with training data/samples */;
Yann Collet290aaa72016-05-30 21:18:52 +0200319 void* const dictBuffer = malloc(maxDictSize);
Yann Collet71eafdd2016-02-12 02:31:57 +0100320 int result = 0;
321
stanjo7452598d52021-10-04 17:47:52 -0700322 int const displayLevel = params ? params->zParams.notificationLevel :
323 coverParams ? coverParams->zParams.notificationLevel :
324 fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;
325
326 /* Shuffle input files before we start assessing how much sample datA to load.
327 The purpose of the shuffle is to pick random samples when the sample
328 set is larger than what we can load in memory. */
329 DISPLAYLEVEL(3, "Shuffling input files\n");
330 DiB_shuffle(fileNamesTable, nbFiles);
331
332 /* Figure out how much sample data to load with how many samples */
333 fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
334
335 {
336 int const memMult = params ? MEMMULT :
337 coverParams ? COVER_MEMMULT:
338 FASTCOVER_MEMMULT;
339 size_t const maxMem = DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
340 /* Limit the size of the training data to the free memory */
341 /* Limit the size of the training data to 2GB */
Dimitris Apostolouebbd6752021-11-13 10:04:04 +0200342 /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
stanjo7452598d52021-10-04 17:47:52 -0700343 loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
Elliot Gorokhovsky71c0c072021-12-10 16:19:40 -0500344 if (memLimit != 0) {
345 DISPLAYLEVEL(2, "! Warning : setting manual memory limit for dictionary training data at %u MB \n",
346 (unsigned)(memLimit / (1 MB)));
347 loadedSize = (size_t)MIN(loadedSize, memLimit);
348 }
stanjo7452598d52021-10-04 17:47:52 -0700349 srcBuffer = malloc(loadedSize+NOISELENGTH);
350 sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
351 }
352
Yann Collet290aaa72016-05-30 21:18:52 +0200353 /* Checks */
Yann Colletc68d17f2017-09-15 15:31:31 -0700354 if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
Yann Collet77c137b2017-09-14 15:12:57 -0700355 EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
Yann Collet086b9592017-09-14 16:45:10 -0700356 if (fs.oneSampleTooLarge) {
357 DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n");
358 DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n");
359 DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
Yann Collet1496c3d2016-12-18 11:58:23 +0100360 }
Yann Colletc68d17f2017-09-15 15:31:31 -0700361 if (fs.nbSamples < 5) {
Yann Collet49d105c2016-08-18 15:02:11 +0200362 DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n");
363 DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n");
Yann Collet17220552017-09-15 16:23:50 -0700364 DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
Yann Collet086b9592017-09-14 16:45:10 -0700365 EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */
366 }
stanjo7452598d52021-10-04 17:47:52 -0700367 if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
Yann Collet086b9592017-09-14 16:45:10 -0700368 DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n");
369 DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n");
Yann Colletdd25a272016-07-27 12:35:29 +0200370 }
Yann Collet290aaa72016-05-30 21:18:52 +0200371
Yann Collet71eafdd2016-02-12 02:31:57 +0100372 /* init */
stanjo7452598d52021-10-04 17:47:52 -0700373 if ((S64)loadedSize < fs.totalSizeToLoad)
374 DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
375 (unsigned)(fs.totalSizeToLoad / (1 MB)),
376 (unsigned)(loadedSize / (1 MB)));
Yann Collet71eafdd2016-02-12 02:31:57 +0100377
Yann Collet71eafdd2016-02-12 02:31:57 +0100378 /* Load input buffer */
stanjo7452598d52021-10-04 17:47:52 -0700379 nbSamplesLoaded = DiB_loadFiles(
380 srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
381 nbFiles, chunkSize, displayLevel);
Yann Collet71eafdd2016-02-12 02:31:57 +0100382
Yann Collet77c137b2017-09-14 15:12:57 -0700383 { size_t dictSize;
Nick Terrelldf8415c2016-12-31 21:08:24 -0800384 if (params) {
Yann Collet086b9592017-09-14 16:45:10 -0700385 DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH); /* guard band, for end of buffer condition */
Yann Collet890d85b2021-01-06 16:19:42 -0800386 dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
stanjo7452598d52021-10-04 17:47:52 -0700387 srcBuffer, sampleSizes, nbSamplesLoaded,
Yann Collet890d85b2021-01-06 16:19:42 -0800388 *params);
Jennifer Liu9d6ed9d2018-08-23 12:06:20 -0700389 } else if (coverParams) {
390 if (optimize) {
391 dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
stanjo7452598d52021-10-04 17:47:52 -0700392 srcBuffer, sampleSizes, nbSamplesLoaded,
Jennifer Liu9d6ed9d2018-08-23 12:06:20 -0700393 coverParams);
394 if (!ZDICT_isError(dictSize)) {
395 unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
396 DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
397 coverParams->steps, splitPercentage);
398 }
399 } else {
400 dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
stanjo7452598d52021-10-04 17:47:52 -0700401 sampleSizes, nbSamplesLoaded, *coverParams);
Nick Terrelldf8415c2016-12-31 21:08:24 -0800402 }
403 } else {
Jennifer Liu9d6ed9d2018-08-23 12:06:20 -0700404 assert(fastCoverParams != NULL);
405 if (optimize) {
406 dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
stanjo7452598d52021-10-04 17:47:52 -0700407 srcBuffer, sampleSizes, nbSamplesLoaded,
Jennifer Liu9d6ed9d2018-08-23 12:06:20 -0700408 fastCoverParams);
409 if (!ZDICT_isError(dictSize)) {
410 unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
411 DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
412 fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
413 fastCoverParams->accel);
414 }
415 } else {
416 dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
stanjo7452598d52021-10-04 17:47:52 -0700417 sampleSizes, nbSamplesLoaded, *fastCoverParams);
Jennifer Liu9d6ed9d2018-08-23 12:06:20 -0700418 }
Nick Terrelldf8415c2016-12-31 21:08:24 -0800419 }
Yann Collet290aaa72016-05-30 21:18:52 +0200420 if (ZDICT_isError(dictSize)) {
421 DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */
422 result = 1;
423 goto _cleanup;
424 }
425 /* save dict */
Yann Colletededcfc2018-12-21 16:19:44 -0800426 DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
Yann Collet290aaa72016-05-30 21:18:52 +0200427 DiB_saveDict(dictFileName, dictBuffer, dictSize);
Yann Collet71eafdd2016-02-12 02:31:57 +0100428 }
429
Yann Collet71eafdd2016-02-12 02:31:57 +0100430 /* clean up */
431_cleanup:
432 free(srcBuffer);
Yann Colletc68d17f2017-09-15 15:31:31 -0700433 free(sampleSizes);
Yann Collet71eafdd2016-02-12 02:31:57 +0100434 free(dictBuffer);
Yann Collet71eafdd2016-02-12 02:31:57 +0100435 return result;
436}