blob: 4d147e11279c3fa5f67a09d3c9b4e4883730c6e2 [file] [log] [blame]
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +02001#include "./sieve.h"
2
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -05003/* Pointer to position in (combined corpus) text. */
4typedef uint32_t TextIdx;
5
6/* Index of sample / generation. */
7typedef uint16_t SampleIdx;
8
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +02009typedef struct Slot {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050010 TextIdx next;
11 TextIdx offset;
12 SampleIdx presence;
13 SampleIdx mark;
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +020014} Slot;
15
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050016static const TextIdx kNowhere = static_cast<TextIdx>(-1);
17
18static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut,
19 TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) {
20 TextIdx from = kNowhere;
21 TextIdx to = kNowhere;
22 TextIdx result = 0;
23 SampleIdx targetPresence = minPresence;
24 for (TextIdx i = 0; i < end; ++i) {
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +020025 if (i == middle) {
26 targetPresence++;
27 }
28 Slot& item = map[shortcut[i]];
29 if (item.mark != iteration) {
30 item.mark = iteration;
31 if (item.presence >= targetPresence) {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050032 if ((to == kNowhere) || (to < i)) {
33 if (from != kNowhere) {
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +020034 result += to - from;
35 }
36 from = i;
37 }
38 to = i + sliceLen;
39 }
40 }
41 }
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050042 if (from != kNowhere) {
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +020043 result += to - from;
44 }
45 return result;
46}
47
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050048static std::string createDictionary(const uint8_t* data, TextIdx sliceLen,
49 Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle,
50 SampleIdx minPresence, SampleIdx iteration) {
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +020051 std::string output;
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050052 TextIdx from = kNowhere;
53 TextIdx to = kNowhere;
54 SampleIdx targetPresence = minPresence;
55 for (TextIdx i = 0; i < end; ++i) {
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +020056 if (i == middle) {
57 targetPresence++;
58 }
59 Slot& item = map[shortcut[i]];
60 if (item.mark != iteration) {
61 item.mark = iteration;
62 if (item.presence >= targetPresence) {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050063 if ((to == kNowhere) || (to < i)) {
64 if (from != kNowhere) {
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +020065 output.insert(output.end(), &data[from], &data[to]);
66 }
67 from = i;
68 }
69 to = i + sliceLen;
70 }
71 }
72 }
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050073 if (from != kNowhere) {
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +020074 output.insert(output.end(), &data[from], &data[to]);
75 }
76 return output;
77}
78
79std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
80 const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
81 /* Parameters aliasing */
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050082 TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
83 if (targetSize != dictionary_size_limit) {
84 fprintf(stderr, "dictionary_size_limit is too large\n");
85 return "";
86 }
87 TextIdx sliceLen = static_cast<TextIdx>(slice_len);
88 if (sliceLen != slice_len) {
89 fprintf(stderr, "slice_len is too large\n");
90 return "";
91 }
92 if (sliceLen < 1) {
93 fprintf(stderr, "slice_len is too small\n");
94 return "";
95 }
96 SampleIdx numSamples = static_cast<SampleIdx>(sample_sizes.size());
97 if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) {
98 fprintf(stderr, "too many samples\n");
99 return "";
100 }
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200101 const uint8_t* data = sample_data;
102
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500103 TextIdx total = 0;
104 std::vector<TextIdx> offsets;
105 for (SampleIdx i = 0; i < numSamples; ++i) {
106 TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
107 if (delta != sample_sizes[i]) {
108 fprintf(stderr, "sample is too large\n");
109 return "";
110 }
111 if (delta == 0) {
112 fprintf(stderr, "empty samples are prohibited\n");
113 return "";
114 }
115 if (total + delta <= total) {
116 fprintf(stderr, "corpus is too large\n");
117 return "";
118 }
119 total += delta;
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200120 offsets.push_back(total);
121 }
122
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500123 if (total * 2 < total) {
124 fprintf(stderr, "corpus is too large\n");
125 return "";
126 }
127
128 if (total < sliceLen) {
129 fprintf(stderr, "slice_len is larger than corpus size\n");
130 return "";
131 }
132
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200133 /*****************************************************************************
134 * Build coverage map.
135 ****************************************************************************/
136 std::vector<Slot> map;
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500137 std::vector<TextIdx> shortcut;
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200138 map.push_back({0, 0, 0, 0});
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500139 TextIdx end = total - sliceLen;
140 TextIdx hashLen = 11;
141 while (hashLen < 29 && ((1u << hashLen) < end)) {
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200142 hashLen += 3;
143 }
144 hashLen -= 3;
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500145 TextIdx hashMask = (1u << hashLen) - 1u;
146 std::vector<TextIdx> hashHead(1 << hashLen);
147 TextIdx hashSlot = 1;
148 SampleIdx piece = 0;
149 TextIdx hash = 0;
150 TextIdx lShift = 3;
151 TextIdx rShift = hashLen - lShift;
152 for (TextIdx i = 0; i < sliceLen - 1; ++i) {
153 TextIdx v = data[i];
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200154 hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
155 }
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500156 TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
157 TextIdx rShiftX = hashLen - lShiftX;
158 for (TextIdx i = 0; i < end; ++i) {
159 TextIdx v = data[i + sliceLen - 1];
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200160 hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
161
162 if (offsets[piece] == i) {
163 piece++;
164 }
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500165 TextIdx slot = hashHead[hash];
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200166 while (slot != 0) {
167 Slot& item = map[slot];
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500168 TextIdx start = item.offset;
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200169 bool miss = false;
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500170 for (TextIdx j = 0; j < sliceLen; ++j) {
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200171 if (data[i + j] != data[start + j]) {
172 miss = true;
173 break;
174 }
175 }
176 if (!miss) {
177 if (item.mark != piece) {
178 item.mark = piece;
179 item.presence++;
180 }
181 shortcut.push_back(slot);
182 break;
183 }
184 slot = item.next;
185 }
186 if (slot == 0) {
187 map.push_back({hashHead[hash], i, 1, piece});
188 hashHead[hash] = hashSlot;
189 shortcut.push_back(hashSlot);
190 hashSlot++;
191 }
192 v = data[i];
193 hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
194 }
195
196 /*****************************************************************************
197 * Build dictionary of specified size.
198 ****************************************************************************/
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500199 SampleIdx a = 1;
200 TextIdx size = dryRun(
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200201 sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
202 /* Maximal output is smaller than target. */
203 if (size <= targetSize) {
204 return createDictionary(
205 data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
206 }
207
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500208 SampleIdx b = numSamples;
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200209 size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
210 if (size == targetSize) {
211 return createDictionary(
212 data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
213 }
214 /* Run binary search. */
215 if (size < targetSize) {
216 /* size(a) > targetSize > size(b) && a < m < b */
217 while (a + 1 < b) {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500218 SampleIdx m = static_cast<SampleIdx>((a + b) / 2);
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200219 size = dryRun(
220 sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
221 if (size < targetSize) {
222 b = m;
223 } else if (size > targetSize) {
224 a = m;
225 } else {
226 return createDictionary(
227 data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
228 }
229 }
230 } else {
231 a = b;
232 }
233 /* size(minPresence) > targetSize > size(minPresence + 1) */
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500234 SampleIdx minPresence = a;
235 TextIdx c = 0;
236 TextIdx d = end;
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200237 /* size(a) < targetSize < size(b) && a < m < b */
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500238 while (c + 1 < d) {
239 TextIdx m = (c + d) / 2;
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200240 size = dryRun(
241 sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
242 if (size < targetSize) {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500243 c = m;
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200244 } else if (size > targetSize) {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500245 d = m;
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200246 } else {
247 return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
248 m, minPresence, ++piece);
249 }
250 }
251
252 bool unrestricted = false;
253 if (minPresence <= 2 && !unrestricted) {
254 minPresence = 2;
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500255 c = end;
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200256 }
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500257 return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c,
Eugene Kliuchnikov39ef4bb2017-10-13 11:25:03 +0200258 minPresence, ++piece);
259}