blob: 4d147e11279c3fa5f67a09d3c9b4e4883730c6e2 [file] [log] [blame]
Tianjie Xua5dcb7c2018-09-25 12:25:15 -07001#include "./sieve.h"
2
3/* Pointer to position in (combined corpus) text. */
4typedef uint32_t TextIdx;
5
6/* Index of sample / generation. */
7typedef uint16_t SampleIdx;
8
9typedef struct Slot {
10 TextIdx next;
11 TextIdx offset;
12 SampleIdx presence;
13 SampleIdx mark;
14} Slot;
15
16static const TextIdx kNowhere = static_cast<TextIdx>(-1);
17
18static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut,
19 TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) {
20 TextIdx from = kNowhere;
21 TextIdx to = kNowhere;
22 TextIdx result = 0;
23 SampleIdx targetPresence = minPresence;
24 for (TextIdx i = 0; i < end; ++i) {
25 if (i == middle) {
26 targetPresence++;
27 }
28 Slot& item = map[shortcut[i]];
29 if (item.mark != iteration) {
30 item.mark = iteration;
31 if (item.presence >= targetPresence) {
32 if ((to == kNowhere) || (to < i)) {
33 if (from != kNowhere) {
34 result += to - from;
35 }
36 from = i;
37 }
38 to = i + sliceLen;
39 }
40 }
41 }
42 if (from != kNowhere) {
43 result += to - from;
44 }
45 return result;
46}
47
48static std::string createDictionary(const uint8_t* data, TextIdx sliceLen,
49 Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle,
50 SampleIdx minPresence, SampleIdx iteration) {
51 std::string output;
52 TextIdx from = kNowhere;
53 TextIdx to = kNowhere;
54 SampleIdx targetPresence = minPresence;
55 for (TextIdx i = 0; i < end; ++i) {
56 if (i == middle) {
57 targetPresence++;
58 }
59 Slot& item = map[shortcut[i]];
60 if (item.mark != iteration) {
61 item.mark = iteration;
62 if (item.presence >= targetPresence) {
63 if ((to == kNowhere) || (to < i)) {
64 if (from != kNowhere) {
65 output.insert(output.end(), &data[from], &data[to]);
66 }
67 from = i;
68 }
69 to = i + sliceLen;
70 }
71 }
72 }
73 if (from != kNowhere) {
74 output.insert(output.end(), &data[from], &data[to]);
75 }
76 return output;
77}
78
79std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
80 const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
81 /* Parameters aliasing */
82 TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
83 if (targetSize != dictionary_size_limit) {
84 fprintf(stderr, "dictionary_size_limit is too large\n");
85 return "";
86 }
87 TextIdx sliceLen = static_cast<TextIdx>(slice_len);
88 if (sliceLen != slice_len) {
89 fprintf(stderr, "slice_len is too large\n");
90 return "";
91 }
92 if (sliceLen < 1) {
93 fprintf(stderr, "slice_len is too small\n");
94 return "";
95 }
96 SampleIdx numSamples = static_cast<SampleIdx>(sample_sizes.size());
97 if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) {
98 fprintf(stderr, "too many samples\n");
99 return "";
100 }
101 const uint8_t* data = sample_data;
102
103 TextIdx total = 0;
104 std::vector<TextIdx> offsets;
105 for (SampleIdx i = 0; i < numSamples; ++i) {
106 TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
107 if (delta != sample_sizes[i]) {
108 fprintf(stderr, "sample is too large\n");
109 return "";
110 }
111 if (delta == 0) {
112 fprintf(stderr, "empty samples are prohibited\n");
113 return "";
114 }
115 if (total + delta <= total) {
116 fprintf(stderr, "corpus is too large\n");
117 return "";
118 }
119 total += delta;
120 offsets.push_back(total);
121 }
122
123 if (total * 2 < total) {
124 fprintf(stderr, "corpus is too large\n");
125 return "";
126 }
127
128 if (total < sliceLen) {
129 fprintf(stderr, "slice_len is larger than corpus size\n");
130 return "";
131 }
132
133 /*****************************************************************************
134 * Build coverage map.
135 ****************************************************************************/
136 std::vector<Slot> map;
137 std::vector<TextIdx> shortcut;
138 map.push_back({0, 0, 0, 0});
139 TextIdx end = total - sliceLen;
140 TextIdx hashLen = 11;
141 while (hashLen < 29 && ((1u << hashLen) < end)) {
142 hashLen += 3;
143 }
144 hashLen -= 3;
145 TextIdx hashMask = (1u << hashLen) - 1u;
146 std::vector<TextIdx> hashHead(1 << hashLen);
147 TextIdx hashSlot = 1;
148 SampleIdx piece = 0;
149 TextIdx hash = 0;
150 TextIdx lShift = 3;
151 TextIdx rShift = hashLen - lShift;
152 for (TextIdx i = 0; i < sliceLen - 1; ++i) {
153 TextIdx v = data[i];
154 hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
155 }
156 TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
157 TextIdx rShiftX = hashLen - lShiftX;
158 for (TextIdx i = 0; i < end; ++i) {
159 TextIdx v = data[i + sliceLen - 1];
160 hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
161
162 if (offsets[piece] == i) {
163 piece++;
164 }
165 TextIdx slot = hashHead[hash];
166 while (slot != 0) {
167 Slot& item = map[slot];
168 TextIdx start = item.offset;
169 bool miss = false;
170 for (TextIdx j = 0; j < sliceLen; ++j) {
171 if (data[i + j] != data[start + j]) {
172 miss = true;
173 break;
174 }
175 }
176 if (!miss) {
177 if (item.mark != piece) {
178 item.mark = piece;
179 item.presence++;
180 }
181 shortcut.push_back(slot);
182 break;
183 }
184 slot = item.next;
185 }
186 if (slot == 0) {
187 map.push_back({hashHead[hash], i, 1, piece});
188 hashHead[hash] = hashSlot;
189 shortcut.push_back(hashSlot);
190 hashSlot++;
191 }
192 v = data[i];
193 hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
194 }
195
196 /*****************************************************************************
197 * Build dictionary of specified size.
198 ****************************************************************************/
199 SampleIdx a = 1;
200 TextIdx size = dryRun(
201 sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
202 /* Maximal output is smaller than target. */
203 if (size <= targetSize) {
204 return createDictionary(
205 data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
206 }
207
208 SampleIdx b = numSamples;
209 size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
210 if (size == targetSize) {
211 return createDictionary(
212 data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
213 }
214 /* Run binary search. */
215 if (size < targetSize) {
216 /* size(a) > targetSize > size(b) && a < m < b */
217 while (a + 1 < b) {
218 SampleIdx m = static_cast<SampleIdx>((a + b) / 2);
219 size = dryRun(
220 sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
221 if (size < targetSize) {
222 b = m;
223 } else if (size > targetSize) {
224 a = m;
225 } else {
226 return createDictionary(
227 data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
228 }
229 }
230 } else {
231 a = b;
232 }
233 /* size(minPresence) > targetSize > size(minPresence + 1) */
234 SampleIdx minPresence = a;
235 TextIdx c = 0;
236 TextIdx d = end;
237 /* size(a) < targetSize < size(b) && a < m < b */
238 while (c + 1 < d) {
239 TextIdx m = (c + d) / 2;
240 size = dryRun(
241 sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
242 if (size < targetSize) {
243 c = m;
244 } else if (size > targetSize) {
245 d = m;
246 } else {
247 return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
248 m, minPresence, ++piece);
249 }
250 }
251
252 bool unrestricted = false;
253 if (minPresence <= 2 && !unrestricted) {
254 minPresence = 2;
255 c = end;
256 }
257 return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c,
258 minPresence, ++piece);
259}