blob: d15b7ee55c4391b921b324a21af26ba7e05e3832 [file] [log] [blame]
#include "./deorummolae.h"

#include <array>
#include <bitset>
#include <cinttypes>
#include <cstdint>
#include <cstdio>

#include "./esaxx/sais.hxx"
/* Used for quick SA-entry to file mapping. Each file is padded to size that
   is a multiple of chunk size. */
#define CHUNK_SIZE 64
/* Length of substring that is considered to be covered by dictionary string. */
#define CUT_MATCH 6
/* Minimal dictionary entry size. */
#define MIN_MATCH 24

/* Non tunable definitions. */
#define CHUNK_MASK (CHUNK_SIZE - 1)
/* Number of 64-bit words needed to hold one coverage bit per file;
   presumably DM_LOG_MAX_FILES >= 6 (declared in the header) — TODO confirm. */
#define COVERAGE_SIZE (1 << (DM_LOG_MAX_FILES - 6))

/* File coverage: every bit set to 1 denotes a file covered by an isle. */
typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;

/* Symbol of text alphabet: byte values 0..255 are literal chars,
   values >= 256 are artificial file/cut terminators. */
typedef int32_t TextChar;

/* Pointer to position in text. */
typedef uint32_t TextIdx;

/* SAIS sarray_type; unfortunately, must be a signed type. */
typedef int32_t TextSaIdx;
/* Number of bits set in a 64-bit word. */
static size_t popcount(uint64_t u) {
  /* std::bitset::count is portable (the original __builtin_popcountll is
     GCC/Clang-only) and still lowers to a hardware popcount instruction on
     optimized builds. */
  return std::bitset<64>(u).count();
}
Eugene Kliuchnikov52441062017-07-21 10:07:24 +020035
36/* Condense terminators and pad file entries. */
Tianjie Xua5dcb7c2018-09-25 12:25:15 -070037static void rewriteText(std::vector<TextChar>* text) {
38 TextChar terminator = text->back();
39 TextChar prev = terminator;
40 TextIdx to = 0;
41 for (TextIdx from = 0; from < text->size(); ++from) {
42 TextChar next = text->at(from);
Eugene Kliuchnikov52441062017-07-21 10:07:24 +020043 if (next < 256 || prev < 256) {
44 text->at(to++) = next;
45 if (next >= 256) terminator = next;
46 }
47 prev = next;
48 }
49 text->resize(to);
50 if (text->empty()) text->push_back(terminator);
51 while (text->size() & CHUNK_MASK) text->push_back(terminator);
52}
53
54/* Reenumerate terminators for smaller alphabet. */
Tianjie Xua5dcb7c2018-09-25 12:25:15 -070055static void remapTerminators(std::vector<TextChar>* text,
56 TextChar* next_terminator) {
57 TextChar prev = -1;
58 TextChar x = 256;
59 for (TextIdx i = 0; i < text->size(); ++i) {
60 TextChar next = text->at(i);
Eugene Kliuchnikov52441062017-07-21 10:07:24 +020061 if (next < 256) { // Char.
62 // Do nothing.
63 } else if (prev < 256) { // Terminator after char.
64 next = x++;
65 } else { // Terminator after terminator.
66 next = prev;
67 }
68 text->at(i) = next;
69 prev = next;
70 }
71 *next_terminator = x;
72}
73
74/* Combine all file entries; create mapping position->file. */
Tianjie Xua5dcb7c2018-09-25 12:25:15 -070075static void buildFullText(std::vector<std::vector<TextChar>>* data,
76 std::vector<TextChar>* full_text, std::vector<TextIdx>* file_map,
77 std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
Eugene Kliuchnikov52441062017-07-21 10:07:24 +020078 file_map->resize(0);
79 file_offset->resize(0);
80 full_text->resize(0);
Tianjie Xua5dcb7c2018-09-25 12:25:15 -070081 for (TextIdx i = 0; i < data->size(); ++i) {
Eugene Kliuchnikov52441062017-07-21 10:07:24 +020082 file_offset->push_back(full_text->size());
Tianjie Xua5dcb7c2018-09-25 12:25:15 -070083 std::vector<TextChar>& file = data->at(i);
Eugene Kliuchnikov52441062017-07-21 10:07:24 +020084 rewriteText(&file);
85 full_text->insert(full_text->end(), file.begin(), file.end());
86 file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
87 }
88 if (false) remapTerminators(full_text, next_terminator);
89}
90
/* Build longest-common-prefix based on suffix array and text.
   Kasai-style construction: suffixes are visited in text order so that the
   previous LCP value minus one can be reused as a lower bound.
   lcp[x] is the LCP of the x-th and (x+1)-th suffixes in SA order.
   TODO: borrowed -> unknown efficiency. */
static void buildLcp(std::vector<TextChar>* text, std::vector<TextIdx>* sa,
    std::vector<TextIdx>* lcp, std::vector<TextIdx>* invese_sa) {
  TextIdx size = static_cast<TextIdx>(text->size());
  lcp->resize(size);
  TextIdx k = 0;
  /* Last SA entry has no successor to compare against. */
  lcp->at(size - 1) = 0;
  for (TextIdx i = 0; i < size; ++i) {
    if (invese_sa->at(i) == size - 1) {
      /* Suffix is last in SA order; restart the running prefix length. */
      k = 0;
      continue;
    }
    // Suffix which follow i-th suffix.
    TextIdx j = sa->at(invese_sa->at(i) + 1);
    /* Extend the common prefix; k never shrinks by more than 1 per step. */
    while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) {
      ++k;
    }
    lcp->at(invese_sa->at(i)) = k;
    if (k > 0) --k;
  }
}
113
/* Isle is a range in SA with LCP not less than some value.
   When we raise the LCP requirement, the isle sunks and smaller isles appear
   instead. */
typedef struct {
  TextIdx lcp;       /* Minimal LCP over the whole SA range. */
  TextIdx l;         /* Left SA boundary (set when the isle is pushed). */
  TextIdx r;         /* Right SA boundary (filled in when the isle sinks). */
  Coverage coverage; /* Bitset of files contributing suffixes to the isle. */
} Isle;
123
/* Helper routine for `cutMatch`: overwrite chars at a match occurrence with
   terminators so the area cannot participate in future matches. */
static void poisonData(TextIdx pos, TextIdx length,
    std::vector<std::vector<TextChar>>* data, std::vector<TextIdx>* file_map,
    std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
  /* Map global text position to (file, local position). */
  TextIdx f = file_map->at(pos / CHUNK_SIZE);
  pos -= file_offset->at(f);
  std::vector<TextChar>& file = data->at(f);
  /* Minimal matches are poisoned over the full CUT_MATCH span; longer matches
     only need their first symbol poisoned — cutMatch revisits the shorter
     tails on later iterations. */
  TextIdx l = (length == CUT_MATCH) ? CUT_MATCH : 1;
  for (TextIdx j = 0; j < l; j++, pos++) {
    if (file[pos] >= 256) continue;  /* Already a terminator. */
    /* Reuse an adjacent terminator id when possible to keep the alphabet
       small; otherwise allocate a fresh one.
       NOTE(review): file[pos + 1] relies on the file always ending with a
       terminator (rewriteText padding) — presumed safe, confirm. */
    if (file[pos + 1] >= 256) {
      file[pos] = file[pos + 1];
    } else if (pos > 0 && file[pos - 1] >= 256) {
      file[pos] = file[pos - 1];
    } else {
      file[pos] = (*next_terminator)++;
    }
  }
}
143
/* Remove substrings of a given match from files.
   Substrings are replaced with unique terminators, so next iteration SA would
   not allow to cross removed areas.
   `index` is the SA position of the chosen match; `length` its LCP. */
static void cutMatch(std::vector<std::vector<TextChar>>* data, TextIdx index,
    TextIdx length, std::vector<TextIdx>* sa, std::vector<TextIdx>* lcp,
    std::vector<TextIdx>* invese_sa, TextChar* next_terminator,
    std::vector<TextIdx>* file_map, std::vector<TextIdx>* file_offset) {
  /* Repeat with progressively shorter lengths until the remainder is too
     short to be worth cutting. */
  while (length >= CUT_MATCH) {
    /* Walk right in SA: all neighbours sharing >= length prefix are
       occurrences of the same substring. */
    TextIdx i = index;
    while (lcp->at(i) >= length) {
      i++;
      poisonData(
          sa->at(i), length, data, file_map, file_offset, next_terminator);
    }
    /* Walk left in SA, poisoning `index` itself first. */
    while (true) {
      poisonData(
          sa->at(index), length, data, file_map, file_offset, next_terminator);
      if (index == 0 || lcp->at(index - 1) < length) break;
      index--;
    }
    /* Shorten by one symbol and continue from the suffix that follows the
       current one in text order. */
    length--;
    index = invese_sa->at(sa->at(index) + 1);
  }
}
168
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700169std::string DM_generate(size_t dictionary_size_limit,
170 const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200171 {
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700172 TextIdx tmp = static_cast<TextIdx>(dictionary_size_limit);
173 if ((tmp != dictionary_size_limit) || (tmp > 1u << 30)) {
174 fprintf(stderr, "dictionary_size_limit is too large\n");
175 return "";
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200176 }
177 }
178
179 /* Could use 256 + '0' for easier debugging. */
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700180 TextChar next_terminator = 256;
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200181
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700182 std::string output;
183 std::vector<std::vector<TextChar>> data;
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200184
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700185 TextIdx offset = 0;
186 size_t num_samples = sample_sizes.size();
187 if (num_samples > DM_MAX_FILES) num_samples = DM_MAX_FILES;
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200188 for (size_t n = 0; n < num_samples; ++n) {
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700189 TextIdx delta = static_cast<TextIdx>(sample_sizes[n]);
190 if (delta != sample_sizes[n]) {
191 fprintf(stderr, "sample is too large\n");
192 return "";
193 }
194 if (delta == 0) {
195 fprintf(stderr, "0-length samples are prohibited\n");
196 return "";
197 }
198 TextIdx next_offset = offset + delta;
199 if (next_offset <= offset) {
200 fprintf(stderr, "corpus is too large\n");
201 return "";
202 }
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200203 data.push_back(
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700204 std::vector<TextChar>(sample_data + offset, sample_data + next_offset));
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200205 offset = next_offset;
206 data.back().push_back(next_terminator++);
207 }
208
209 /* Most arrays are allocated once, and then just resized to smaller and
210 smaller sizes. */
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700211 std::vector<TextChar> full_text;
212 std::vector<TextIdx> file_map;
213 std::vector<TextIdx> file_offset;
214 std::vector<TextIdx> sa;
215 std::vector<TextIdx> invese_sa;
216 std::vector<TextIdx> lcp;
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200217 std::vector<Isle> isles;
218 std::vector<char> output_data;
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700219 TextIdx total = 0;
220 TextIdx total_cost = 0;
221 TextIdx best_cost;
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200222 Isle best_isle;
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700223 size_t min_count = num_samples;
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200224
225 while (true) {
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700226 TextIdx max_match = static_cast<TextIdx>(dictionary_size_limit) - total;
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200227 buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator);
228 sa.resize(full_text.size());
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700229 /* Hopefully, non-negative TextSaIdx is the same sa TextIdx counterpart. */
230 saisxx(full_text.data(), reinterpret_cast<TextSaIdx*>(sa.data()),
231 static_cast<TextChar>(full_text.size()), next_terminator);
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200232 invese_sa.resize(full_text.size());
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700233 for (TextIdx i = 0; i < full_text.size(); ++i) {
234 invese_sa[sa[i]] = i;
235 }
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200236 buildLcp(&full_text, &sa, &lcp, &invese_sa);
237
238 /* Do not rebuild SA/LCP, just use different selection. */
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700239 retry:
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200240 best_cost = 0;
241 best_isle = {0, 0, 0, {{0}}};
242 isles.resize(0);
243 isles.push_back(best_isle);
244
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700245 for (TextIdx i = 0; i < lcp.size(); ++i) {
246 TextIdx l = i;
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200247 Coverage cov = {{0}};
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700248 size_t f = file_map[sa[i] / CHUNK_SIZE];
249 cov[f >> 6] = (static_cast<uint64_t>(1)) << (f & 63);
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200250 while (lcp[i] < isles.back().lcp) {
251 Isle& top = isles.back();
252 top.r = i;
253 l = top.l;
254 for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700255 size_t count = 0;
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200256 for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700257 TextIdx effective_lcp = top.lcp;
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200258 /* Restrict (last) dictionary entry length. */
259 if (effective_lcp > max_match) effective_lcp = max_match;
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700260 TextIdx cost = count * effective_lcp;
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200261 if (cost > best_cost && count >= min_count &&
262 effective_lcp >= MIN_MATCH) {
263 best_cost = cost;
264 best_isle = top;
265 best_isle.lcp = effective_lcp;
266 }
267 isles.pop_back();
268 for (size_t x = 0; x < cov.size(); ++x) {
269 isles.back().coverage[x] |= cov[x];
270 }
271 }
272 if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}});
273 for (size_t x = 0; x < cov.size(); ++x) {
274 isles.back().coverage[x] |= cov[x];
275 }
276 }
277
278 /* When saturated matches do not match length restrictions, lower the
279 saturation requirements. */
280 if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
281 if (min_count >= 8) {
282 min_count = (min_count * 7) / 8;
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700283 fprintf(stderr, "Retry: min_count=%zu\n", min_count);
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200284 goto retry;
285 }
286 break;
287 }
288
289 /* Save the entry. */
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700290 fprintf(stderr, "Savings: %d+%d, dictionary: %d+%d\n",
291 total_cost, best_cost, total, best_isle.lcp);
292 int* piece = &full_text[sa[best_isle.l]];
293 output.insert(output.end(), piece, piece + best_isle.lcp);
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200294 total += best_isle.lcp;
295 total_cost += best_cost;
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700296 cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &invese_sa,
297 &next_terminator, &file_map, &file_offset);
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200298 if (total >= dictionary_size_limit) break;
299 }
Tianjie Xua5dcb7c2018-09-25 12:25:15 -0700300
301 return output;
Eugene Kliuchnikov52441062017-07-21 10:07:24 +0200302}