Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 1 | #include "./deorummolae.h" |
| 2 | |
| 3 | #include <array> |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 4 | #include <cstdio> |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 5 | |
| 6 | #include "./esaxx/sais.hxx" |
| 7 | |
| 8 | /* Used for quick SA-entry to file mapping. Each file is padded to size that |
| 9 | is a multiple of chunk size. */ |
| 10 | #define CHUNK_SIZE 64 |
| 11 | /* Length of substring that is considered to be covered by dictionary string. */ |
| 12 | #define CUT_MATCH 6 |
| 13 | /* Minimal dictionary entry size. */ |
| 14 | #define MIN_MATCH 24 |
| 15 | |
| 16 | /* Non tunable definitions. */ |
| 17 | #define CHUNK_MASK (CHUNK_SIZE - 1) |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 18 | #define COVERAGE_SIZE (1 << (DM_LOG_MAX_FILES - 6)) |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 19 | |
| 20 | /* File coverage: every bit set to 1 denotes a file covered by an isle. */ |
| 21 | typedef std::array<uint64_t, COVERAGE_SIZE> Coverage; |
| 22 | |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 23 | /* Symbol of text alphabet. */ |
| 24 | typedef int32_t TextChar; |
| 25 | |
| 26 | /* Pointer to position in text. */ |
| 27 | typedef uint32_t TextIdx; |
| 28 | |
| 29 | /* SAIS sarray_type; unfortunately, must be a signed type. */ |
| 30 | typedef int32_t TextSaIdx; |
| 31 | |
| 32 | static size_t popcount(uint64_t u) { |
| 33 | return static_cast<size_t>(__builtin_popcountll(u)); |
Eugene Kliuchnikov | 35e69fc | 2018-02-26 09:04:36 -0500 | [diff] [blame] | 34 | } |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 35 | |
| 36 | /* Condense terminators and pad file entries. */ |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 37 | static void rewriteText(std::vector<TextChar>* text) { |
| 38 | TextChar terminator = text->back(); |
| 39 | TextChar prev = terminator; |
| 40 | TextIdx to = 0; |
| 41 | for (TextIdx from = 0; from < text->size(); ++from) { |
| 42 | TextChar next = text->at(from); |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 43 | if (next < 256 || prev < 256) { |
| 44 | text->at(to++) = next; |
| 45 | if (next >= 256) terminator = next; |
| 46 | } |
| 47 | prev = next; |
| 48 | } |
| 49 | text->resize(to); |
| 50 | if (text->empty()) text->push_back(terminator); |
| 51 | while (text->size() & CHUNK_MASK) text->push_back(terminator); |
| 52 | } |
| 53 | |
| 54 | /* Reenumerate terminators for smaller alphabet. */ |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 55 | static void remapTerminators(std::vector<TextChar>* text, |
| 56 | TextChar* next_terminator) { |
| 57 | TextChar prev = -1; |
| 58 | TextChar x = 256; |
| 59 | for (TextIdx i = 0; i < text->size(); ++i) { |
| 60 | TextChar next = text->at(i); |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 61 | if (next < 256) { // Char. |
| 62 | // Do nothing. |
| 63 | } else if (prev < 256) { // Terminator after char. |
| 64 | next = x++; |
| 65 | } else { // Terminator after terminator. |
| 66 | next = prev; |
| 67 | } |
| 68 | text->at(i) = next; |
| 69 | prev = next; |
| 70 | } |
| 71 | *next_terminator = x; |
| 72 | } |
| 73 | |
| 74 | /* Combine all file entries; create mapping position->file. */ |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 75 | static void buildFullText(std::vector<std::vector<TextChar>>* data, |
| 76 | std::vector<TextChar>* full_text, std::vector<TextIdx>* file_map, |
| 77 | std::vector<TextIdx>* file_offset, TextChar* next_terminator) { |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 78 | file_map->resize(0); |
| 79 | file_offset->resize(0); |
| 80 | full_text->resize(0); |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 81 | for (TextIdx i = 0; i < data->size(); ++i) { |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 82 | file_offset->push_back(full_text->size()); |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 83 | std::vector<TextChar>& file = data->at(i); |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 84 | rewriteText(&file); |
| 85 | full_text->insert(full_text->end(), file.begin(), file.end()); |
| 86 | file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i); |
| 87 | } |
| 88 | if (false) remapTerminators(full_text, next_terminator); |
| 89 | } |
| 90 | |
| 91 | /* Build longest-common-prefix based on suffix array and text. |
| 92 | TODO: borrowed -> unknown efficiency. */ |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 93 | static void buildLcp(std::vector<TextChar>* text, std::vector<TextIdx>* sa, |
| 94 | std::vector<TextIdx>* lcp, std::vector<TextIdx>* invese_sa) { |
| 95 | TextIdx size = static_cast<TextIdx>(text->size()); |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 96 | lcp->resize(size); |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 97 | TextIdx k = 0; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 98 | lcp->at(size - 1) = 0; |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 99 | for (TextIdx i = 0; i < size; ++i) { |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 100 | if (invese_sa->at(i) == size - 1) { |
| 101 | k = 0; |
| 102 | continue; |
| 103 | } |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 104 | // Suffix which follow i-th suffix. |
| 105 | TextIdx j = sa->at(invese_sa->at(i) + 1); |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 106 | while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) { |
| 107 | ++k; |
| 108 | } |
| 109 | lcp->at(invese_sa->at(i)) = k; |
| 110 | if (k > 0) --k; |
| 111 | } |
| 112 | } |
| 113 | |
| 114 | /* Isle is a range in SA with LCP not less than some value. |
| 115 | When we raise the LCP requirement, the isle sunks and smaller isles appear |
| 116 | instead. */ |
| 117 | typedef struct { |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 118 | TextIdx lcp; |
| 119 | TextIdx l; |
| 120 | TextIdx r; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 121 | Coverage coverage; |
| 122 | } Isle; |
| 123 | |
| 124 | /* Helper routine for `cutMatch`. */ |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 125 | static void poisonData(TextIdx pos, TextIdx length, |
| 126 | std::vector<std::vector<TextChar>>* data, std::vector<TextIdx>* file_map, |
| 127 | std::vector<TextIdx>* file_offset, TextChar* next_terminator) { |
| 128 | TextIdx f = file_map->at(pos / CHUNK_SIZE); |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 129 | pos -= file_offset->at(f); |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 130 | std::vector<TextChar>& file = data->at(f); |
| 131 | TextIdx l = (length == CUT_MATCH) ? CUT_MATCH : 1; |
| 132 | for (TextIdx j = 0; j < l; j++, pos++) { |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 133 | if (file[pos] >= 256) continue; |
| 134 | if (file[pos + 1] >= 256) { |
| 135 | file[pos] = file[pos + 1]; |
| 136 | } else if (pos > 0 && file[pos - 1] >= 256) { |
| 137 | file[pos] = file[pos - 1]; |
| 138 | } else { |
| 139 | file[pos] = (*next_terminator)++; |
| 140 | } |
| 141 | } |
| 142 | } |
| 143 | |
| 144 | /* Remove substrings of a given match from files. |
| 145 | Substrings are replaced with unique terminators, so next iteration SA would |
| 146 | not allow to cross removed areas. */ |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 147 | static void cutMatch(std::vector<std::vector<TextChar>>* data, TextIdx index, |
| 148 | TextIdx length, std::vector<TextIdx>* sa, std::vector<TextIdx>* lcp, |
| 149 | std::vector<TextIdx>* invese_sa, TextChar* next_terminator, |
| 150 | std::vector<TextIdx>* file_map, std::vector<TextIdx>* file_offset) { |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 151 | while (length >= CUT_MATCH) { |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 152 | TextIdx i = index; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 153 | while (lcp->at(i) >= length) { |
| 154 | i++; |
| 155 | poisonData( |
| 156 | sa->at(i), length, data, file_map, file_offset, next_terminator); |
| 157 | } |
| 158 | while (true) { |
| 159 | poisonData( |
| 160 | sa->at(index), length, data, file_map, file_offset, next_terminator); |
| 161 | if (index == 0 || lcp->at(index - 1) < length) break; |
| 162 | index--; |
| 163 | } |
| 164 | length--; |
| 165 | index = invese_sa->at(sa->at(index) + 1); |
| 166 | } |
| 167 | } |
| 168 | |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 169 | std::string DM_generate(size_t dictionary_size_limit, |
| 170 | const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) { |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 171 | { |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 172 | TextIdx tmp = static_cast<TextIdx>(dictionary_size_limit); |
| 173 | if ((tmp != dictionary_size_limit) || (tmp > 1u << 30)) { |
| 174 | fprintf(stderr, "dictionary_size_limit is too large\n"); |
| 175 | return ""; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 176 | } |
| 177 | } |
| 178 | |
| 179 | /* Could use 256 + '0' for easier debugging. */ |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 180 | TextChar next_terminator = 256; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 181 | |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 182 | std::string output; |
| 183 | std::vector<std::vector<TextChar>> data; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 184 | |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 185 | TextIdx offset = 0; |
| 186 | size_t num_samples = sample_sizes.size(); |
| 187 | if (num_samples > DM_MAX_FILES) num_samples = DM_MAX_FILES; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 188 | for (size_t n = 0; n < num_samples; ++n) { |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 189 | TextIdx delta = static_cast<TextIdx>(sample_sizes[n]); |
| 190 | if (delta != sample_sizes[n]) { |
| 191 | fprintf(stderr, "sample is too large\n"); |
| 192 | return ""; |
| 193 | } |
| 194 | if (delta == 0) { |
| 195 | fprintf(stderr, "0-length samples are prohibited\n"); |
| 196 | return ""; |
| 197 | } |
| 198 | TextIdx next_offset = offset + delta; |
| 199 | if (next_offset <= offset) { |
| 200 | fprintf(stderr, "corpus is too large\n"); |
| 201 | return ""; |
| 202 | } |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 203 | data.push_back( |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 204 | std::vector<TextChar>(sample_data + offset, sample_data + next_offset)); |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 205 | offset = next_offset; |
| 206 | data.back().push_back(next_terminator++); |
| 207 | } |
| 208 | |
| 209 | /* Most arrays are allocated once, and then just resized to smaller and |
| 210 | smaller sizes. */ |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 211 | std::vector<TextChar> full_text; |
| 212 | std::vector<TextIdx> file_map; |
| 213 | std::vector<TextIdx> file_offset; |
| 214 | std::vector<TextIdx> sa; |
| 215 | std::vector<TextIdx> invese_sa; |
| 216 | std::vector<TextIdx> lcp; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 217 | std::vector<Isle> isles; |
| 218 | std::vector<char> output_data; |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 219 | TextIdx total = 0; |
| 220 | TextIdx total_cost = 0; |
| 221 | TextIdx best_cost; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 222 | Isle best_isle; |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 223 | size_t min_count = num_samples; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 224 | |
| 225 | while (true) { |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 226 | TextIdx max_match = static_cast<TextIdx>(dictionary_size_limit) - total; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 227 | buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator); |
| 228 | sa.resize(full_text.size()); |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 229 | /* Hopefully, non-negative TextSaIdx is the same sa TextIdx counterpart. */ |
| 230 | saisxx(full_text.data(), reinterpret_cast<TextSaIdx*>(sa.data()), |
| 231 | static_cast<TextChar>(full_text.size()), next_terminator); |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 232 | invese_sa.resize(full_text.size()); |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 233 | for (TextIdx i = 0; i < full_text.size(); ++i) { |
| 234 | invese_sa[sa[i]] = i; |
| 235 | } |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 236 | buildLcp(&full_text, &sa, &lcp, &invese_sa); |
| 237 | |
| 238 | /* Do not rebuild SA/LCP, just use different selection. */ |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 239 | retry: |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 240 | best_cost = 0; |
| 241 | best_isle = {0, 0, 0, {{0}}}; |
| 242 | isles.resize(0); |
| 243 | isles.push_back(best_isle); |
| 244 | |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 245 | for (TextIdx i = 0; i < lcp.size(); ++i) { |
| 246 | TextIdx l = i; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 247 | Coverage cov = {{0}}; |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 248 | size_t f = file_map[sa[i] / CHUNK_SIZE]; |
| 249 | cov[f >> 6] = (static_cast<uint64_t>(1)) << (f & 63); |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 250 | while (lcp[i] < isles.back().lcp) { |
| 251 | Isle& top = isles.back(); |
| 252 | top.r = i; |
| 253 | l = top.l; |
| 254 | for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x]; |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 255 | size_t count = 0; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 256 | for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]); |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 257 | TextIdx effective_lcp = top.lcp; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 258 | /* Restrict (last) dictionary entry length. */ |
| 259 | if (effective_lcp > max_match) effective_lcp = max_match; |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 260 | TextIdx cost = count * effective_lcp; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 261 | if (cost > best_cost && count >= min_count && |
| 262 | effective_lcp >= MIN_MATCH) { |
| 263 | best_cost = cost; |
| 264 | best_isle = top; |
| 265 | best_isle.lcp = effective_lcp; |
| 266 | } |
| 267 | isles.pop_back(); |
| 268 | for (size_t x = 0; x < cov.size(); ++x) { |
| 269 | isles.back().coverage[x] |= cov[x]; |
| 270 | } |
| 271 | } |
| 272 | if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}}); |
| 273 | for (size_t x = 0; x < cov.size(); ++x) { |
| 274 | isles.back().coverage[x] |= cov[x]; |
| 275 | } |
| 276 | } |
| 277 | |
| 278 | /* When saturated matches do not match length restrictions, lower the |
| 279 | saturation requirements. */ |
| 280 | if (best_cost == 0 || best_isle.lcp < MIN_MATCH) { |
| 281 | if (min_count >= 8) { |
| 282 | min_count = (min_count * 7) / 8; |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 283 | fprintf(stderr, "Retry: min_count=%zu\n", min_count); |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 284 | goto retry; |
| 285 | } |
| 286 | break; |
| 287 | } |
| 288 | |
| 289 | /* Save the entry. */ |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 290 | fprintf(stderr, "Savings: %d+%d, dictionary: %d+%d\n", |
| 291 | total_cost, best_cost, total, best_isle.lcp); |
| 292 | int* piece = &full_text[sa[best_isle.l]]; |
| 293 | output.insert(output.end(), piece, piece + best_isle.lcp); |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 294 | total += best_isle.lcp; |
| 295 | total_cost += best_cost; |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 296 | cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &invese_sa, |
| 297 | &next_terminator, &file_map, &file_offset); |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 298 | if (total >= dictionary_size_limit) break; |
| 299 | } |
Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 300 | |
| 301 | return output; |
Eugene Kliuchnikov | 5244106 | 2017-07-21 10:07:24 +0200 | [diff] [blame] | 302 | } |