Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 1 | /* Copyright 2016 Google Inc. All Rights Reserved. |
Eugene Kliuchnikov | dd8fa3e | 2016-09-22 11:32:23 +0200 | [diff] [blame] | 2 | Author: zip753@gmail.com (Ivan Nikulin) |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 3 | |
| 4 | Distributed under MIT license. |
| 5 | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
| 6 | */ |
| 7 | |
| 8 | /* Tool for generating optimal backward references for the input file. Uses |
| 9 | sais-lite library for building suffix array. */ |
| 10 | |
| 11 | #include <algorithm> |
| 12 | #include <cassert> |
| 13 | #include <cstdio> |
| 14 | #include <cstring> |
Eugene Kliuchnikov | 27d9459 | 2016-12-22 13:03:28 +0100 | [diff] [blame] | 15 | #include <functional> |
| 16 | #include <utility> |
| 17 | #include <vector> |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 18 | |
Eugene Kliuchnikov | dd8fa3e | 2016-09-22 11:32:23 +0200 | [diff] [blame] | 19 | #include <gflags/gflags.h> |
| 20 | using gflags::ParseCommandLineFlags; |
| 21 | |
Ivan Nikulin | 9294022 | 2016-09-19 19:12:30 +0200 | [diff] [blame] | 22 | #include "./esaxx/sais.hxx" |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 23 | |
Eugene Kliuchnikov | 27d9459 | 2016-12-22 13:03:28 +0100 | [diff] [blame] | 24 | DEFINE_bool(advanced, false, "Advanced searching mode: finds all longest " |
| 25 | "matches at positions that are not covered by matches of length at least " |
| 26 | "max_length. WARNING: uses much more memory than simple mode, especially " |
| 27 | "for small values of min_length."); |
Eugene Kliuchnikov | dd8fa3e | 2016-09-22 11:32:23 +0200 | [diff] [blame] | 28 | DEFINE_int32(min_length, 1, "Minimal length of found backward references."); |
Eugene Kliuchnikov | 27d9459 | 2016-12-22 13:03:28 +0100 | [diff] [blame] | 29 | /* For advanced mode. */ |
| 30 | DEFINE_int32(long_length, 32, |
| 31 | "Maximal length of found backward references for advanced mode."); |
Eugene Kliuchnikov | dd8fa3e | 2016-09-22 11:32:23 +0200 | [diff] [blame] | 32 | DEFINE_int32(skip, 1, "Number of bytes to skip."); |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 33 | |
| 34 | const size_t kFileBufferSize = (1 << 16); // 64KB |
| 35 | |
| 36 | typedef int sarray_type; // Can't make it unsigned because of templates :( |
| 37 | typedef uint8_t input_type; |
| 38 | typedef uint32_t lcp_type; |
Eugene Kliuchnikov | 27d9459 | 2016-12-22 13:03:28 +0100 | [diff] [blame] | 39 | typedef std::pair<int, std::vector<int> > entry_type; |
| 40 | typedef std::function<void(sarray_type*, lcp_type*, size_t, int, int, int, int, |
| 41 | int)> Fn; |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 42 | |
| 43 | void ReadInput(FILE* fin, input_type* storage, size_t input_size) { |
| 44 | size_t last_pos = 0; |
| 45 | size_t available_in; |
| 46 | fseek(fin, 0, SEEK_SET); |
| 47 | do { |
| 48 | available_in = fread(storage + last_pos, 1, kFileBufferSize, fin); |
| 49 | last_pos += available_in; |
| 50 | } while (available_in != 0); |
| 51 | assert(last_pos == input_size); |
| 52 | } |
| 53 | |
| 54 | void BuildLCP(input_type* storage, sarray_type* sarray, lcp_type* lcp, |
| 55 | size_t size, uint32_t* pos) { |
| 56 | for (int i = 0; i < size; ++i) { |
| 57 | pos[sarray[i]] = i; |
| 58 | } |
| 59 | uint32_t k = 0; |
| 60 | lcp[size - 1] = 0; |
| 61 | for (int i = 0; i < size; ++i) { |
| 62 | if (pos[i] == size - 1) { |
| 63 | k = 0; |
| 64 | continue; |
| 65 | } |
| 66 | uint32_t j = sarray[pos[i] + 1]; // Suffix which follow i-th suffix in SA. |
| 67 | while (i + k < size && j + k < size && storage[i + k] == storage[j + k]) { |
| 68 | ++k; |
| 69 | } |
| 70 | lcp[pos[i]] = k; |
| 71 | if (k > 0) --k; |
| 72 | } |
| 73 | } |
| 74 | |
Eugene Kliuchnikov | 27d9459 | 2016-12-22 13:03:28 +0100 | [diff] [blame] | 75 | inline void PrintReference(sarray_type* sarray, lcp_type* lcp, size_t size, |
| 76 | int idx, int left_ix, int right_ix, int left_lcp, |
| 77 | int right_lcp, FILE* fout) { |
| 78 | int max_lcp_ix; |
| 79 | if (right_ix == size - 1 || (left_ix >= 0 && left_lcp >= right_lcp)) { |
| 80 | max_lcp_ix = left_ix; |
| 81 | } else { |
| 82 | max_lcp_ix = right_ix; |
| 83 | } |
| 84 | int dist = idx - sarray[max_lcp_ix]; |
| 85 | assert(dist > 0); |
| 86 | fputc(1, fout); |
| 87 | fwrite(&idx, sizeof(int), 1, fout); // Position in input. |
| 88 | fwrite(&dist, sizeof(int), 1, fout); // Backward distance. |
| 89 | } |
| 90 | |
| 91 | inline void GoLeft(sarray_type* sarray, lcp_type* lcp, int idx, int left_ix, |
| 92 | int left_lcp, entry_type* entry) { |
| 93 | entry->first = left_lcp; |
| 94 | if (left_lcp > FLAGS_long_length) return; |
| 95 | for (; left_ix >= 0; --left_ix) { |
| 96 | if (lcp[left_ix] < left_lcp) break; |
| 97 | if (sarray[left_ix] < idx) { |
| 98 | entry->second.push_back(idx - sarray[left_ix]); |
| 99 | } |
| 100 | } |
| 101 | } |
| 102 | |
| 103 | inline void GoRight(sarray_type* sarray, lcp_type* lcp, int idx, size_t size, |
| 104 | int right_ix, int right_lcp, entry_type* entry) { |
| 105 | entry->first = right_lcp; |
| 106 | if (right_lcp > FLAGS_long_length) return; |
| 107 | for (; right_ix < size - 1; ++right_ix) { |
| 108 | if (lcp[right_ix] < right_lcp) break; |
| 109 | if (sarray[right_ix] < idx) { |
| 110 | entry->second.push_back(idx - sarray[right_ix]); |
| 111 | } |
| 112 | } |
| 113 | } |
| 114 | |
| 115 | inline void StoreReference(sarray_type* sarray, lcp_type* lcp, size_t size, |
| 116 | int idx, int left_ix, int right_ix, int left_lcp, |
| 117 | int right_lcp, entry_type* entries) { |
| 118 | if (right_ix == size - 1 || (left_ix >= 0 && left_lcp > right_lcp)) { |
| 119 | // right is invalid or left is better |
| 120 | GoLeft(sarray, lcp, idx, left_ix, left_lcp, &entries[idx]); |
| 121 | } else if (left_ix < 0 || (right_ix < size - 1 && right_lcp > left_lcp)) { |
| 122 | // left is invalid or right is better |
| 123 | GoRight(sarray, lcp, idx, size, right_ix, right_lcp, &entries[idx]); |
| 124 | } else { // both are valid and of equal length |
| 125 | GoLeft(sarray, lcp, idx, left_ix, left_lcp, &entries[idx]); |
| 126 | GoRight(sarray, lcp, idx, size, right_ix, right_lcp, &entries[idx]); |
| 127 | } |
| 128 | } |
| 129 | |
| 130 | void ProcessReferences(sarray_type* sarray, lcp_type* lcp, size_t size, |
| 131 | uint32_t* pos, const Fn& process_output) { |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 132 | int min_length = FLAGS_min_length; |
| 133 | for (int idx = FLAGS_skip; idx < size; ++idx) { |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 134 | int left_lcp = -1; |
| 135 | int left_ix; |
| 136 | for (left_ix = pos[idx] - 1; left_ix >= 0; --left_ix) { |
| 137 | if (left_lcp == -1 || left_lcp > lcp[left_ix]) { |
| 138 | left_lcp = lcp[left_ix]; |
| 139 | } |
| 140 | if (left_lcp == 0) break; |
| 141 | if (sarray[left_ix] < idx) break; |
| 142 | } |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 143 | |
| 144 | int right_lcp = -1; |
| 145 | int right_ix; |
| 146 | for (right_ix = pos[idx]; right_ix < size - 1; ++right_ix) { |
| 147 | if (right_lcp == -1 || right_lcp > lcp[right_ix]) { |
| 148 | right_lcp = lcp[right_ix]; |
| 149 | } |
| 150 | // Stop if we have better result from the left side already. |
Eugene Kliuchnikov | 27d9459 | 2016-12-22 13:03:28 +0100 | [diff] [blame] | 151 | if (right_lcp < left_lcp && left_ix >= 0) break; |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 152 | if (right_lcp == 0) break; |
| 153 | if (sarray[right_ix] < idx) break; |
| 154 | } |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 155 | |
Eugene Kliuchnikov | 27d9459 | 2016-12-22 13:03:28 +0100 | [diff] [blame] | 156 | if ((left_ix >= 0 && left_lcp >= min_length) || |
| 157 | (right_ix < size - 1 && right_lcp >= min_length)) { |
| 158 | process_output(sarray, lcp, size, idx, left_ix, right_ix, left_lcp, |
| 159 | right_lcp); |
| 160 | } |
| 161 | } |
| 162 | } |
| 163 | |
| 164 | void ProcessEntries(entry_type* entries, size_t size, FILE* fout) { |
| 165 | int long_length = FLAGS_long_length; |
| 166 | std::vector<std::pair<int, int> > segments; |
| 167 | size_t idx; |
| 168 | for (idx = 0; idx < size;) { |
| 169 | entry_type& entry = entries[idx]; |
| 170 | if (entry.first > long_length) { |
| 171 | // Add segment. |
| 172 | if (segments.empty() || segments.back().second < idx) { |
| 173 | segments.push_back({idx, idx + entry.first}); |
| 174 | } else { |
| 175 | segments.back().second = idx + entry.first; |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 176 | } |
Eugene Kliuchnikov | 27d9459 | 2016-12-22 13:03:28 +0100 | [diff] [blame] | 177 | } |
| 178 | ++idx; |
| 179 | } |
| 180 | printf("Segments generated.\n"); |
| 181 | size_t segments_ix = 0; |
| 182 | for (idx = 0; idx < size;) { |
| 183 | if (idx == segments[segments_ix].first) { |
| 184 | // Skip segment. |
| 185 | idx = segments[segments_ix].second; |
| 186 | } else { |
| 187 | for (auto& dist : entries[idx].second) { |
| 188 | fputc(1, fout); |
| 189 | fwrite(&idx, sizeof(int), 1, fout); // Position in input. |
| 190 | fwrite(&dist, sizeof(int), 1, fout); // Backward distance. |
| 191 | } |
| 192 | ++idx; |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 193 | } |
| 194 | } |
| 195 | } |
| 196 | |
| 197 | int main(int argc, char* argv[]) { |
Eugene Kliuchnikov | dd8fa3e | 2016-09-22 11:32:23 +0200 | [diff] [blame] | 198 | ParseCommandLineFlags(&argc, &argv, true); |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 199 | if (argc != 3) { |
| 200 | printf("usage: %s input_file output_file\n", argv[0]); |
| 201 | return 1; |
| 202 | } |
| 203 | |
| 204 | FILE* fin = fopen(argv[1], "rb"); |
| 205 | FILE* fout = fopen(argv[2], "w"); |
| 206 | |
| 207 | fseek(fin, 0, SEEK_END); |
| 208 | int input_size = ftell(fin); |
| 209 | fseek(fin, 0, SEEK_SET); |
| 210 | printf("The file size is %u bytes\n", input_size); |
| 211 | |
| 212 | input_type* storage = new input_type[input_size]; |
| 213 | |
| 214 | ReadInput(fin, storage, input_size); |
| 215 | fclose(fin); |
| 216 | |
| 217 | sarray_type* sarray = new sarray_type[input_size]; |
| 218 | saisxx(storage, sarray, input_size); |
| 219 | printf("Suffix array calculated.\n"); |
| 220 | |
| 221 | // Inverse suffix array. |
| 222 | uint32_t* pos = new uint32_t[input_size]; |
| 223 | |
| 224 | lcp_type* lcp = new lcp_type[input_size]; |
| 225 | BuildLCP(storage, sarray, lcp, input_size, pos); |
| 226 | printf("LCP array constructed.\n"); |
Eugene Kliuchnikov | 27d9459 | 2016-12-22 13:03:28 +0100 | [diff] [blame] | 227 | delete[] storage; |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 228 | |
Eugene Kliuchnikov | 27d9459 | 2016-12-22 13:03:28 +0100 | [diff] [blame] | 229 | using std::placeholders::_1; |
| 230 | using std::placeholders::_2; |
| 231 | using std::placeholders::_3; |
| 232 | using std::placeholders::_4; |
| 233 | using std::placeholders::_5; |
| 234 | using std::placeholders::_6; |
| 235 | using std::placeholders::_7; |
| 236 | using std::placeholders::_8; |
| 237 | entry_type* entries; |
| 238 | if (FLAGS_advanced) { |
| 239 | entries = new entry_type[input_size]; |
| 240 | for (size_t i = 0; i < input_size; ++i) entries[i].first = -1; |
| 241 | } |
| 242 | Fn print = std::bind(PrintReference, _1, _2, _3, _4, _5, _6, _7, _8, fout); |
| 243 | Fn store = std::bind(StoreReference, _1, _2, _3, _4, _5, _6, _7, _8, entries); |
| 244 | |
| 245 | ProcessReferences(sarray, lcp, input_size, pos, |
| 246 | FLAGS_advanced ? store : print); |
| 247 | printf("References processed.\n"); |
| 248 | |
| 249 | if (FLAGS_advanced) { |
| 250 | int good_cnt = 0; |
| 251 | uint64_t avg_cnt = 0; |
| 252 | for (size_t i = 0; i < input_size; ++i) { |
| 253 | if (entries[i].first != -1) { |
| 254 | ++good_cnt; |
| 255 | avg_cnt += entries[i].second.size(); |
| 256 | } |
| 257 | } |
| 258 | printf("Number of covered positions = %d\n", good_cnt); |
| 259 | printf("Average number of references per covered position = %.4lf\n", |
| 260 | static_cast<double>(avg_cnt) / good_cnt); |
| 261 | ProcessEntries(entries, input_size, fout); |
| 262 | printf("Entries processed.\n"); |
| 263 | } |
| 264 | |
Ivan Nikulin | 58cecf1 | 2016-09-15 10:44:19 +0200 | [diff] [blame] | 265 | fclose(fout); |
| 266 | return 0; |
| 267 | } |