blob: 378d3b5213aad5e05dd126d973d8baecb748356b [file] [log] [blame]
Ivan Nikulin58cecf12016-09-15 10:44:19 +02001/* Copyright 2016 Google Inc. All Rights Reserved.
Eugene Kliuchnikovdd8fa3e2016-09-22 11:32:23 +02002 Author: zip753@gmail.com (Ivan Nikulin)
Ivan Nikulin58cecf12016-09-15 10:44:19 +02003
4 Distributed under MIT license.
5 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
6*/
7
8/* Tool for generating optimal backward references for the input file. Uses
9 sais-lite library for building suffix array. */
10
11#include <algorithm>
12#include <cassert>
13#include <cstdio>
14#include <cstring>
Eugene Kliuchnikov27d94592016-12-22 13:03:28 +010015#include <functional>
16#include <utility>
17#include <vector>
Ivan Nikulin58cecf12016-09-15 10:44:19 +020018
Eugene Kliuchnikovdd8fa3e2016-09-22 11:32:23 +020019#include <gflags/gflags.h>
20using gflags::ParseCommandLineFlags;
21
Ivan Nikulin92940222016-09-19 19:12:30 +020022#include "./esaxx/sais.hxx"
Ivan Nikulin58cecf12016-09-15 10:44:19 +020023
Eugene Kliuchnikov27d94592016-12-22 13:03:28 +010024DEFINE_bool(advanced, false, "Advanced searching mode: finds all longest "
25 "matches at positions that are not covered by matches of length at least "
26 "max_length. WARNING: uses much more memory than simple mode, especially "
27 "for small values of min_length.");
Eugene Kliuchnikovdd8fa3e2016-09-22 11:32:23 +020028DEFINE_int32(min_length, 1, "Minimal length of found backward references.");
Eugene Kliuchnikov27d94592016-12-22 13:03:28 +010029/* For advanced mode. */
30DEFINE_int32(long_length, 32,
31 "Maximal length of found backward references for advanced mode.");
Eugene Kliuchnikovdd8fa3e2016-09-22 11:32:23 +020032DEFINE_int32(skip, 1, "Number of bytes to skip.");
Ivan Nikulin58cecf12016-09-15 10:44:19 +020033
34const size_t kFileBufferSize = (1 << 16); // 64KB
35
36typedef int sarray_type; // Can't make it unsigned because of templates :(
37typedef uint8_t input_type;
38typedef uint32_t lcp_type;
Eugene Kliuchnikov27d94592016-12-22 13:03:28 +010039typedef std::pair<int, std::vector<int> > entry_type;
40typedef std::function<void(sarray_type*, lcp_type*, size_t, int, int, int, int,
41 int)> Fn;
Ivan Nikulin58cecf12016-09-15 10:44:19 +020042
43void ReadInput(FILE* fin, input_type* storage, size_t input_size) {
44 size_t last_pos = 0;
45 size_t available_in;
46 fseek(fin, 0, SEEK_SET);
47 do {
48 available_in = fread(storage + last_pos, 1, kFileBufferSize, fin);
49 last_pos += available_in;
50 } while (available_in != 0);
51 assert(last_pos == input_size);
52}
53
54void BuildLCP(input_type* storage, sarray_type* sarray, lcp_type* lcp,
55 size_t size, uint32_t* pos) {
56 for (int i = 0; i < size; ++i) {
57 pos[sarray[i]] = i;
58 }
59 uint32_t k = 0;
60 lcp[size - 1] = 0;
61 for (int i = 0; i < size; ++i) {
62 if (pos[i] == size - 1) {
63 k = 0;
64 continue;
65 }
66 uint32_t j = sarray[pos[i] + 1]; // Suffix which follow i-th suffix in SA.
67 while (i + k < size && j + k < size && storage[i + k] == storage[j + k]) {
68 ++k;
69 }
70 lcp[pos[i]] = k;
71 if (k > 0) --k;
72 }
73}
74
Eugene Kliuchnikov27d94592016-12-22 13:03:28 +010075inline void PrintReference(sarray_type* sarray, lcp_type* lcp, size_t size,
76 int idx, int left_ix, int right_ix, int left_lcp,
77 int right_lcp, FILE* fout) {
78 int max_lcp_ix;
79 if (right_ix == size - 1 || (left_ix >= 0 && left_lcp >= right_lcp)) {
80 max_lcp_ix = left_ix;
81 } else {
82 max_lcp_ix = right_ix;
83 }
84 int dist = idx - sarray[max_lcp_ix];
85 assert(dist > 0);
86 fputc(1, fout);
87 fwrite(&idx, sizeof(int), 1, fout); // Position in input.
88 fwrite(&dist, sizeof(int), 1, fout); // Backward distance.
89}
90
91inline void GoLeft(sarray_type* sarray, lcp_type* lcp, int idx, int left_ix,
92 int left_lcp, entry_type* entry) {
93 entry->first = left_lcp;
94 if (left_lcp > FLAGS_long_length) return;
95 for (; left_ix >= 0; --left_ix) {
96 if (lcp[left_ix] < left_lcp) break;
97 if (sarray[left_ix] < idx) {
98 entry->second.push_back(idx - sarray[left_ix]);
99 }
100 }
101}
102
103inline void GoRight(sarray_type* sarray, lcp_type* lcp, int idx, size_t size,
104 int right_ix, int right_lcp, entry_type* entry) {
105 entry->first = right_lcp;
106 if (right_lcp > FLAGS_long_length) return;
107 for (; right_ix < size - 1; ++right_ix) {
108 if (lcp[right_ix] < right_lcp) break;
109 if (sarray[right_ix] < idx) {
110 entry->second.push_back(idx - sarray[right_ix]);
111 }
112 }
113}
114
115inline void StoreReference(sarray_type* sarray, lcp_type* lcp, size_t size,
116 int idx, int left_ix, int right_ix, int left_lcp,
117 int right_lcp, entry_type* entries) {
118 if (right_ix == size - 1 || (left_ix >= 0 && left_lcp > right_lcp)) {
119 // right is invalid or left is better
120 GoLeft(sarray, lcp, idx, left_ix, left_lcp, &entries[idx]);
121 } else if (left_ix < 0 || (right_ix < size - 1 && right_lcp > left_lcp)) {
122 // left is invalid or right is better
123 GoRight(sarray, lcp, idx, size, right_ix, right_lcp, &entries[idx]);
124 } else { // both are valid and of equal length
125 GoLeft(sarray, lcp, idx, left_ix, left_lcp, &entries[idx]);
126 GoRight(sarray, lcp, idx, size, right_ix, right_lcp, &entries[idx]);
127 }
128}
129
130void ProcessReferences(sarray_type* sarray, lcp_type* lcp, size_t size,
131 uint32_t* pos, const Fn& process_output) {
Ivan Nikulin58cecf12016-09-15 10:44:19 +0200132 int min_length = FLAGS_min_length;
133 for (int idx = FLAGS_skip; idx < size; ++idx) {
Ivan Nikulin58cecf12016-09-15 10:44:19 +0200134 int left_lcp = -1;
135 int left_ix;
136 for (left_ix = pos[idx] - 1; left_ix >= 0; --left_ix) {
137 if (left_lcp == -1 || left_lcp > lcp[left_ix]) {
138 left_lcp = lcp[left_ix];
139 }
140 if (left_lcp == 0) break;
141 if (sarray[left_ix] < idx) break;
142 }
Ivan Nikulin58cecf12016-09-15 10:44:19 +0200143
144 int right_lcp = -1;
145 int right_ix;
146 for (right_ix = pos[idx]; right_ix < size - 1; ++right_ix) {
147 if (right_lcp == -1 || right_lcp > lcp[right_ix]) {
148 right_lcp = lcp[right_ix];
149 }
150 // Stop if we have better result from the left side already.
Eugene Kliuchnikov27d94592016-12-22 13:03:28 +0100151 if (right_lcp < left_lcp && left_ix >= 0) break;
Ivan Nikulin58cecf12016-09-15 10:44:19 +0200152 if (right_lcp == 0) break;
153 if (sarray[right_ix] < idx) break;
154 }
Ivan Nikulin58cecf12016-09-15 10:44:19 +0200155
Eugene Kliuchnikov27d94592016-12-22 13:03:28 +0100156 if ((left_ix >= 0 && left_lcp >= min_length) ||
157 (right_ix < size - 1 && right_lcp >= min_length)) {
158 process_output(sarray, lcp, size, idx, left_ix, right_ix, left_lcp,
159 right_lcp);
160 }
161 }
162}
163
164void ProcessEntries(entry_type* entries, size_t size, FILE* fout) {
165 int long_length = FLAGS_long_length;
166 std::vector<std::pair<int, int> > segments;
167 size_t idx;
168 for (idx = 0; idx < size;) {
169 entry_type& entry = entries[idx];
170 if (entry.first > long_length) {
171 // Add segment.
172 if (segments.empty() || segments.back().second < idx) {
173 segments.push_back({idx, idx + entry.first});
174 } else {
175 segments.back().second = idx + entry.first;
Ivan Nikulin58cecf12016-09-15 10:44:19 +0200176 }
Eugene Kliuchnikov27d94592016-12-22 13:03:28 +0100177 }
178 ++idx;
179 }
180 printf("Segments generated.\n");
181 size_t segments_ix = 0;
182 for (idx = 0; idx < size;) {
183 if (idx == segments[segments_ix].first) {
184 // Skip segment.
185 idx = segments[segments_ix].second;
186 } else {
187 for (auto& dist : entries[idx].second) {
188 fputc(1, fout);
189 fwrite(&idx, sizeof(int), 1, fout); // Position in input.
190 fwrite(&dist, sizeof(int), 1, fout); // Backward distance.
191 }
192 ++idx;
Ivan Nikulin58cecf12016-09-15 10:44:19 +0200193 }
194 }
195}
196
197int main(int argc, char* argv[]) {
Eugene Kliuchnikovdd8fa3e2016-09-22 11:32:23 +0200198 ParseCommandLineFlags(&argc, &argv, true);
Ivan Nikulin58cecf12016-09-15 10:44:19 +0200199 if (argc != 3) {
200 printf("usage: %s input_file output_file\n", argv[0]);
201 return 1;
202 }
203
204 FILE* fin = fopen(argv[1], "rb");
205 FILE* fout = fopen(argv[2], "w");
206
207 fseek(fin, 0, SEEK_END);
208 int input_size = ftell(fin);
209 fseek(fin, 0, SEEK_SET);
210 printf("The file size is %u bytes\n", input_size);
211
212 input_type* storage = new input_type[input_size];
213
214 ReadInput(fin, storage, input_size);
215 fclose(fin);
216
217 sarray_type* sarray = new sarray_type[input_size];
218 saisxx(storage, sarray, input_size);
219 printf("Suffix array calculated.\n");
220
221 // Inverse suffix array.
222 uint32_t* pos = new uint32_t[input_size];
223
224 lcp_type* lcp = new lcp_type[input_size];
225 BuildLCP(storage, sarray, lcp, input_size, pos);
226 printf("LCP array constructed.\n");
Eugene Kliuchnikov27d94592016-12-22 13:03:28 +0100227 delete[] storage;
Ivan Nikulin58cecf12016-09-15 10:44:19 +0200228
Eugene Kliuchnikov27d94592016-12-22 13:03:28 +0100229 using std::placeholders::_1;
230 using std::placeholders::_2;
231 using std::placeholders::_3;
232 using std::placeholders::_4;
233 using std::placeholders::_5;
234 using std::placeholders::_6;
235 using std::placeholders::_7;
236 using std::placeholders::_8;
237 entry_type* entries;
238 if (FLAGS_advanced) {
239 entries = new entry_type[input_size];
240 for (size_t i = 0; i < input_size; ++i) entries[i].first = -1;
241 }
242 Fn print = std::bind(PrintReference, _1, _2, _3, _4, _5, _6, _7, _8, fout);
243 Fn store = std::bind(StoreReference, _1, _2, _3, _4, _5, _6, _7, _8, entries);
244
245 ProcessReferences(sarray, lcp, input_size, pos,
246 FLAGS_advanced ? store : print);
247 printf("References processed.\n");
248
249 if (FLAGS_advanced) {
250 int good_cnt = 0;
251 uint64_t avg_cnt = 0;
252 for (size_t i = 0; i < input_size; ++i) {
253 if (entries[i].first != -1) {
254 ++good_cnt;
255 avg_cnt += entries[i].second.size();
256 }
257 }
258 printf("Number of covered positions = %d\n", good_cnt);
259 printf("Average number of references per covered position = %.4lf\n",
260 static_cast<double>(avg_cnt) / good_cnt);
261 ProcessEntries(entries, input_size, fout);
262 printf("Entries processed.\n");
263 }
264
Ivan Nikulin58cecf12016-09-15 10:44:19 +0200265 fclose(fout);
266 return 0;
267}