blob: 75b3a38e4eba957b5f4ed047ead19c6d665b321b [file] [log] [blame]
Samuel Huang06f1ae92018-03-13 18:19:34 +00001// Copyright 2017 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
6#define COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
7
8#include <algorithm>
9#include <iterator>
10#include <numeric>
11#include <vector>
12
Hans Wennborgd5ccca82020-06-24 13:46:35 +000013#include "base/check.h"
Samuel Huang06f1ae92018-03-13 18:19:34 +000014
15namespace zucchini {
16
17// A functor class that implements the naive suffix sorting algorithm that uses
18// std::sort with lexicographical compare. This is only meant as reference of
19// the interface.
20class NaiveSuffixSort {
21 public:
22 // Type requirements:
23 // |InputRng| is an input random access range.
24 // |KeyType| is an unsigned integer type.
25 // |SAIt| is a random access iterator with mutable references.
26 template <class InputRng, class KeyType, class SAIt>
27 // |str| is the input string on which suffix sort is applied.
28 // Characters found in |str| must be in the range [0, |key_bound|)
29 // |suffix_array| is the beginning of the destination range, which is at least
30 // as large as |str|.
31 void operator()(const InputRng& str,
32 KeyType key_bound,
33 SAIt suffix_array) const {
34 using size_type = typename SAIt::value_type;
35
36 size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
37
38 // |suffix_array| is first filled with ordered indices of |str|.
39 // Those indices are then sorted with lexicographical comparisons in |str|.
40 std::iota(suffix_array, suffix_array + n, 0);
41 std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {
42 return std::lexicographical_compare(std::begin(str) + i, std::end(str),
43 std::begin(str) + j, std::end(str));
44 });
45 }
46};
47
48// A functor class that implements suffix array induced sorting (SA-IS)
49// algorithm with linear time and memory complexity,
50// see http://ieeexplore.ieee.org/abstract/document/5582081/
51class InducedSuffixSort {
52 public:
53 // Type requirements:
54 // |InputRng| is an input random access range.
55 // |KeyType| is an unsigned integer type.
56 // |SAIt| is a random access iterator with mutable values.
57 template <class InputRng, class KeyType, class SAIt>
58 // |str| is the input string on which suffix sort is applied.
59 // Characters found in |str| must be in the range [0, |key_bound|)
60 // |suffix_array| is the beginning of the destination range, which is at least
61 // as large as |str|.
62 void operator()(const InputRng& str,
63 KeyType key_bound,
64 SAIt suffix_array) const {
65 using value_type = typename InputRng::value_type;
66 using size_type = typename SAIt::value_type;
67
68 static_assert(std::is_unsigned<value_type>::value,
69 "SA-IS only supports input string with unsigned values");
70 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
71
72 size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
73
74 Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n,
75 key_bound, suffix_array);
76 }
77
78 // Given string S of length n. We assume S is terminated by a unique sentinel
79 // $, which is considered as the smallest character. This sentinel does not
80 // exist in memory and is only treated implicitly, hence |n| does not count
81 // the sentinel in this implementation. We denote suf(S,i) the suffix formed
82 // by S[i..n).
83
84 // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1)
85 // or suf(S,i) > suf(S,i+1), respectively.
86 enum SLType : bool { SType, LType };
87
88 // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is
89 // S-type or L-type, respectively.
90
91 // A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and
92 // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS
93 // character.
94
95 // A substring S[i..j) is an LMS-substring if
96 // (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other
97 // LMS characters, or
98 // (2) S[i..j) is the sentinel $.
99
100 template <class SizeType, class KeyType>
101 struct Implementation {
102 static_assert(std::is_unsigned<SizeType>::value,
103 "SizeType must be unsigned");
104 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
105 using size_type = SizeType;
106 using key_type = KeyType;
107
108 using iterator = typename std::vector<size_type>::iterator;
109 using const_iterator = typename std::vector<size_type>::const_iterator;
110
111 // Partition every suffix based on SL-type. Returns the number of LMS
112 // suffixes.
113 template <class StrIt>
114 static size_type BuildSLPartition(
115 StrIt str,
116 size_type length,
117 key_type key_bound,
118 std::vector<SLType>::reverse_iterator sl_partition_it) {
119 // We will count LMS suffixes (S to L-type or last S-type).
120 size_type lms_count = 0;
121
122 // |previous_type| is initialized to L-type to avoid counting an extra
123 // LMS suffix at the end
124 SLType previous_type = LType;
125
126 // Initialized to dummy, impossible key.
127 key_type previous_key = key_bound;
128
129 // We're travelling backward to determine the partition,
130 // as if we prepend one character at a time to the string, ex:
131 // b$ is L-type because b > $.
132 // ab$ is S-type because a < b, implying ab$ < b$.
133 // bab$ is L-type because b > a, implying bab$ > ab$.
134 // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$.
135 for (auto str_it = std::reverse_iterator<StrIt>(str + length);
136 str_it != std::reverse_iterator<StrIt>(str);
137 ++str_it, ++sl_partition_it) {
138 key_type current_key = *str_it;
139
140 if (current_key > previous_key || previous_key == key_bound) {
141 // S[i] > S[i + 1] or S[i] is last character.
142 if (previous_type == SType)
143 // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore,
144 // suf(S,i+1) was a LMS suffix.
145 ++lms_count;
146
147 previous_type = LType; // For next round.
148 } else if (current_key < previous_key) {
149 // S[i] < S[i + 1]
150 previous_type = SType; // For next round.
151 }
152 // Else, S[i] == S[i + 1]:
153 // The next character that differs determines the SL-type,
154 // so we reuse the last seen type.
155
156 *sl_partition_it = previous_type;
157 previous_key = current_key; // For next round.
158 }
159
160 return lms_count;
161 }
162
163 // Find indices of LMS suffixes and write result to |lms_indices|.
164 static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,
165 iterator lms_indices) {
166 // |previous_type| is initialized to S-type to avoid counting an extra
167 // LMS suffix at the beginning
168 SLType previous_type = SType;
169 for (size_type i = 0; i < sl_partition.size(); ++i) {
170 if (sl_partition[i] == SType && previous_type == LType)
171 *lms_indices++ = i;
172 previous_type = sl_partition[i];
173 }
174 }
175
176 template <class StrIt>
177 static std::vector<size_type> MakeBucketCount(StrIt str,
178 size_type length,
179 key_type key_bound) {
180 // Occurrence of every unique character is counted in |buckets|
181 std::vector<size_type> buckets(static_cast<size_type>(key_bound));
182
183 for (auto it = str; it != str + length; ++it)
184 ++buckets[*it];
185 return buckets;
186 }
187
188 // Apply induced sort from |lms_indices| to |suffix_array| associated with
189 // the string |str|.
190 template <class StrIt, class SAIt>
191 static void InducedSort(StrIt str,
192 size_type length,
193 const std::vector<SLType>& sl_partition,
194 const std::vector<size_type>& lms_indices,
195 const std::vector<size_type>& buckets,
196 SAIt suffix_array) {
197 // All indices are first marked as unset with the illegal value |length|.
198 std::fill(suffix_array, suffix_array + length, length);
199
200 // Used to mark bucket boundaries (head or end) as indices in str.
201 DCHECK(!buckets.empty());
202 std::vector<size_type> bucket_bounds(buckets.size());
203
204 // Step 1: Assign indices for LMS suffixes, populating the end of
205 // respective buckets but keeping relative order.
206
207 // Find the end of each bucket and write it to |bucket_bounds|.
208 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
209
210 // Process each |lms_indices| backward, and assign them to the end of
211 // their respective buckets, so relative order is preserved.
212 for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) {
213 key_type key = str[*it];
214 suffix_array[--bucket_bounds[key]] = *it;
215 }
216
217 // Step 2
218 // Scan forward |suffix_array|; for each modified suf(S,i) for which
219 // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current
220 // head of the corresponding bucket and forward the bucket head to the
221 // right.
222
223 // Find the head of each bucket and write it to |bucket_bounds|. Since
224 // only LMS suffixes where inserted in |suffix_array| during Step 1,
225 // |bucket_bounds| does not contains the head of each bucket and needs to
226 // be updated.
227 bucket_bounds[0] = 0;
228 std::partial_sum(buckets.begin(), buckets.end() - 1,
229 bucket_bounds.begin() + 1);
230
231 // From Step 1, the sentinel $, which we treat implicitly, would have
232 // been placed at the beginning of |suffix_array|, since $ is always
233 // considered as the smallest character. We then have to deal with the
234 // previous (last) suffix.
235 if (sl_partition[length - 1] == LType) {
236 key_type key = str[length - 1];
237 suffix_array[bucket_bounds[key]++] = length - 1;
238 }
239 for (auto it = suffix_array; it != suffix_array + length; ++it) {
240 size_type suffix_index = *it;
241
242 // While the original algorithm marks unset suffixes with -1,
243 // we found that marking them with |length| is also possible and more
244 // convenient because we are working with unsigned integers.
245 if (suffix_index != length && suffix_index > 0 &&
246 sl_partition[--suffix_index] == LType) {
247 key_type key = str[suffix_index];
248 suffix_array[bucket_bounds[key]++] = suffix_index;
249 }
250 }
251
252 // Step 3
253 // Scan backward |suffix_array|; for each modified suf(S, i) for which
254 // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current
255 // end of the corresponding bucket and forward the bucket head to the
256 // left.
257
258 // Find the end of each bucket and write it to |bucket_bounds|. Since
259 // only L-type suffixes where inserted in |suffix_array| during Step 2,
260 // |bucket_bounds| does not contain the end of each bucket and needs to
261 // be updated.
262 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
263
264 for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);
265 it != std::reverse_iterator<SAIt>(suffix_array); ++it) {
266 size_type suffix_index = *it;
267 if (suffix_index != length && suffix_index > 0 &&
268 sl_partition[--suffix_index] == SType) {
269 key_type key = str[suffix_index];
270 suffix_array[--bucket_bounds[key]] = suffix_index;
271 }
272 }
273 // Deals with the last suffix, because of the sentinel.
274 if (sl_partition[length - 1] == SType) {
275 key_type key = str[length - 1];
276 suffix_array[--bucket_bounds[key]] = length - 1;
277 }
278 }
279
280 // Given a string S starting at |str| with length |length|, an array
281 // starting at |substring_array| containing lexicographically ordered LMS
282 // terminated substring indices of S and an SL-Type partition |sl_partition|
283 // of S, assigns a unique label to every unique LMS substring. The sorted
284 // labels for all LMS substrings are written to |lms_str|, while the indices
285 // of LMS suffixes are written to |lms_indices|. In addition, returns the
286 // total number of unique labels.
287 template <class StrIt, class SAIt>
288 static size_type LabelLmsSubstrings(StrIt str,
289 size_type length,
290 const std::vector<SLType>& sl_partition,
291 SAIt suffix_array,
292 iterator lms_indices,
293 iterator lms_str) {
294 // Labelling starts at 0.
295 size_type label = 0;
296
297 // |previous_lms| is initialized to 0 to indicate it is unset.
298 // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in
299 // lexicographical order.
300 size_type previous_lms = 0;
301 for (auto it = suffix_array; it != suffix_array + length; ++it) {
302 if (*it > 0 && sl_partition[*it] == SType &&
303 sl_partition[*it - 1] == LType) {
304 // suf(S, *it) is a LMS suffix.
305
306 size_type current_lms = *it;
307 if (previous_lms != 0) {
308 // There was a previous LMS suffix. Check if the current LMS
309 // substring is equal to the previous one.
310 SLType current_lms_type = SType;
311 SLType previous_lms_type = SType;
312 for (size_type k = 0;; ++k) {
313 // |current_lms_end| and |previous_lms_end| denote whether we have
314 // reached the end of the current and previous LMS substring,
315 // respectively
316 bool current_lms_end = false;
317 bool previous_lms_end = false;
318
319 // Check for both previous and current substring ends.
320 // Note that it is more convenient to check if
321 // suf(S,current_lms + k) is an LMS suffix than to retrieve it
322 // from lms_indices.
323 if (current_lms + k >= length ||
324 (current_lms_type == LType &&
325 sl_partition[current_lms + k] == SType)) {
326 current_lms_end = true;
327 }
328 if (previous_lms + k >= length ||
329 (previous_lms_type == LType &&
330 sl_partition[previous_lms + k] == SType)) {
331 previous_lms_end = true;
332 }
333
334 if (current_lms_end && previous_lms_end) {
335 break; // Previous and current substrings are identical.
336 } else if (current_lms_end != previous_lms_end ||
337 str[current_lms + k] != str[previous_lms + k]) {
338 // Previous and current substrings differ, a new label is used.
339 ++label;
340 break;
341 }
342
343 current_lms_type = sl_partition[current_lms + k];
344 previous_lms_type = sl_partition[previous_lms + k];
345 }
346 }
347 *lms_indices++ = *it;
348 *lms_str++ = label;
349 previous_lms = current_lms;
350 }
351 }
352
353 return label + 1;
354 }
355
356 // Implementation of the SA-IS algorithm. |str| must be a random access
357 // iterator pointing at the beginning of S with length |length|. The result
358 // is writtend in |suffix_array|, a random access iterator.
359 template <class StrIt, class SAIt>
360 static void SuffixSort(StrIt str,
361 size_type length,
362 key_type key_bound,
363 SAIt suffix_array) {
364 if (length == 1)
365 *suffix_array = 0;
366 if (length < 2)
367 return;
368
369 std::vector<SLType> sl_partition(length);
370 size_type lms_count =
371 BuildSLPartition(str, length, key_bound, sl_partition.rbegin());
372 std::vector<size_type> lms_indices(lms_count);
373 FindLmsSuffixes(sl_partition, lms_indices.begin());
374 std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound);
375
376 if (lms_indices.size() > 1) {
377 // Given |lms_indices| in the same order they appear in |str|, induce
378 // LMS substrings relative order and write result to |suffix_array|.
379 InducedSort(str, length, sl_partition, lms_indices, buckets,
380 suffix_array);
381 std::vector<size_type> lms_str(lms_indices.size());
382
383 // Given LMS substrings in relative order found in |suffix_array|,
384 // map LMS substrings to unique labels to form a new string, |lms_str|.
385 size_type label_count =
386 LabelLmsSubstrings(str, length, sl_partition, suffix_array,
387 lms_indices.begin(), lms_str.begin());
388
389 if (label_count < lms_str.size()) {
390 // Reorder |lms_str| to have LMS suffixes in the same order they
391 // appear in |str|.
392 for (size_type i = 0; i < lms_indices.size(); ++i)
393 suffix_array[lms_indices[i]] = lms_str[i];
394
395 SLType previous_type = SType;
396 for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {
397 if (sl_partition[i] == SType && previous_type == LType) {
398 lms_str[j] = suffix_array[i];
399 lms_indices[j++] = i;
400 }
401 previous_type = sl_partition[i];
402 }
403
404 // Recursively apply SuffixSort on |lms_str|, which is formed from
405 // labeled LMS suffixes in the same order they appear in |str|.
406 // Note that |KeyType| will be size_type because |lms_str| contains
407 // indices. |lms_str| is at most half the length of |str|.
408 Implementation<size_type, size_type>::SuffixSort(
409 lms_str.begin(), static_cast<size_type>(lms_str.size()),
410 label_count, suffix_array);
411
412 // Map LMS labels back to indices in |str| and write result to
413 // |lms_indices|. We're using |suffix_array| as a temporary buffer.
414 for (size_type i = 0; i < lms_indices.size(); ++i)
415 suffix_array[i] = lms_indices[suffix_array[i]];
416 std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin());
417
418 // At this point, |lms_indices| contains sorted LMS suffixes of |str|.
419 }
420 }
421 // Given |lms_indices| where LMS suffixes are sorted, induce the full
422 // order of suffixes in |str|.
423 InducedSort(str, length, sl_partition, lms_indices, buckets,
424 suffix_array);
425 }
426
Samuel Huangf137bf42021-08-13 15:42:26 +0000427 Implementation() = delete;
428 Implementation(const Implementation&) = delete;
429 const Implementation& operator=(const Implementation&) = delete;
Samuel Huang06f1ae92018-03-13 18:19:34 +0000430 };
431};
432
433// Generates a sorted suffix array for the input string |str| using the functor
434// |Algorithm| which provides an interface equivalent to NaiveSuffixSort.
435/// Characters found in |str| are assumed to be in range [0, |key_bound|).
436// Returns the suffix array as a vector.
437// |StrRng| is an input random access range.
438// |KeyType| is an unsigned integer type.
439template <class Algorithm, class StrRng, class KeyType>
440std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,
441 KeyType key_bound) {
442 Algorithm sort;
443 std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());
444 sort(str, key_bound, suffix_array.begin());
445 return suffix_array;
446}
447
448// Type requirements:
449// |SARng| is an input random access range.
450// |StrIt1| is a random access iterator.
451// |StrIt2| is a forward iterator.
452template <class SARng, class StrIt1, class StrIt2>
453// Lexicographical lower bound using binary search for
454// [|str2_first|, |str2_last|) in the suffix array |suffix_array| of a string
455// starting at |str1_first|. This does not necessarily return the index of
456// the longest matching substring.
457auto SuffixLowerBound(const SARng& suffix_array,
458 StrIt1 str1_first,
459 StrIt2 str2_first,
460 StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {
461 using size_type = typename SARng::value_type;
462
463 size_t n = std::end(suffix_array) - std::begin(suffix_array);
464 auto it = std::lower_bound(
465 std::begin(suffix_array), std::end(suffix_array), str2_first,
466 [str1_first, str2_last, n](size_type a, StrIt2 b) {
467 return std::lexicographical_compare(str1_first + a, str1_first + n, b,
468 str2_last);
469 });
470 return it;
471}
472
473} // namespace zucchini
474
475#endif // COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_