Blame - suffix_array.h - platform/external/zucchini

blob: 75b3a38e4eba957b5f4ed047ead19c6d665b321b [file] [log] [blame]

Samuel Huang	06f1ae9	2018-03-13 18:19:34 +0000	[diff] [blame]	1	// Copyright 2017 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#ifndef COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
				6	#define COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
				7
				8	#include <algorithm>
				9	#include <iterator>
				10	#include <numeric>
				11	#include <vector>
				12
Hans Wennborg	d5ccca8	2020-06-24 13:46:35 +0000	[diff] [blame]	13	#include "base/check.h"
Samuel Huang	06f1ae9	2018-03-13 18:19:34 +0000	[diff] [blame]	14
				15	namespace zucchini {
				16
				17	// A functor class that implements the naive suffix sorting algorithm that uses
				18	// std::sort with lexicographical compare. This is only meant as reference of
				19	// the interface.
				20	class NaiveSuffixSort {
				21	public:
				22	// Type requirements:
				23	// \|InputRng\| is an input random access range.
				24	// \|KeyType\| is an unsigned integer type.
				25	// \|SAIt\| is a random access iterator with mutable references.
				26	template <class InputRng, class KeyType, class SAIt>
				27	// \|str\| is the input string on which suffix sort is applied.
				28	// Characters found in \|str\| must be in the range [0, \|key_bound\|)
				29	// \|suffix_array\| is the beginning of the destination range, which is at least
				30	// as large as \|str\|.
				31	void operator()(const InputRng& str,
				32	KeyType key_bound,
				33	SAIt suffix_array) const {
				34	using size_type = typename SAIt::value_type;
				35
				36	size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
				37
				38	// \|suffix_array\| is first filled with ordered indices of \|str\|.
				39	// Those indices are then sorted with lexicographical comparisons in \|str\|.
				40	std::iota(suffix_array, suffix_array + n, 0);
				41	std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {
				42	return std::lexicographical_compare(std::begin(str) + i, std::end(str),
				43	std::begin(str) + j, std::end(str));
				44	});
				45	}
				46	};
				47
				48	// A functor class that implements suffix array induced sorting (SA-IS)
				49	// algorithm with linear time and memory complexity,
				50	// see http://ieeexplore.ieee.org/abstract/document/5582081/
				51	class InducedSuffixSort {
				52	public:
				53	// Type requirements:
				54	// \|InputRng\| is an input random access range.
				55	// \|KeyType\| is an unsigned integer type.
				56	// \|SAIt\| is a random access iterator with mutable values.
				57	template <class InputRng, class KeyType, class SAIt>
				58	// \|str\| is the input string on which suffix sort is applied.
				59	// Characters found in \|str\| must be in the range [0, \|key_bound\|)
				60	// \|suffix_array\| is the beginning of the destination range, which is at least
				61	// as large as \|str\|.
				62	void operator()(const InputRng& str,
				63	KeyType key_bound,
				64	SAIt suffix_array) const {
				65	using value_type = typename InputRng::value_type;
				66	using size_type = typename SAIt::value_type;
				67
				68	static_assert(std::is_unsigned<value_type>::value,
				69	"SA-IS only supports input string with unsigned values");
				70	static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
				71
				72	size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
				73
				74	Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n,
				75	key_bound, suffix_array);
				76	}
				77
				78	// Given string S of length n. We assume S is terminated by a unique sentinel
				79	// $, which is considered as the smallest character. This sentinel does not
				80	// exist in memory and is only treated implicitly, hence \|n\| does not count
				81	// the sentinel in this implementation. We denote suf(S,i) the suffix formed
				82	// by S[i..n).
				83
				84	// A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1)
				85	// or suf(S,i) > suf(S,i+1), respectively.
				86	enum SLType : bool { SType, LType };
				87
				88	// A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is
				89	// S-type or L-type, respectively.
				90
				91	// A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and
				92	// S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS
				93	// character.
				94
				95	// A substring S[i..j) is an LMS-substring if
				96	// (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other
				97	// LMS characters, or
				98	// (2) S[i..j) is the sentinel $.
				99
				100	template <class SizeType, class KeyType>
				101	struct Implementation {
				102	static_assert(std::is_unsigned<SizeType>::value,
				103	"SizeType must be unsigned");
				104	static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
				105	using size_type = SizeType;
				106	using key_type = KeyType;
				107
				108	using iterator = typename std::vector<size_type>::iterator;
				109	using const_iterator = typename std::vector<size_type>::const_iterator;
				110
				111	// Partition every suffix based on SL-type. Returns the number of LMS
				112	// suffixes.
				113	template <class StrIt>
				114	static size_type BuildSLPartition(
				115	StrIt str,
				116	size_type length,
				117	key_type key_bound,
				118	std::vector<SLType>::reverse_iterator sl_partition_it) {
				119	// We will count LMS suffixes (S to L-type or last S-type).
				120	size_type lms_count = 0;
				121
				122	// \|previous_type\| is initialized to L-type to avoid counting an extra
				123	// LMS suffix at the end
				124	SLType previous_type = LType;
				125
				126	// Initialized to dummy, impossible key.
				127	key_type previous_key = key_bound;
				128
				129	// We're travelling backward to determine the partition,
				130	// as if we prepend one character at a time to the string, ex:
				131	// b$ is L-type because b > $.
				132	// ab$ is S-type because a < b, implying ab$ < b$.
				133	// bab$ is L-type because b > a, implying bab$ > ab$.
				134	// bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$.
				135	for (auto str_it = std::reverse_iterator<StrIt>(str + length);
				136	str_it != std::reverse_iterator<StrIt>(str);
				137	++str_it, ++sl_partition_it) {
				138	key_type current_key = *str_it;
				139
				140	if (current_key > previous_key \|\| previous_key == key_bound) {
				141	// S[i] > S[i + 1] or S[i] is last character.
				142	if (previous_type == SType)
				143	// suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore,
				144	// suf(S,i+1) was a LMS suffix.
				145	++lms_count;
				146
				147	previous_type = LType; // For next round.
				148	} else if (current_key < previous_key) {
				149	// S[i] < S[i + 1]
				150	previous_type = SType; // For next round.
				151	}
				152	// Else, S[i] == S[i + 1]:
				153	// The next character that differs determines the SL-type,
				154	// so we reuse the last seen type.
				155
				156	*sl_partition_it = previous_type;
				157	previous_key = current_key; // For next round.
				158	}
				159
				160	return lms_count;
				161	}
				162
				163	// Find indices of LMS suffixes and write result to \|lms_indices\|.
				164	static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,
				165	iterator lms_indices) {
				166	// \|previous_type\| is initialized to S-type to avoid counting an extra
				167	// LMS suffix at the beginning
				168	SLType previous_type = SType;
				169	for (size_type i = 0; i < sl_partition.size(); ++i) {
				170	if (sl_partition[i] == SType && previous_type == LType)
				171	*lms_indices++ = i;
				172	previous_type = sl_partition[i];
				173	}
				174	}
				175
				176	template <class StrIt>
				177	static std::vector<size_type> MakeBucketCount(StrIt str,
				178	size_type length,
				179	key_type key_bound) {
				180	// Occurrence of every unique character is counted in \|buckets\|
				181	std::vector<size_type> buckets(static_cast<size_type>(key_bound));
				182
				183	for (auto it = str; it != str + length; ++it)
				184	++buckets[*it];
				185	return buckets;
				186	}
				187
				188	// Apply induced sort from \|lms_indices\| to \|suffix_array\| associated with
				189	// the string \|str\|.
				190	template <class StrIt, class SAIt>
				191	static void InducedSort(StrIt str,
				192	size_type length,
				193	const std::vector<SLType>& sl_partition,
				194	const std::vector<size_type>& lms_indices,
				195	const std::vector<size_type>& buckets,
				196	SAIt suffix_array) {
				197	// All indices are first marked as unset with the illegal value \|length\|.
				198	std::fill(suffix_array, suffix_array + length, length);
				199
				200	// Used to mark bucket boundaries (head or end) as indices in str.
				201	DCHECK(!buckets.empty());
				202	std::vector<size_type> bucket_bounds(buckets.size());
				203
				204	// Step 1: Assign indices for LMS suffixes, populating the end of
				205	// respective buckets but keeping relative order.
				206
				207	// Find the end of each bucket and write it to \|bucket_bounds\|.
				208	std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
				209
				210	// Process each \|lms_indices\| backward, and assign them to the end of
				211	// their respective buckets, so relative order is preserved.
				212	for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) {
				213	key_type key = str[*it];
				214	suffix_array[--bucket_bounds[key]] = *it;
				215	}
				216
				217	// Step 2
				218	// Scan forward \|suffix_array\|; for each modified suf(S,i) for which
				219	// suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current
				220	// head of the corresponding bucket and forward the bucket head to the
				221	// right.
				222
				223	// Find the head of each bucket and write it to \|bucket_bounds\|. Since
				224	// only LMS suffixes where inserted in \|suffix_array\| during Step 1,
				225	// \|bucket_bounds\| does not contains the head of each bucket and needs to
				226	// be updated.
				227	bucket_bounds[0] = 0;
				228	std::partial_sum(buckets.begin(), buckets.end() - 1,
				229	bucket_bounds.begin() + 1);
				230
				231	// From Step 1, the sentinel $, which we treat implicitly, would have
				232	// been placed at the beginning of \|suffix_array\|, since $ is always
				233	// considered as the smallest character. We then have to deal with the
				234	// previous (last) suffix.
				235	if (sl_partition[length - 1] == LType) {
				236	key_type key = str[length - 1];
				237	suffix_array[bucket_bounds[key]++] = length - 1;
				238	}
				239	for (auto it = suffix_array; it != suffix_array + length; ++it) {
				240	size_type suffix_index = *it;
				241
				242	// While the original algorithm marks unset suffixes with -1,
				243	// we found that marking them with \|length\| is also possible and more
				244	// convenient because we are working with unsigned integers.
				245	if (suffix_index != length && suffix_index > 0 &&
				246	sl_partition[--suffix_index] == LType) {
				247	key_type key = str[suffix_index];
				248	suffix_array[bucket_bounds[key]++] = suffix_index;
				249	}
				250	}
				251
				252	// Step 3
				253	// Scan backward \|suffix_array\|; for each modified suf(S, i) for which
				254	// suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current
				255	// end of the corresponding bucket and forward the bucket head to the
				256	// left.
				257
				258	// Find the end of each bucket and write it to \|bucket_bounds\|. Since
				259	// only L-type suffixes where inserted in \|suffix_array\| during Step 2,
				260	// \|bucket_bounds\| does not contain the end of each bucket and needs to
				261	// be updated.
				262	std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
				263
				264	for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);
				265	it != std::reverse_iterator<SAIt>(suffix_array); ++it) {
				266	size_type suffix_index = *it;
				267	if (suffix_index != length && suffix_index > 0 &&
				268	sl_partition[--suffix_index] == SType) {
				269	key_type key = str[suffix_index];
				270	suffix_array[--bucket_bounds[key]] = suffix_index;
				271	}
				272	}
				273	// Deals with the last suffix, because of the sentinel.
				274	if (sl_partition[length - 1] == SType) {
				275	key_type key = str[length - 1];
				276	suffix_array[--bucket_bounds[key]] = length - 1;
				277	}
				278	}
				279
				280	// Given a string S starting at \|str\| with length \|length\|, an array
				281	// starting at \|substring_array\| containing lexicographically ordered LMS
				282	// terminated substring indices of S and an SL-Type partition \|sl_partition\|
				283	// of S, assigns a unique label to every unique LMS substring. The sorted
				284	// labels for all LMS substrings are written to \|lms_str\|, while the indices
				285	// of LMS suffixes are written to \|lms_indices\|. In addition, returns the
				286	// total number of unique labels.
				287	template <class StrIt, class SAIt>
				288	static size_type LabelLmsSubstrings(StrIt str,
				289	size_type length,
				290	const std::vector<SLType>& sl_partition,
				291	SAIt suffix_array,
				292	iterator lms_indices,
				293	iterator lms_str) {
				294	// Labelling starts at 0.
				295	size_type label = 0;
				296
				297	// \|previous_lms\| is initialized to 0 to indicate it is unset.
				298	// Note that suf(S,0) is never a LMS suffix. Substrings will be visited in
				299	// lexicographical order.
				300	size_type previous_lms = 0;
				301	for (auto it = suffix_array; it != suffix_array + length; ++it) {
				302	if (it > 0 && sl_partition[it] == SType &&
				303	sl_partition[*it - 1] == LType) {
				304	// suf(S, *it) is a LMS suffix.
				305
				306	size_type current_lms = *it;
				307	if (previous_lms != 0) {
				308	// There was a previous LMS suffix. Check if the current LMS
				309	// substring is equal to the previous one.
				310	SLType current_lms_type = SType;
				311	SLType previous_lms_type = SType;
				312	for (size_type k = 0;; ++k) {
				313	// \|current_lms_end\| and \|previous_lms_end\| denote whether we have
				314	// reached the end of the current and previous LMS substring,
				315	// respectively
				316	bool current_lms_end = false;
				317	bool previous_lms_end = false;
				318
				319	// Check for both previous and current substring ends.
				320	// Note that it is more convenient to check if
				321	// suf(S,current_lms + k) is an LMS suffix than to retrieve it
				322	// from lms_indices.
				323	if (current_lms + k >= length \|\|
				324	(current_lms_type == LType &&
				325	sl_partition[current_lms + k] == SType)) {
				326	current_lms_end = true;
				327	}
				328	if (previous_lms + k >= length \|\|
				329	(previous_lms_type == LType &&
				330	sl_partition[previous_lms + k] == SType)) {
				331	previous_lms_end = true;
				332	}
				333
				334	if (current_lms_end && previous_lms_end) {
				335	break; // Previous and current substrings are identical.
				336	} else if (current_lms_end != previous_lms_end \|\|
				337	str[current_lms + k] != str[previous_lms + k]) {
				338	// Previous and current substrings differ, a new label is used.
				339	++label;
				340	break;
				341	}
				342
				343	current_lms_type = sl_partition[current_lms + k];
				344	previous_lms_type = sl_partition[previous_lms + k];
				345	}
				346	}
				347	lms_indices++ = it;
				348	*lms_str++ = label;
				349	previous_lms = current_lms;
				350	}
				351	}
				352
				353	return label + 1;
				354	}
				355
				356	// Implementation of the SA-IS algorithm. \|str\| must be a random access
				357	// iterator pointing at the beginning of S with length \|length\|. The result
				358	// is writtend in \|suffix_array\|, a random access iterator.
				359	template <class StrIt, class SAIt>
				360	static void SuffixSort(StrIt str,
				361	size_type length,
				362	key_type key_bound,
				363	SAIt suffix_array) {
				364	if (length == 1)
				365	*suffix_array = 0;
				366	if (length < 2)
				367	return;
				368
				369	std::vector<SLType> sl_partition(length);
				370	size_type lms_count =
				371	BuildSLPartition(str, length, key_bound, sl_partition.rbegin());
				372	std::vector<size_type> lms_indices(lms_count);
				373	FindLmsSuffixes(sl_partition, lms_indices.begin());
				374	std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound);
				375
				376	if (lms_indices.size() > 1) {
				377	// Given \|lms_indices\| in the same order they appear in \|str\|, induce
				378	// LMS substrings relative order and write result to \|suffix_array\|.
				379	InducedSort(str, length, sl_partition, lms_indices, buckets,
				380	suffix_array);
				381	std::vector<size_type> lms_str(lms_indices.size());
				382
				383	// Given LMS substrings in relative order found in \|suffix_array\|,
				384	// map LMS substrings to unique labels to form a new string, \|lms_str\|.
				385	size_type label_count =
				386	LabelLmsSubstrings(str, length, sl_partition, suffix_array,
				387	lms_indices.begin(), lms_str.begin());
				388
				389	if (label_count < lms_str.size()) {
				390	// Reorder \|lms_str\| to have LMS suffixes in the same order they
				391	// appear in \|str\|.
				392	for (size_type i = 0; i < lms_indices.size(); ++i)
				393	suffix_array[lms_indices[i]] = lms_str[i];
				394
				395	SLType previous_type = SType;
				396	for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {
				397	if (sl_partition[i] == SType && previous_type == LType) {
				398	lms_str[j] = suffix_array[i];
				399	lms_indices[j++] = i;
				400	}
				401	previous_type = sl_partition[i];
				402	}
				403
				404	// Recursively apply SuffixSort on \|lms_str\|, which is formed from
				405	// labeled LMS suffixes in the same order they appear in \|str\|.
				406	// Note that \|KeyType\| will be size_type because \|lms_str\| contains
				407	// indices. \|lms_str\| is at most half the length of \|str\|.
				408	Implementation<size_type, size_type>::SuffixSort(
				409	lms_str.begin(), static_cast<size_type>(lms_str.size()),
				410	label_count, suffix_array);
				411
				412	// Map LMS labels back to indices in \|str\| and write result to
				413	// \|lms_indices\|. We're using \|suffix_array\| as a temporary buffer.
				414	for (size_type i = 0; i < lms_indices.size(); ++i)
				415	suffix_array[i] = lms_indices[suffix_array[i]];
				416	std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin());
				417
				418	// At this point, \|lms_indices\| contains sorted LMS suffixes of \|str\|.
				419	}
				420	}
				421	// Given \|lms_indices\| where LMS suffixes are sorted, induce the full
				422	// order of suffixes in \|str\|.
				423	InducedSort(str, length, sl_partition, lms_indices, buckets,
				424	suffix_array);
				425	}
				426
Samuel Huang	f137bf4	2021-08-13 15:42:26 +0000	[diff] [blame]	427	Implementation() = delete;
				428	Implementation(const Implementation&) = delete;
				429	const Implementation& operator=(const Implementation&) = delete;
Samuel Huang	06f1ae9	2018-03-13 18:19:34 +0000	[diff] [blame]	430	};
				431	};
				432
				433	// Generates a sorted suffix array for the input string \|str\| using the functor
				434	// \|Algorithm\| which provides an interface equivalent to NaiveSuffixSort.
				435	/// Characters found in \|str\| are assumed to be in range [0, \|key_bound\|).
				436	// Returns the suffix array as a vector.
				437	// \|StrRng\| is an input random access range.
				438	// \|KeyType\| is an unsigned integer type.
				439	template <class Algorithm, class StrRng, class KeyType>
				440	std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,
				441	KeyType key_bound) {
				442	Algorithm sort;
				443	std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());
				444	sort(str, key_bound, suffix_array.begin());
				445	return suffix_array;
				446	}
				447
				448	// Type requirements:
				449	// \|SARng\| is an input random access range.
				450	// \|StrIt1\| is a random access iterator.
				451	// \|StrIt2\| is a forward iterator.
				452	template <class SARng, class StrIt1, class StrIt2>
				453	// Lexicographical lower bound using binary search for
				454	// [\|str2_first\|, \|str2_last\|) in the suffix array \|suffix_array\| of a string
				455	// starting at \|str1_first\|. This does not necessarily return the index of
				456	// the longest matching substring.
				457	auto SuffixLowerBound(const SARng& suffix_array,
				458	StrIt1 str1_first,
				459	StrIt2 str2_first,
				460	StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {
				461	using size_type = typename SARng::value_type;
				462
				463	size_t n = std::end(suffix_array) - std::begin(suffix_array);
				464	auto it = std::lower_bound(
				465	std::begin(suffix_array), std::end(suffix_array), str2_first,
				466	[str1_first, str2_last, n](size_type a, StrIt2 b) {
				467	return std::lexicographical_compare(str1_first + a, str1_first + n, b,
				468	str2_last);
				469	});
				470	return it;
				471	}
				472
				473	} // namespace zucchini
				474
				475	#endif // COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_