Blame - src/string-search.h - fp2-dev/platform/external/v8

blob: d7959c0be982c35208c14cfc8803ec00a7f14a9b [file] [log] [blame]

Steve Block	5915150	2010-09-22 15:07:15 +0100	[diff] [blame^]	1	// Copyright 2010 the V8 project authors. All rights reserved.
				2	// Redistribution and use in source and binary forms, with or without
				3	// modification, are permitted provided that the following conditions are
				4	// met:
				5	//
				6	// * Redistributions of source code must retain the above copyright
				7	// notice, this list of conditions and the following disclaimer.
				8	// * Redistributions in binary form must reproduce the above
				9	// copyright notice, this list of conditions and the following
				10	// disclaimer in the documentation and/or other materials provided
				11	// with the distribution.
				12	// * Neither the name of Google Inc. nor the names of its
				13	// contributors may be used to endorse or promote products derived
				14	// from this software without specific prior written permission.
				15	//
				16	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				17	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				18	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				19	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				20	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				21	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				22	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				23	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				24	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				25	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				26	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				27
				28	#ifndef V8_STRING_SEARCH_H_
				29	#define V8_STRING_SEARCH_H_
				30
				31	namespace v8 {
				32	namespace internal {
				33
				34
				35	// Cap on the maximal shift in the Boyer-Moore implementation. By setting a
				36	// limit, we can fix the size of tables. For a needle longer than this limit,
				37	// search will not be optimal, since we only build tables for a smaller suffix
				38	// of the string, which is a safe approximation.
				39	static const int kBMMaxShift = 250;
				40	// Reduce alphabet to this size.
				41	// One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
				42	// proportional to the input alphabet. We reduce the alphabet size by
				43	// equating input characters modulo a smaller alphabet size. This gives
				44	// a potentially less efficient searching, but is a safe approximation.
				45	// For needles using only characters in the same Unicode 256-code point page,
				46	// there is no search speed degradation.
				47	static const int kBMAlphabetSize = 256;
				48	// For patterns below this length, the skip length of Boyer-Moore is too short
				49	// to compensate for the algorithmic overhead compared to simple brute force.
				50	static const int kBMMinPatternLength = 7;
				51
				52	// Holds the two buffers used by Boyer-Moore string search's Good Suffix
				53	// shift. Only allows the last kBMMaxShift characters of the needle
				54	// to be indexed.
				55	class BMGoodSuffixBuffers {
				56	public:
				57	BMGoodSuffixBuffers() {}
				58	inline void Initialize(int needle_length) {
				59	ASSERT(needle_length > 1);
				60	int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;
				61	int len = needle_length - start;
				62	biased_suffixes_ = suffixes_ - start;
				63	biased_good_suffix_shift_ = good_suffix_shift_ - start;
				64	for (int i = 0; i <= len; i++) {
				65	good_suffix_shift_[i] = len;
				66	}
				67	}
				68	inline int& suffix(int index) {
				69	ASSERT(biased_suffixes_ + index >= suffixes_);
				70	return biased_suffixes_[index];
				71	}
				72	inline int& shift(int index) {
				73	ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);
				74	return biased_good_suffix_shift_[index];
				75	}
				76	private:
				77	int suffixes_[kBMMaxShift + 1];
				78	int good_suffix_shift_[kBMMaxShift + 1];
				79	int* biased_suffixes_;
				80	int* biased_good_suffix_shift_;
				81	DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);
				82	};
				83
				84	// buffers reused by BoyerMoore
				85	struct BMBuffers {
				86	public:
				87	static int bad_char_occurrence[kBMAlphabetSize];
				88	static BMGoodSuffixBuffers bmgs_buffers;
				89	};
				90
				91	// State of the string match tables.
				92	// SIMPLE: No usable content in the buffers.
				93	// BOYER_MOORE_HORSPOOL: The bad_char_occurence table has been populated.
				94	// BOYER_MOORE: The bmgs_buffers tables have also been populated.
				95	// Whenever starting with a new needle, one should call InitializeStringSearch
				96	// to determine which search strategy to use, and in the case of a long-needle
				97	// strategy, the call also initializes the algorithm to SIMPLE.
				98	enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE };
				99	static StringSearchAlgorithm algorithm;
				100
				101
				102	// Compute the bad-char table for Boyer-Moore in the static buffer.
				103	template <typename PatternChar>
				104	static void BoyerMoorePopulateBadCharTable(Vector<const PatternChar> pattern) {
				105	// Only preprocess at most kBMMaxShift last characters of pattern.
				106	int start = Max(pattern.length() - kBMMaxShift, 0);
				107	// Run forwards to populate bad_char_table, so that last instance
				108	// of character equivalence class is the one registered.
				109	// Notice: Doesn't include the last character.
				110	int table_size = (sizeof(PatternChar) == 1) ? String::kMaxAsciiCharCode + 1
				111	: kBMAlphabetSize;
				112	if (start == 0) { // All patterns less than kBMMaxShift in length.
				113	memset(BMBuffers::bad_char_occurrence,
				114	-1,
				115	table_size * sizeof(*BMBuffers::bad_char_occurrence));
				116	} else {
				117	for (int i = 0; i < table_size; i++) {
				118	BMBuffers::bad_char_occurrence[i] = start - 1;
				119	}
				120	}
				121	for (int i = start; i < pattern.length() - 1; i++) {
				122	PatternChar c = pattern[i];
				123	int bucket = (sizeof(PatternChar) ==1) ? c : c % kBMAlphabetSize;
				124	BMBuffers::bad_char_occurrence[bucket] = i;
				125	}
				126	}
				127
				128
				129	template <typename PatternChar>
				130	static void BoyerMoorePopulateGoodSuffixTable(
				131	Vector<const PatternChar> pattern) {
				132	int m = pattern.length();
				133	int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
				134	int len = m - start;
				135	// Compute Good Suffix tables.
				136	BMBuffers::bmgs_buffers.Initialize(m);
				137
				138	BMBuffers::bmgs_buffers.shift(m-1) = 1;
				139	BMBuffers::bmgs_buffers.suffix(m) = m + 1;
				140	PatternChar last_char = pattern[m - 1];
				141	int suffix = m + 1;
				142	{
				143	int i = m;
				144	while (i > start) {
				145	PatternChar c = pattern[i - 1];
				146	while (suffix <= m && c != pattern[suffix - 1]) {
				147	if (BMBuffers::bmgs_buffers.shift(suffix) == len) {
				148	BMBuffers::bmgs_buffers.shift(suffix) = suffix - i;
				149	}
				150	suffix = BMBuffers::bmgs_buffers.suffix(suffix);
				151	}
				152	BMBuffers::bmgs_buffers.suffix(--i) = --suffix;
				153	if (suffix == m) {
				154	// No suffix to extend, so we check against last_char only.
				155	while ((i > start) && (pattern[i - 1] != last_char)) {
				156	if (BMBuffers::bmgs_buffers.shift(m) == len) {
				157	BMBuffers::bmgs_buffers.shift(m) = m - i;
				158	}
				159	BMBuffers::bmgs_buffers.suffix(--i) = m;
				160	}
				161	if (i > start) {
				162	BMBuffers::bmgs_buffers.suffix(--i) = --suffix;
				163	}
				164	}
				165	}
				166	}
				167	if (suffix < m) {
				168	for (int i = start; i <= m; i++) {
				169	if (BMBuffers::bmgs_buffers.shift(i) == len) {
				170	BMBuffers::bmgs_buffers.shift(i) = suffix - start;
				171	}
				172	if (i == suffix) {
				173	suffix = BMBuffers::bmgs_buffers.suffix(suffix);
				174	}
				175	}
				176	}
				177	}
				178
				179
				180	template <typename SubjectChar, typename PatternChar>
				181	static inline int CharOccurrence(int char_code) {
				182	if (sizeof(SubjectChar) == 1) {
				183	return BMBuffers::bad_char_occurrence[char_code];
				184	}
				185	if (sizeof(PatternChar) == 1) {
				186	if (char_code > String::kMaxAsciiCharCode) {
				187	return -1;
				188	}
				189	return BMBuffers::bad_char_occurrence[char_code];
				190	}
				191	return BMBuffers::bad_char_occurrence[char_code % kBMAlphabetSize];
				192	}
				193
				194
				195	// Restricted simplified Boyer-Moore string matching.
				196	// Uses only the bad-shift table of Boyer-Moore and only uses it
				197	// for the character compared to the last character of the needle.
				198	template <typename SubjectChar, typename PatternChar>
				199	static int BoyerMooreHorspool(Vector<const SubjectChar> subject,
				200	Vector<const PatternChar> pattern,
				201	int start_index,
				202	bool* complete) {
				203	ASSERT(algorithm <= BOYER_MOORE_HORSPOOL);
				204	int n = subject.length();
				205	int m = pattern.length();
				206
				207	int badness = -m;
				208
				209	// How bad we are doing without a good-suffix table.
				210	int idx; // No matches found prior to this index.
				211	PatternChar last_char = pattern[m - 1];
				212	int last_char_shift =
				213	m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);
				214	// Perform search
				215	for (idx = start_index; idx <= n - m;) {
				216	int j = m - 1;
				217	int c;
				218	while (last_char != (c = subject[idx + j])) {
				219	int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);
				220	int shift = j - bc_occ;
				221	idx += shift;
				222	badness += 1 - shift; // at most zero, so badness cannot increase.
				223	if (idx > n - m) {
				224	*complete = true;
				225	return -1;
				226	}
				227	}
				228	j--;
				229	while (j >= 0 && pattern[j] == (subject[idx + j])) j--;
				230	if (j < 0) {
				231	*complete = true;
				232	return idx;
				233	} else {
				234	idx += last_char_shift;
				235	// Badness increases by the number of characters we have
				236	// checked, and decreases by the number of characters we
				237	// can skip by shifting. It's a measure of how we are doing
				238	// compared to reading each character exactly once.
				239	badness += (m - j) - last_char_shift;
				240	if (badness > 0) {
				241	*complete = false;
				242	return idx;
				243	}
				244	}
				245	}
				246	*complete = true;
				247	return -1;
				248	}
				249
				250
				251	template <typename SubjectChar, typename PatternChar>
				252	static int BoyerMooreIndexOf(Vector<const SubjectChar> subject,
				253	Vector<const PatternChar> pattern,
				254	int idx) {
				255	ASSERT(algorithm <= BOYER_MOORE);
				256	int n = subject.length();
				257	int m = pattern.length();
				258	// Only preprocess at most kBMMaxShift last characters of pattern.
				259	int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
				260
				261	PatternChar last_char = pattern[m - 1];
				262	// Continue search from i.
				263	while (idx <= n - m) {
				264	int j = m - 1;
				265	SubjectChar c;
				266	while (last_char != (c = subject[idx + j])) {
				267	int shift = j - CharOccurrence<SubjectChar, PatternChar>(c);
				268	idx += shift;
				269	if (idx > n - m) {
				270	return -1;
				271	}
				272	}
				273	while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
				274	if (j < 0) {
				275	return idx;
				276	} else if (j < start) {
				277	// we have matched more than our tables allow us to be smart about.
				278	// Fall back on BMH shift.
				279	idx += m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);
				280	} else {
				281	int gs_shift = BMBuffers::bmgs_buffers.shift(j + 1);
				282	int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);
				283	int shift = j - bc_occ;
				284	if (gs_shift > shift) {
				285	shift = gs_shift;
				286	}
				287	idx += shift;
				288	}
				289	}
				290
				291	return -1;
				292	}
				293
				294
				295	// Trivial string search for shorter strings.
				296	// On return, if "complete" is set to true, the return value is the
				297	// final result of searching for the patter in the subject.
				298	// If "complete" is set to false, the return value is the index where
				299	// further checking should start, i.e., it's guaranteed that the pattern
				300	// does not occur at a position prior to the returned index.
				301	template <typename PatternChar, typename SubjectChar>
				302	static int SimpleIndexOf(Vector<const SubjectChar> subject,
				303	Vector<const PatternChar> pattern,
				304	int idx,
				305	bool* complete) {
				306	ASSERT(pattern.length() > 1);
				307	int pattern_length = pattern.length();
				308	// Badness is a count of how much work we have done. When we have
				309	// done enough work we decide it's probably worth switching to a better
				310	// algorithm.
				311	int badness = -10 - (pattern_length << 2);
				312
				313	// We know our pattern is at least 2 characters, we cache the first so
				314	// the common case of the first character not matching is faster.
				315	PatternChar pattern_first_char = pattern[0];
				316	for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
				317	badness++;
				318	if (badness > 0) {
				319	*complete = false;
				320	return i;
				321	}
				322	if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
				323	const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
				324	memchr(subject.start() + i,
				325	pattern_first_char,
				326	n - i + 1));
				327	if (pos == NULL) {
				328	*complete = true;
				329	return -1;
				330	}
				331	i = static_cast<int>(pos - subject.start());
				332	} else {
				333	if (subject[i] != pattern_first_char) continue;
				334	}
				335	int j = 1;
				336	do {
				337	if (pattern[j] != subject[i+j]) {
				338	break;
				339	}
				340	j++;
				341	} while (j < pattern_length);
				342	if (j == pattern_length) {
				343	*complete = true;
				344	return i;
				345	}
				346	badness += j;
				347	}
				348	*complete = true;
				349	return -1;
				350	}
				351
				352	// Simple indexOf that never bails out. For short patterns only.
				353	template <typename PatternChar, typename SubjectChar>
				354	static int SimpleIndexOf(Vector<const SubjectChar> subject,
				355	Vector<const PatternChar> pattern,
				356	int idx) {
				357	int pattern_length = pattern.length();
				358	PatternChar pattern_first_char = pattern[0];
				359	for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
				360	if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
				361	const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
				362	memchr(subject.start() + i,
				363	pattern_first_char,
				364	n - i + 1));
				365	if (pos == NULL) return -1;
				366	i = static_cast<int>(pos - subject.start());
				367	} else {
				368	if (subject[i] != pattern_first_char) continue;
				369	}
				370	int j = 1;
				371	while (j < pattern_length) {
				372	if (pattern[j] != subject[i+j]) {
				373	break;
				374	}
				375	j++;
				376	}
				377	if (j == pattern_length) {
				378	return i;
				379	}
				380	}
				381	return -1;
				382	}
				383
				384
				385	// Strategy for searching for a string in another string.
				386	enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG };
				387
				388
				389	template <typename PatternChar>
				390	static inline StringSearchStrategy InitializeStringSearch(
				391	Vector<const PatternChar> pat, bool ascii_subject) {
				392	// We have an ASCII haystack and a non-ASCII needle. Check if there
				393	// really is a non-ASCII character in the needle and bail out if there
				394	// is.
				395	if (ascii_subject && sizeof(PatternChar) > 1) {
				396	for (int i = 0; i < pat.length(); i++) {
				397	uc16 c = pat[i];
				398	if (c > String::kMaxAsciiCharCode) {
				399	return SEARCH_FAIL;
				400	}
				401	}
				402	}
				403	if (pat.length() < kBMMinPatternLength) {
				404	return SEARCH_SHORT;
				405	}
				406	algorithm = SIMPLE_SEARCH;
				407	return SEARCH_LONG;
				408	}
				409
				410
				411	// Dispatch long needle searches to different algorithms.
				412	template <typename SubjectChar, typename PatternChar>
				413	static int ComplexIndexOf(Vector<const SubjectChar> sub,
				414	Vector<const PatternChar> pat,
				415	int start_index) {
				416	ASSERT(pat.length() >= kBMMinPatternLength);
				417	// Try algorithms in order of increasing setup cost and expected performance.
				418	bool complete;
				419	int idx = start_index;
				420	switch (algorithm) {
				421	case SIMPLE_SEARCH:
				422	idx = SimpleIndexOf(sub, pat, idx, &complete);
				423	if (complete) return idx;
				424	BoyerMoorePopulateBadCharTable(pat);
				425	algorithm = BOYER_MOORE_HORSPOOL;
				426	// FALLTHROUGH.
				427	case BOYER_MOORE_HORSPOOL:
				428	idx = BoyerMooreHorspool(sub, pat, idx, &complete);
				429	if (complete) return idx;
				430	// Build the Good Suffix table and continue searching.
				431	BoyerMoorePopulateGoodSuffixTable(pat);
				432	algorithm = BOYER_MOORE;
				433	// FALLTHROUGH.
				434	case BOYER_MOORE:
				435	return BoyerMooreIndexOf(sub, pat, idx);
				436	}
				437	UNREACHABLE();
				438	return -1;
				439	}
				440
				441
				442	// Dispatch to different search strategies for a single search.
				443	// If searching multiple times on the same needle, the search
				444	// strategy should only be computed once and then dispatch to different
				445	// loops.
				446	template <typename SubjectChar, typename PatternChar>
				447	static int StringSearch(Vector<const SubjectChar> sub,
				448	Vector<const PatternChar> pat,
				449	int start_index) {
				450	bool ascii_subject = (sizeof(SubjectChar) == 1);
				451	StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject);
				452	switch (strategy) {
				453	case SEARCH_FAIL: return -1;
				454	case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index);
				455	case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index);
				456	}
				457	UNREACHABLE();
				458	return -1;
				459	}
				460
				461	}} // namespace v8::internal
				462
				463	#endif // V8_STRING_SEARCH_H_