blob: d7959c0be982c35208c14cfc8803ec00a7f14a9b [file] [log] [blame]
Steve Block59151502010-09-22 15:07:15 +01001// Copyright 2010 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_STRING_SEARCH_H_
29#define V8_STRING_SEARCH_H_
30
31namespace v8 {
32namespace internal {
33
34
35// Cap on the maximal shift in the Boyer-Moore implementation. By setting a
36// limit, we can fix the size of tables. For a needle longer than this limit,
37// search will not be optimal, since we only build tables for a smaller suffix
38// of the string, which is a safe approximation.
39static const int kBMMaxShift = 250;
40// Reduce alphabet to this size.
41// One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
42// proportional to the input alphabet. We reduce the alphabet size by
43// equating input characters modulo a smaller alphabet size. This gives
44// a potentially less efficient searching, but is a safe approximation.
45// For needles using only characters in the same Unicode 256-code point page,
46// there is no search speed degradation.
47static const int kBMAlphabetSize = 256;
48// For patterns below this length, the skip length of Boyer-Moore is too short
49// to compensate for the algorithmic overhead compared to simple brute force.
50static const int kBMMinPatternLength = 7;
51
52// Holds the two buffers used by Boyer-Moore string search's Good Suffix
53// shift. Only allows the last kBMMaxShift characters of the needle
54// to be indexed.
55class BMGoodSuffixBuffers {
56 public:
57 BMGoodSuffixBuffers() {}
58 inline void Initialize(int needle_length) {
59 ASSERT(needle_length > 1);
60 int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;
61 int len = needle_length - start;
62 biased_suffixes_ = suffixes_ - start;
63 biased_good_suffix_shift_ = good_suffix_shift_ - start;
64 for (int i = 0; i <= len; i++) {
65 good_suffix_shift_[i] = len;
66 }
67 }
68 inline int& suffix(int index) {
69 ASSERT(biased_suffixes_ + index >= suffixes_);
70 return biased_suffixes_[index];
71 }
72 inline int& shift(int index) {
73 ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);
74 return biased_good_suffix_shift_[index];
75 }
76 private:
77 int suffixes_[kBMMaxShift + 1];
78 int good_suffix_shift_[kBMMaxShift + 1];
79 int* biased_suffixes_;
80 int* biased_good_suffix_shift_;
81 DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);
82};
83
84// buffers reused by BoyerMoore
85struct BMBuffers {
86 public:
87 static int bad_char_occurrence[kBMAlphabetSize];
88 static BMGoodSuffixBuffers bmgs_buffers;
89};
90
91// State of the string match tables.
92// SIMPLE: No usable content in the buffers.
93// BOYER_MOORE_HORSPOOL: The bad_char_occurence table has been populated.
94// BOYER_MOORE: The bmgs_buffers tables have also been populated.
95// Whenever starting with a new needle, one should call InitializeStringSearch
96// to determine which search strategy to use, and in the case of a long-needle
97// strategy, the call also initializes the algorithm to SIMPLE.
98enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE };
99static StringSearchAlgorithm algorithm;
100
101
102// Compute the bad-char table for Boyer-Moore in the static buffer.
103template <typename PatternChar>
104static void BoyerMoorePopulateBadCharTable(Vector<const PatternChar> pattern) {
105 // Only preprocess at most kBMMaxShift last characters of pattern.
106 int start = Max(pattern.length() - kBMMaxShift, 0);
107 // Run forwards to populate bad_char_table, so that *last* instance
108 // of character equivalence class is the one registered.
109 // Notice: Doesn't include the last character.
110 int table_size = (sizeof(PatternChar) == 1) ? String::kMaxAsciiCharCode + 1
111 : kBMAlphabetSize;
112 if (start == 0) { // All patterns less than kBMMaxShift in length.
113 memset(BMBuffers::bad_char_occurrence,
114 -1,
115 table_size * sizeof(*BMBuffers::bad_char_occurrence));
116 } else {
117 for (int i = 0; i < table_size; i++) {
118 BMBuffers::bad_char_occurrence[i] = start - 1;
119 }
120 }
121 for (int i = start; i < pattern.length() - 1; i++) {
122 PatternChar c = pattern[i];
123 int bucket = (sizeof(PatternChar) ==1) ? c : c % kBMAlphabetSize;
124 BMBuffers::bad_char_occurrence[bucket] = i;
125 }
126}
127
128
129template <typename PatternChar>
130static void BoyerMoorePopulateGoodSuffixTable(
131 Vector<const PatternChar> pattern) {
132 int m = pattern.length();
133 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
134 int len = m - start;
135 // Compute Good Suffix tables.
136 BMBuffers::bmgs_buffers.Initialize(m);
137
138 BMBuffers::bmgs_buffers.shift(m-1) = 1;
139 BMBuffers::bmgs_buffers.suffix(m) = m + 1;
140 PatternChar last_char = pattern[m - 1];
141 int suffix = m + 1;
142 {
143 int i = m;
144 while (i > start) {
145 PatternChar c = pattern[i - 1];
146 while (suffix <= m && c != pattern[suffix - 1]) {
147 if (BMBuffers::bmgs_buffers.shift(suffix) == len) {
148 BMBuffers::bmgs_buffers.shift(suffix) = suffix - i;
149 }
150 suffix = BMBuffers::bmgs_buffers.suffix(suffix);
151 }
152 BMBuffers::bmgs_buffers.suffix(--i) = --suffix;
153 if (suffix == m) {
154 // No suffix to extend, so we check against last_char only.
155 while ((i > start) && (pattern[i - 1] != last_char)) {
156 if (BMBuffers::bmgs_buffers.shift(m) == len) {
157 BMBuffers::bmgs_buffers.shift(m) = m - i;
158 }
159 BMBuffers::bmgs_buffers.suffix(--i) = m;
160 }
161 if (i > start) {
162 BMBuffers::bmgs_buffers.suffix(--i) = --suffix;
163 }
164 }
165 }
166 }
167 if (suffix < m) {
168 for (int i = start; i <= m; i++) {
169 if (BMBuffers::bmgs_buffers.shift(i) == len) {
170 BMBuffers::bmgs_buffers.shift(i) = suffix - start;
171 }
172 if (i == suffix) {
173 suffix = BMBuffers::bmgs_buffers.suffix(suffix);
174 }
175 }
176 }
177}
178
179
180template <typename SubjectChar, typename PatternChar>
181static inline int CharOccurrence(int char_code) {
182 if (sizeof(SubjectChar) == 1) {
183 return BMBuffers::bad_char_occurrence[char_code];
184 }
185 if (sizeof(PatternChar) == 1) {
186 if (char_code > String::kMaxAsciiCharCode) {
187 return -1;
188 }
189 return BMBuffers::bad_char_occurrence[char_code];
190 }
191 return BMBuffers::bad_char_occurrence[char_code % kBMAlphabetSize];
192}
193
194
195// Restricted simplified Boyer-Moore string matching.
196// Uses only the bad-shift table of Boyer-Moore and only uses it
197// for the character compared to the last character of the needle.
198template <typename SubjectChar, typename PatternChar>
199static int BoyerMooreHorspool(Vector<const SubjectChar> subject,
200 Vector<const PatternChar> pattern,
201 int start_index,
202 bool* complete) {
203 ASSERT(algorithm <= BOYER_MOORE_HORSPOOL);
204 int n = subject.length();
205 int m = pattern.length();
206
207 int badness = -m;
208
209 // How bad we are doing without a good-suffix table.
210 int idx; // No matches found prior to this index.
211 PatternChar last_char = pattern[m - 1];
212 int last_char_shift =
213 m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);
214 // Perform search
215 for (idx = start_index; idx <= n - m;) {
216 int j = m - 1;
217 int c;
218 while (last_char != (c = subject[idx + j])) {
219 int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);
220 int shift = j - bc_occ;
221 idx += shift;
222 badness += 1 - shift; // at most zero, so badness cannot increase.
223 if (idx > n - m) {
224 *complete = true;
225 return -1;
226 }
227 }
228 j--;
229 while (j >= 0 && pattern[j] == (subject[idx + j])) j--;
230 if (j < 0) {
231 *complete = true;
232 return idx;
233 } else {
234 idx += last_char_shift;
235 // Badness increases by the number of characters we have
236 // checked, and decreases by the number of characters we
237 // can skip by shifting. It's a measure of how we are doing
238 // compared to reading each character exactly once.
239 badness += (m - j) - last_char_shift;
240 if (badness > 0) {
241 *complete = false;
242 return idx;
243 }
244 }
245 }
246 *complete = true;
247 return -1;
248}
249
250
251template <typename SubjectChar, typename PatternChar>
252static int BoyerMooreIndexOf(Vector<const SubjectChar> subject,
253 Vector<const PatternChar> pattern,
254 int idx) {
255 ASSERT(algorithm <= BOYER_MOORE);
256 int n = subject.length();
257 int m = pattern.length();
258 // Only preprocess at most kBMMaxShift last characters of pattern.
259 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
260
261 PatternChar last_char = pattern[m - 1];
262 // Continue search from i.
263 while (idx <= n - m) {
264 int j = m - 1;
265 SubjectChar c;
266 while (last_char != (c = subject[idx + j])) {
267 int shift = j - CharOccurrence<SubjectChar, PatternChar>(c);
268 idx += shift;
269 if (idx > n - m) {
270 return -1;
271 }
272 }
273 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
274 if (j < 0) {
275 return idx;
276 } else if (j < start) {
277 // we have matched more than our tables allow us to be smart about.
278 // Fall back on BMH shift.
279 idx += m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);
280 } else {
281 int gs_shift = BMBuffers::bmgs_buffers.shift(j + 1);
282 int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);
283 int shift = j - bc_occ;
284 if (gs_shift > shift) {
285 shift = gs_shift;
286 }
287 idx += shift;
288 }
289 }
290
291 return -1;
292}
293
294
295// Trivial string search for shorter strings.
296// On return, if "complete" is set to true, the return value is the
297// final result of searching for the patter in the subject.
298// If "complete" is set to false, the return value is the index where
299// further checking should start, i.e., it's guaranteed that the pattern
300// does not occur at a position prior to the returned index.
301template <typename PatternChar, typename SubjectChar>
302static int SimpleIndexOf(Vector<const SubjectChar> subject,
303 Vector<const PatternChar> pattern,
304 int idx,
305 bool* complete) {
306 ASSERT(pattern.length() > 1);
307 int pattern_length = pattern.length();
308 // Badness is a count of how much work we have done. When we have
309 // done enough work we decide it's probably worth switching to a better
310 // algorithm.
311 int badness = -10 - (pattern_length << 2);
312
313 // We know our pattern is at least 2 characters, we cache the first so
314 // the common case of the first character not matching is faster.
315 PatternChar pattern_first_char = pattern[0];
316 for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
317 badness++;
318 if (badness > 0) {
319 *complete = false;
320 return i;
321 }
322 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
323 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
324 memchr(subject.start() + i,
325 pattern_first_char,
326 n - i + 1));
327 if (pos == NULL) {
328 *complete = true;
329 return -1;
330 }
331 i = static_cast<int>(pos - subject.start());
332 } else {
333 if (subject[i] != pattern_first_char) continue;
334 }
335 int j = 1;
336 do {
337 if (pattern[j] != subject[i+j]) {
338 break;
339 }
340 j++;
341 } while (j < pattern_length);
342 if (j == pattern_length) {
343 *complete = true;
344 return i;
345 }
346 badness += j;
347 }
348 *complete = true;
349 return -1;
350}
351
352// Simple indexOf that never bails out. For short patterns only.
353template <typename PatternChar, typename SubjectChar>
354static int SimpleIndexOf(Vector<const SubjectChar> subject,
355 Vector<const PatternChar> pattern,
356 int idx) {
357 int pattern_length = pattern.length();
358 PatternChar pattern_first_char = pattern[0];
359 for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
360 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
361 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
362 memchr(subject.start() + i,
363 pattern_first_char,
364 n - i + 1));
365 if (pos == NULL) return -1;
366 i = static_cast<int>(pos - subject.start());
367 } else {
368 if (subject[i] != pattern_first_char) continue;
369 }
370 int j = 1;
371 while (j < pattern_length) {
372 if (pattern[j] != subject[i+j]) {
373 break;
374 }
375 j++;
376 }
377 if (j == pattern_length) {
378 return i;
379 }
380 }
381 return -1;
382}
383
384
385// Strategy for searching for a string in another string.
386enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG };
387
388
389template <typename PatternChar>
390static inline StringSearchStrategy InitializeStringSearch(
391 Vector<const PatternChar> pat, bool ascii_subject) {
392 // We have an ASCII haystack and a non-ASCII needle. Check if there
393 // really is a non-ASCII character in the needle and bail out if there
394 // is.
395 if (ascii_subject && sizeof(PatternChar) > 1) {
396 for (int i = 0; i < pat.length(); i++) {
397 uc16 c = pat[i];
398 if (c > String::kMaxAsciiCharCode) {
399 return SEARCH_FAIL;
400 }
401 }
402 }
403 if (pat.length() < kBMMinPatternLength) {
404 return SEARCH_SHORT;
405 }
406 algorithm = SIMPLE_SEARCH;
407 return SEARCH_LONG;
408}
409
410
411// Dispatch long needle searches to different algorithms.
412template <typename SubjectChar, typename PatternChar>
413static int ComplexIndexOf(Vector<const SubjectChar> sub,
414 Vector<const PatternChar> pat,
415 int start_index) {
416 ASSERT(pat.length() >= kBMMinPatternLength);
417 // Try algorithms in order of increasing setup cost and expected performance.
418 bool complete;
419 int idx = start_index;
420 switch (algorithm) {
421 case SIMPLE_SEARCH:
422 idx = SimpleIndexOf(sub, pat, idx, &complete);
423 if (complete) return idx;
424 BoyerMoorePopulateBadCharTable(pat);
425 algorithm = BOYER_MOORE_HORSPOOL;
426 // FALLTHROUGH.
427 case BOYER_MOORE_HORSPOOL:
428 idx = BoyerMooreHorspool(sub, pat, idx, &complete);
429 if (complete) return idx;
430 // Build the Good Suffix table and continue searching.
431 BoyerMoorePopulateGoodSuffixTable(pat);
432 algorithm = BOYER_MOORE;
433 // FALLTHROUGH.
434 case BOYER_MOORE:
435 return BoyerMooreIndexOf(sub, pat, idx);
436 }
437 UNREACHABLE();
438 return -1;
439}
440
441
442// Dispatch to different search strategies for a single search.
443// If searching multiple times on the same needle, the search
444// strategy should only be computed once and then dispatch to different
445// loops.
446template <typename SubjectChar, typename PatternChar>
447static int StringSearch(Vector<const SubjectChar> sub,
448 Vector<const PatternChar> pat,
449 int start_index) {
450 bool ascii_subject = (sizeof(SubjectChar) == 1);
451 StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject);
452 switch (strategy) {
453 case SEARCH_FAIL: return -1;
454 case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index);
455 case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index);
456 }
457 UNREACHABLE();
458 return -1;
459}
460
461}} // namespace v8::internal
462
463#endif // V8_STRING_SEARCH_H_