blob: eac84757ecf1d45458133442c5cfc200627cd691 [file] [log] [blame]
Steve Block59151502010-09-22 15:07:15 +01001// Copyright 2010 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_STRING_SEARCH_H_
29#define V8_STRING_SEARCH_H_
30
31namespace v8 {
32namespace internal {
33
34
Ben Murdochf87a2032010-10-22 12:50:53 +010035//---------------------------------------------------------------------
36// String Search object.
37//---------------------------------------------------------------------
Steve Block59151502010-09-22 15:07:15 +010038
Ben Murdochf87a2032010-10-22 12:50:53 +010039// Class holding constants and methods that apply to all string search variants,
40// independently of subject and pattern char size.
41class StringSearchBase {
42 protected:
43 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
44 // limit, we can fix the size of tables. For a needle longer than this limit,
45 // search will not be optimal, since we only build tables for a suffix
46 // of the string, but it is a safe approximation.
47 static const int kBMMaxShift = 250;
48
49 // Reduce alphabet to this size.
50 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
51 // proportional to the input alphabet. We reduce the alphabet size by
52 // equating input characters modulo a smaller alphabet size. This gives
53 // a potentially less efficient searching, but is a safe approximation.
54 // For needles using only characters in the same Unicode 256-code point page,
55 // there is no search speed degradation.
56 static const int kAsciiAlphabetSize = 128;
57 static const int kUC16AlphabetSize = 256;
58
59 // Bad-char shift table stored in the state. It's length is the alphabet size.
60 // For patterns below this length, the skip length of Boyer-Moore is too short
61 // to compensate for the algorithmic overhead compared to simple brute force.
62 static const int kBMMinPatternLength = 7;
63
64 static inline bool IsAsciiString(Vector<const char>) {
65 return true;
66 }
67
68 static inline bool IsAsciiString(Vector<const uc16> string) {
69 for (int i = 0, n = string.length(); i < n; i++) {
70 if (static_cast<unsigned>(string[i]) > String::kMaxAsciiCharCodeU) {
71 return false;
72 }
73 }
74 return true;
75 }
76
77 // The following tables are shared by all searches.
78 // TODO(lrn): Introduce a way for a pattern to keep its tables
79 // between searches (e.g., for an Atom RegExp).
80
81 // Store for the BoyerMoore(Horspool) bad char shift table.
82 static int kBadCharShiftTable[kUC16AlphabetSize];
83 // Store for the BoyerMoore good suffix shift table.
84 static int kGoodSuffixShiftTable[kBMMaxShift + 1];
85 // Table used temporarily while building the BoyerMoore good suffix
86 // shift table.
87 static int kSuffixTable[kBMMaxShift + 1];
88};
89
90
91template <typename PatternChar, typename SubjectChar>
92class StringSearch : private StringSearchBase {
Steve Block59151502010-09-22 15:07:15 +010093 public:
Ben Murdochf87a2032010-10-22 12:50:53 +010094 explicit StringSearch(Vector<const PatternChar> pattern)
95 : pattern_(pattern),
96 start_(Max(0, pattern.length() - kBMMaxShift)) {
97 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
98 if (!IsAsciiString(pattern_)) {
99 strategy_ = &FailSearch;
100 return;
101 }
102 }
103 int pattern_length = pattern_.length();
104 if (pattern_length < kBMMinPatternLength) {
105 if (pattern_length == 1) {
106 strategy_ = &SingleCharSearch;
107 return;
108 }
109 strategy_ = &LinearSearch;
110 return;
111 }
112 strategy_ = &InitialSearch;
113 }
114
115 int Search(Vector<const SubjectChar> subject, int index) {
116 return strategy_(this, subject, index);
117 }
118
119 static inline int AlphabetSize() {
120 if (sizeof(PatternChar) == 1) {
121 // ASCII needle.
122 return kAsciiAlphabetSize;
123 } else {
124 ASSERT(sizeof(PatternChar) == 2);
125 // UC16 needle.
126 return kUC16AlphabetSize;
Steve Block59151502010-09-22 15:07:15 +0100127 }
128 }
Ben Murdochf87a2032010-10-22 12:50:53 +0100129
Steve Block59151502010-09-22 15:07:15 +0100130 private:
Ben Murdochf87a2032010-10-22 12:50:53 +0100131 typedef int (*SearchFunction)( // NOLINT - it's not a cast!
132 StringSearch<PatternChar, SubjectChar>*,
133 Vector<const SubjectChar>,
134 int);
135
136 static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
137 Vector<const SubjectChar>,
138 int) {
139 return -1;
140 }
141
142 static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
143 Vector<const SubjectChar> subject,
144 int start_index);
145
146 static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
147 Vector<const SubjectChar> subject,
148 int start_index);
149
150 static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
151 Vector<const SubjectChar> subject,
152 int start_index);
153
154 static int BoyerMooreHorspoolSearch(
155 StringSearch<PatternChar, SubjectChar>* search,
156 Vector<const SubjectChar> subject,
157 int start_index);
158
159 static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
160 Vector<const SubjectChar> subject,
161 int start_index);
162
163 void PopulateBoyerMooreHorspoolTable();
164
165 void PopulateBoyerMooreTable();
166
167 static inline int CharOccurrence(int* bad_char_occurrence,
168 SubjectChar char_code) {
169 if (sizeof(SubjectChar) == 1) {
170 return bad_char_occurrence[static_cast<int>(char_code)];
171 }
172 if (sizeof(PatternChar) == 1) {
173 if (static_cast<unsigned int>(char_code) > String::kMaxAsciiCharCodeU) {
174 return -1;
175 }
176 return bad_char_occurrence[static_cast<unsigned int>(char_code)];
177 }
178 // Both pattern and subject are UC16. Reduce character to equivalence class.
179 int equiv_class = char_code % kUC16AlphabetSize;
180 return bad_char_occurrence[equiv_class];
181 }
182
183 // Return a table covering the last kBMMaxShift+1 positions of
184 // pattern.
185 int* bad_char_table() {
186 return kBadCharShiftTable;
187 }
188
189 int* good_suffix_shift_table() {
190 // Return biased pointer that maps the range [start_..pattern_.length()
191 // to the kGoodSuffixShiftTable array.
192 return kGoodSuffixShiftTable - start_;
193 }
194
195 int* suffix_table() {
196 // Return biased pointer that maps the range [start_..pattern_.length()
197 // to the kSuffixTable array.
198 return kSuffixTable - start_;
199 }
200
201 // The pattern to search for.
202 Vector<const PatternChar> pattern_;
203 // Pointer to implementation of the search.
204 SearchFunction strategy_;
205 // Cache value of Max(0, pattern_length() - kBMMaxShift)
206 int start_;
Steve Block59151502010-09-22 15:07:15 +0100207};
208
Steve Block59151502010-09-22 15:07:15 +0100209
Ben Murdochf87a2032010-10-22 12:50:53 +0100210//---------------------------------------------------------------------
211// Single Character Pattern Search Strategy
212//---------------------------------------------------------------------
Steve Block59151502010-09-22 15:07:15 +0100213
Ben Murdochf87a2032010-10-22 12:50:53 +0100214template <typename PatternChar, typename SubjectChar>
215int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
216 StringSearch<PatternChar, SubjectChar>* search,
217 Vector<const SubjectChar> subject,
218 int index) {
219 ASSERT_EQ(1, search->pattern_.length());
220 PatternChar pattern_first_char = search->pattern_[0];
221 int i = index;
222 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
223 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
224 memchr(subject.start() + i,
225 pattern_first_char,
226 subject.length() - i));
227 if (pos == NULL) return -1;
228 return static_cast<int>(pos - subject.start());
Steve Block59151502010-09-22 15:07:15 +0100229 } else {
Ben Murdochf87a2032010-10-22 12:50:53 +0100230 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
231 if (static_cast<uc16>(pattern_first_char) > String::kMaxAsciiCharCodeU) {
Steve Block59151502010-09-22 15:07:15 +0100232 return -1;
233 }
234 }
Ben Murdochf87a2032010-10-22 12:50:53 +0100235 SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
236 int n = subject.length();
237 while (i < n) {
238 if (subject[i++] == search_char) return i - 1;
239 }
240 return -1;
241 }
242}
243
244//---------------------------------------------------------------------
245// Linear Search Strategy
246//---------------------------------------------------------------------
247
248
249template <typename PatternChar, typename SubjectChar>
250static inline bool CharCompare(const PatternChar* pattern,
251 const SubjectChar* subject,
252 int length) {
253 ASSERT(length > 0);
254 int pos = 0;
255 do {
256 if (pattern[pos] != subject[pos]) {
257 return false;
258 }
259 pos++;
260 } while (pos < length);
261 return true;
262}
263
264
265// Simple linear search for short patterns. Never bails out.
266template <typename PatternChar, typename SubjectChar>
267int StringSearch<PatternChar, SubjectChar>::LinearSearch(
268 StringSearch<PatternChar, SubjectChar>* search,
269 Vector<const SubjectChar> subject,
270 int index) {
271 Vector<const PatternChar> pattern = search->pattern_;
272 ASSERT(pattern.length() > 1);
273 int pattern_length = pattern.length();
274 PatternChar pattern_first_char = pattern[0];
275 int i = index;
276 int n = subject.length() - pattern_length;
277 while (i <= n) {
278 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
279 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
280 memchr(subject.start() + i,
281 pattern_first_char,
282 n - i + 1));
283 if (pos == NULL) return -1;
284 i = static_cast<int>(pos - subject.start()) + 1;
Steve Block59151502010-09-22 15:07:15 +0100285 } else {
Ben Murdochf87a2032010-10-22 12:50:53 +0100286 if (subject[i++] != pattern_first_char) continue;
287 }
288 // Loop extracted to separate function to allow using return to do
289 // a deeper break.
290 if (CharCompare(pattern.start() + 1,
291 subject.start() + i,
292 pattern_length - 1)) {
293 return i - 1;
Steve Block59151502010-09-22 15:07:15 +0100294 }
295 }
Steve Block59151502010-09-22 15:07:15 +0100296 return -1;
297}
298
Ben Murdochf87a2032010-10-22 12:50:53 +0100299//---------------------------------------------------------------------
300// Boyer-Moore string search
301//---------------------------------------------------------------------
Steve Block59151502010-09-22 15:07:15 +0100302
Ben Murdochf87a2032010-10-22 12:50:53 +0100303template <typename PatternChar, typename SubjectChar>
304int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
305 StringSearch<PatternChar, SubjectChar>* search,
306 Vector<const SubjectChar> subject,
307 int start_index) {
308 Vector<const PatternChar> pattern = search->pattern_;
309 int subject_length = subject.length();
310 int pattern_length = pattern.length();
Steve Block59151502010-09-22 15:07:15 +0100311 // Only preprocess at most kBMMaxShift last characters of pattern.
Ben Murdochf87a2032010-10-22 12:50:53 +0100312 int start = search->start_;
Steve Block59151502010-09-22 15:07:15 +0100313
Ben Murdochf87a2032010-10-22 12:50:53 +0100314 int* bad_char_occurence = search->bad_char_table();
315 int* good_suffix_shift = search->good_suffix_shift_table();
316
317 PatternChar last_char = pattern[pattern_length - 1];
318 int index = start_index;
Steve Block59151502010-09-22 15:07:15 +0100319 // Continue search from i.
Ben Murdochf87a2032010-10-22 12:50:53 +0100320 while (index <= subject_length - pattern_length) {
321 int j = pattern_length - 1;
322 int c;
323 while (last_char != (c = subject[index + j])) {
324 int shift =
325 j - CharOccurrence(bad_char_occurence, c);
326 index += shift;
327 if (index > subject_length - pattern_length) {
Steve Block59151502010-09-22 15:07:15 +0100328 return -1;
329 }
330 }
Ben Murdochf87a2032010-10-22 12:50:53 +0100331 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
Steve Block59151502010-09-22 15:07:15 +0100332 if (j < 0) {
Ben Murdochf87a2032010-10-22 12:50:53 +0100333 return index;
Steve Block59151502010-09-22 15:07:15 +0100334 } else if (j < start) {
335 // we have matched more than our tables allow us to be smart about.
336 // Fall back on BMH shift.
Ben Murdochf87a2032010-10-22 12:50:53 +0100337 index += pattern_length - 1
338 - CharOccurrence(bad_char_occurence,
339 static_cast<SubjectChar>(last_char));
Steve Block59151502010-09-22 15:07:15 +0100340 } else {
Ben Murdochf87a2032010-10-22 12:50:53 +0100341 int gs_shift = good_suffix_shift[j + 1];
342 int bc_occ =
343 CharOccurrence(bad_char_occurence, c);
Steve Block59151502010-09-22 15:07:15 +0100344 int shift = j - bc_occ;
345 if (gs_shift > shift) {
346 shift = gs_shift;
347 }
Ben Murdochf87a2032010-10-22 12:50:53 +0100348 index += shift;
Steve Block59151502010-09-22 15:07:15 +0100349 }
350 }
351
352 return -1;
353}
354
355
Steve Block59151502010-09-22 15:07:15 +0100356template <typename PatternChar, typename SubjectChar>
Ben Murdochf87a2032010-10-22 12:50:53 +0100357void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
358 int pattern_length = pattern_.length();
359 const PatternChar* pattern = pattern_.start();
360 // Only look at the last kBMMaxShift characters of pattern (from start_
361 // to pattern_length).
362 int start = start_;
363 int length = pattern_length - start;
364
365 // Biased tables so that we can use pattern indices as table indices,
366 // even if we only cover the part of the pattern from offset start.
367 int* shift_table = good_suffix_shift_table();
368 int* suffix_table = this->suffix_table();
369
370 // Initialize table.
371 for (int i = start; i < pattern_length; i++) {
372 shift_table[i] = length;
373 }
374 shift_table[pattern_length] = 1;
375 suffix_table[pattern_length] = pattern_length + 1;
376
377 // Find suffixes.
378 PatternChar last_char = pattern[pattern_length - 1];
379 int suffix = pattern_length + 1;
380 {
381 int i = pattern_length;
382 while (i > start) {
383 PatternChar c = pattern[i - 1];
384 while (suffix <= pattern_length && c != pattern[suffix - 1]) {
385 if (shift_table[suffix] == length) {
386 shift_table[suffix] = suffix - i;
387 }
388 suffix = suffix_table[suffix];
389 }
390 suffix_table[--i] = --suffix;
391 if (suffix == pattern_length) {
392 // No suffix to extend, so we check against last_char only.
393 while ((i > start) && (pattern[i - 1] != last_char)) {
394 if (shift_table[pattern_length] == length) {
395 shift_table[pattern_length] = pattern_length - i;
396 }
397 suffix_table[--i] = pattern_length;
398 }
399 if (i > start) {
400 suffix_table[--i] = --suffix;
401 }
402 }
403 }
404 }
405 // Build shift table using suffixes.
406 if (suffix < pattern_length) {
407 for (int i = start; i <= pattern_length; i++) {
408 if (shift_table[i] == length) {
409 shift_table[i] = suffix - start;
410 }
411 if (i == suffix) {
412 suffix = suffix_table[suffix];
413 }
414 }
415 }
416}
417
418//---------------------------------------------------------------------
419// Boyer-Moore-Horspool string search.
420//---------------------------------------------------------------------
421
422template <typename PatternChar, typename SubjectChar>
423int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
424 StringSearch<PatternChar, SubjectChar>* search,
425 Vector<const SubjectChar> subject,
426 int start_index) {
427 Vector<const PatternChar> pattern = search->pattern_;
428 int subject_length = subject.length();
429 int pattern_length = pattern.length();
430 int* char_occurrences = search->bad_char_table();
431 int badness = -pattern_length;
432
433 // How bad we are doing without a good-suffix table.
434 PatternChar last_char = pattern[pattern_length - 1];
435 int last_char_shift = pattern_length - 1 -
436 CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
437 // Perform search
438 int index = start_index; // No matches found prior to this index.
439 while (index <= subject_length - pattern_length) {
440 int j = pattern_length - 1;
441 int subject_char;
442 while (last_char != (subject_char = subject[index + j])) {
443 int bc_occ = CharOccurrence(char_occurrences, subject_char);
444 int shift = j - bc_occ;
445 index += shift;
446 badness += 1 - shift; // at most zero, so badness cannot increase.
447 if (index > subject_length - pattern_length) {
448 return -1;
449 }
450 }
451 j--;
452 while (j >= 0 && pattern[j] == (subject[index + j])) j--;
453 if (j < 0) {
454 return index;
455 } else {
456 index += last_char_shift;
457 // Badness increases by the number of characters we have
458 // checked, and decreases by the number of characters we
459 // can skip by shifting. It's a measure of how we are doing
460 // compared to reading each character exactly once.
461 badness += (pattern_length - j) - last_char_shift;
462 if (badness > 0) {
463 search->PopulateBoyerMooreTable();
464 search->strategy_ = &BoyerMooreSearch;
465 return BoyerMooreSearch(search, subject, index);
466 }
467 }
468 }
469 return -1;
470}
471
472
473template <typename PatternChar, typename SubjectChar>
474void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
475 int pattern_length = pattern_.length();
476
477 int* bad_char_occurrence = bad_char_table();
478
479 // Only preprocess at most kBMMaxShift last characters of pattern.
480 int start = start_;
481 // Run forwards to populate bad_char_table, so that *last* instance
482 // of character equivalence class is the one registered.
483 // Notice: Doesn't include the last character.
484 int table_size = AlphabetSize();
485 if (start == 0) { // All patterns less than kBMMaxShift in length.
486 memset(bad_char_occurrence,
487 -1,
488 table_size * sizeof(*bad_char_occurrence));
489 } else {
490 for (int i = 0; i < table_size; i++) {
491 bad_char_occurrence[i] = start - 1;
492 }
493 }
494 for (int i = start; i < pattern_length - 1; i++) {
495 PatternChar c = pattern_[i];
496 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
497 bad_char_occurrence[bucket] = i;
498 }
499}
500
501//---------------------------------------------------------------------
502// Linear string search with bailout to BMH.
503//---------------------------------------------------------------------
504
505// Simple linear search for short patterns, which bails out if the string
506// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
507template <typename PatternChar, typename SubjectChar>
508int StringSearch<PatternChar, SubjectChar>::InitialSearch(
509 StringSearch<PatternChar, SubjectChar>* search,
510 Vector<const SubjectChar> subject,
511 int index) {
512 Vector<const PatternChar> pattern = search->pattern_;
Steve Block59151502010-09-22 15:07:15 +0100513 int pattern_length = pattern.length();
514 // Badness is a count of how much work we have done. When we have
515 // done enough work we decide it's probably worth switching to a better
516 // algorithm.
517 int badness = -10 - (pattern_length << 2);
518
519 // We know our pattern is at least 2 characters, we cache the first so
520 // the common case of the first character not matching is faster.
521 PatternChar pattern_first_char = pattern[0];
Ben Murdochf87a2032010-10-22 12:50:53 +0100522 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
Steve Block59151502010-09-22 15:07:15 +0100523 badness++;
Ben Murdochf87a2032010-10-22 12:50:53 +0100524 if (badness <= 0) {
525 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
526 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
527 memchr(subject.start() + i,
528 pattern_first_char,
529 n - i + 1));
530 if (pos == NULL) {
531 return -1;
532 }
533 i = static_cast<int>(pos - subject.start());
534 } else {
535 if (subject[i] != pattern_first_char) continue;
Steve Block59151502010-09-22 15:07:15 +0100536 }
Ben Murdochf87a2032010-10-22 12:50:53 +0100537 int j = 1;
538 do {
539 if (pattern[j] != subject[i + j]) {
540 break;
541 }
542 j++;
543 } while (j < pattern_length);
544 if (j == pattern_length) {
545 return i;
546 }
547 badness += j;
Steve Block59151502010-09-22 15:07:15 +0100548 } else {
Ben Murdochf87a2032010-10-22 12:50:53 +0100549 search->PopulateBoyerMooreHorspoolTable();
550 search->strategy_ = &BoyerMooreHorspoolSearch;
551 return BoyerMooreHorspoolSearch(search, subject, i);
Steve Block59151502010-09-22 15:07:15 +0100552 }
553 }
554 return -1;
555}
556
557
Ben Murdochf87a2032010-10-22 12:50:53 +0100558// Perform a a single stand-alone search.
559// If searching multiple times for the same pattern, a search
560// object should be constructed once and the Search function then called
561// for each search.
Steve Block59151502010-09-22 15:07:15 +0100562template <typename SubjectChar, typename PatternChar>
Ben Murdochf87a2032010-10-22 12:50:53 +0100563static int SearchString(Vector<const SubjectChar> subject,
564 Vector<const PatternChar> pattern,
Steve Block59151502010-09-22 15:07:15 +0100565 int start_index) {
Ben Murdochf87a2032010-10-22 12:50:53 +0100566 StringSearch<PatternChar, SubjectChar> search(pattern);
567 return search.Search(subject, start_index);
Steve Block59151502010-09-22 15:07:15 +0100568}
569
570}} // namespace v8::internal
571
572#endif // V8_STRING_SEARCH_H_