blob: 86237f3aebcb385e510edd49fde2a274e8c9680c [file] [log] [blame]
mstarzinger@chromium.org1b3afd12011-11-29 14:28:56 +00001// Copyright 2011 the V8 project authors. All rights reserved.
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_STRING_SEARCH_H_
29#define V8_STRING_SEARCH_H_
30
31namespace v8 {
32namespace internal {
33
34
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +000035//---------------------------------------------------------------------
36// String Search object.
37//---------------------------------------------------------------------
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +000038
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +000039// Class holding constants and methods that apply to all string search variants,
40// independently of subject and pattern char size.
41class StringSearchBase {
42 protected:
43 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
44 // limit, we can fix the size of tables. For a needle longer than this limit,
45 // search will not be optimal, since we only build tables for a suffix
46 // of the string, but it is a safe approximation.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +000047 static const int kBMMaxShift = Isolate::kBMMaxShift;
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +000048
49 // Reduce alphabet to this size.
50 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
51 // proportional to the input alphabet. We reduce the alphabet size by
52 // equating input characters modulo a smaller alphabet size. This gives
53 // a potentially less efficient searching, but is a safe approximation.
54 // For needles using only characters in the same Unicode 256-code point page,
55 // there is no search speed degradation.
jkummerow@chromium.org59297c72013-01-09 16:32:23 +000056#ifndef ENABLE_LATIN_1
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +000057 static const int kAsciiAlphabetSize = 128;
jkummerow@chromium.org59297c72013-01-09 16:32:23 +000058#else
59 static const int kAsciiAlphabetSize = 256;
60#endif
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +000061 static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +000062
63 // Bad-char shift table stored in the state. It's length is the alphabet size.
64 // For patterns below this length, the skip length of Boyer-Moore is too short
65 // to compensate for the algorithmic overhead compared to simple brute force.
66 static const int kBMMinPatternLength = 7;
67
jkummerow@chromium.org59297c72013-01-09 16:32:23 +000068 static inline bool IsOneByteString(Vector<const uint8_t> string) {
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +000069 return true;
70 }
71
jkummerow@chromium.org59297c72013-01-09 16:32:23 +000072 static inline bool IsOneByteString(Vector<const uc16> string) {
73 return String::IsOneByte(string.start(), string.length());
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +000074 }
75
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +000076 friend class Isolate;
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +000077};
78
79
80template <typename PatternChar, typename SubjectChar>
81class StringSearch : private StringSearchBase {
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +000082 public:
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +000083 StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
84 : isolate_(isolate),
85 pattern_(pattern),
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +000086 start_(Max(0, pattern.length() - kBMMaxShift)) {
87 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +000088 if (!IsOneByteString(pattern_)) {
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +000089 strategy_ = &FailSearch;
90 return;
91 }
92 }
93 int pattern_length = pattern_.length();
94 if (pattern_length < kBMMinPatternLength) {
95 if (pattern_length == 1) {
96 strategy_ = &SingleCharSearch;
97 return;
98 }
99 strategy_ = &LinearSearch;
100 return;
101 }
102 strategy_ = &InitialSearch;
103 }
104
105 int Search(Vector<const SubjectChar> subject, int index) {
106 return strategy_(this, subject, index);
107 }
108
109 static inline int AlphabetSize() {
110 if (sizeof(PatternChar) == 1) {
111 // ASCII needle.
112 return kAsciiAlphabetSize;
113 } else {
114 ASSERT(sizeof(PatternChar) == 2);
115 // UC16 needle.
116 return kUC16AlphabetSize;
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000117 }
118 }
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000119
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000120 private:
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000121 typedef int (*SearchFunction)( // NOLINT - it's not a cast!
122 StringSearch<PatternChar, SubjectChar>*,
123 Vector<const SubjectChar>,
124 int);
125
126 static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
127 Vector<const SubjectChar>,
128 int) {
129 return -1;
130 }
131
132 static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
133 Vector<const SubjectChar> subject,
134 int start_index);
135
136 static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
137 Vector<const SubjectChar> subject,
138 int start_index);
139
140 static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
141 Vector<const SubjectChar> subject,
142 int start_index);
143
144 static int BoyerMooreHorspoolSearch(
145 StringSearch<PatternChar, SubjectChar>* search,
146 Vector<const SubjectChar> subject,
147 int start_index);
148
149 static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
150 Vector<const SubjectChar> subject,
151 int start_index);
152
153 void PopulateBoyerMooreHorspoolTable();
154
155 void PopulateBoyerMooreTable();
156
jkummerow@chromium.org59297c72013-01-09 16:32:23 +0000157 static inline bool exceedsOneByte(uint8_t c) {
158#ifdef ENABLE_LATIN_1
159 return false;
160#else
161 return c > String::kMaxOneByteCharCodeU;
162#endif
163 }
164
165 static inline bool exceedsOneByte(uint16_t c) {
166 return c > String::kMaxOneByteCharCodeU;
167 }
168
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000169 static inline int CharOccurrence(int* bad_char_occurrence,
170 SubjectChar char_code) {
171 if (sizeof(SubjectChar) == 1) {
172 return bad_char_occurrence[static_cast<int>(char_code)];
173 }
174 if (sizeof(PatternChar) == 1) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +0000175 if (exceedsOneByte(char_code)) {
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000176 return -1;
177 }
kmillikin@chromium.orgec7067e2010-10-01 09:22:58 +0000178 return bad_char_occurrence[static_cast<unsigned int>(char_code)];
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000179 }
kmillikin@chromium.orgec7067e2010-10-01 09:22:58 +0000180 // Both pattern and subject are UC16. Reduce character to equivalence class.
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000181 int equiv_class = char_code % kUC16AlphabetSize;
182 return bad_char_occurrence[equiv_class];
183 }
184
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000185 // The following tables are shared by all searches.
186 // TODO(lrn): Introduce a way for a pattern to keep its tables
187 // between searches (e.g., for an Atom RegExp).
188
189 // Store for the BoyerMoore(Horspool) bad char shift table.
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000190 // Return a table covering the last kBMMaxShift+1 positions of
191 // pattern.
192 int* bad_char_table() {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000193 return isolate_->bad_char_shift_table();
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000194 }
195
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000196 // Store for the BoyerMoore good suffix shift table.
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000197 int* good_suffix_shift_table() {
198 // Return biased pointer that maps the range [start_..pattern_.length()
199 // to the kGoodSuffixShiftTable array.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000200 return isolate_->good_suffix_shift_table() - start_;
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000201 }
202
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000203 // Table used temporarily while building the BoyerMoore good suffix
204 // shift table.
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000205 int* suffix_table() {
206 // Return biased pointer that maps the range [start_..pattern_.length()
207 // to the kSuffixTable array.
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000208 return isolate_->suffix_table() - start_;
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000209 }
210
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000211 Isolate* isolate_;
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000212 // The pattern to search for.
213 Vector<const PatternChar> pattern_;
214 // Pointer to implementation of the search.
215 SearchFunction strategy_;
216 // Cache value of Max(0, pattern_length() - kBMMaxShift)
217 int start_;
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000218};
219
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000220
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000221//---------------------------------------------------------------------
222// Single Character Pattern Search Strategy
223//---------------------------------------------------------------------
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000224
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000225template <typename PatternChar, typename SubjectChar>
226int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
227 StringSearch<PatternChar, SubjectChar>* search,
228 Vector<const SubjectChar> subject,
229 int index) {
230 ASSERT_EQ(1, search->pattern_.length());
231 PatternChar pattern_first_char = search->pattern_[0];
232 int i = index;
233 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
234 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
235 memchr(subject.start() + i,
236 pattern_first_char,
237 subject.length() - i));
238 if (pos == NULL) return -1;
239 return static_cast<int>(pos - subject.start());
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000240 } else {
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000241 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
jkummerow@chromium.org59297c72013-01-09 16:32:23 +0000242 if (exceedsOneByte(pattern_first_char)) {
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000243 return -1;
244 }
245 }
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000246 SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
247 int n = subject.length();
248 while (i < n) {
249 if (subject[i++] == search_char) return i - 1;
250 }
251 return -1;
252 }
253}
254
255//---------------------------------------------------------------------
256// Linear Search Strategy
257//---------------------------------------------------------------------
258
259
260template <typename PatternChar, typename SubjectChar>
mstarzinger@chromium.org1b3afd12011-11-29 14:28:56 +0000261inline bool CharCompare(const PatternChar* pattern,
262 const SubjectChar* subject,
263 int length) {
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000264 ASSERT(length > 0);
265 int pos = 0;
266 do {
267 if (pattern[pos] != subject[pos]) {
268 return false;
269 }
270 pos++;
271 } while (pos < length);
272 return true;
273}
274
275
276// Simple linear search for short patterns. Never bails out.
277template <typename PatternChar, typename SubjectChar>
278int StringSearch<PatternChar, SubjectChar>::LinearSearch(
279 StringSearch<PatternChar, SubjectChar>* search,
280 Vector<const SubjectChar> subject,
281 int index) {
282 Vector<const PatternChar> pattern = search->pattern_;
283 ASSERT(pattern.length() > 1);
284 int pattern_length = pattern.length();
285 PatternChar pattern_first_char = pattern[0];
286 int i = index;
287 int n = subject.length() - pattern_length;
288 while (i <= n) {
289 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
290 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
291 memchr(subject.start() + i,
292 pattern_first_char,
293 n - i + 1));
294 if (pos == NULL) return -1;
295 i = static_cast<int>(pos - subject.start()) + 1;
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000296 } else {
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000297 if (subject[i++] != pattern_first_char) continue;
298 }
299 // Loop extracted to separate function to allow using return to do
300 // a deeper break.
301 if (CharCompare(pattern.start() + 1,
302 subject.start() + i,
303 pattern_length - 1)) {
304 return i - 1;
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000305 }
306 }
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000307 return -1;
308}
309
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000310//---------------------------------------------------------------------
311// Boyer-Moore string search
312//---------------------------------------------------------------------
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000313
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000314template <typename PatternChar, typename SubjectChar>
315int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
316 StringSearch<PatternChar, SubjectChar>* search,
317 Vector<const SubjectChar> subject,
318 int start_index) {
319 Vector<const PatternChar> pattern = search->pattern_;
320 int subject_length = subject.length();
321 int pattern_length = pattern.length();
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000322 // Only preprocess at most kBMMaxShift last characters of pattern.
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000323 int start = search->start_;
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000324
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000325 int* bad_char_occurence = search->bad_char_table();
326 int* good_suffix_shift = search->good_suffix_shift_table();
327
328 PatternChar last_char = pattern[pattern_length - 1];
329 int index = start_index;
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000330 // Continue search from i.
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000331 while (index <= subject_length - pattern_length) {
332 int j = pattern_length - 1;
333 int c;
334 while (last_char != (c = subject[index + j])) {
335 int shift =
336 j - CharOccurrence(bad_char_occurence, c);
337 index += shift;
338 if (index > subject_length - pattern_length) {
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000339 return -1;
340 }
341 }
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000342 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000343 if (j < 0) {
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000344 return index;
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000345 } else if (j < start) {
346 // we have matched more than our tables allow us to be smart about.
347 // Fall back on BMH shift.
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000348 index += pattern_length - 1
349 - CharOccurrence(bad_char_occurence,
350 static_cast<SubjectChar>(last_char));
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000351 } else {
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000352 int gs_shift = good_suffix_shift[j + 1];
353 int bc_occ =
354 CharOccurrence(bad_char_occurence, c);
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000355 int shift = j - bc_occ;
356 if (gs_shift > shift) {
357 shift = gs_shift;
358 }
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000359 index += shift;
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000360 }
361 }
362
363 return -1;
364}
365
366
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000367template <typename PatternChar, typename SubjectChar>
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000368void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
369 int pattern_length = pattern_.length();
370 const PatternChar* pattern = pattern_.start();
371 // Only look at the last kBMMaxShift characters of pattern (from start_
372 // to pattern_length).
373 int start = start_;
374 int length = pattern_length - start;
375
376 // Biased tables so that we can use pattern indices as table indices,
377 // even if we only cover the part of the pattern from offset start.
378 int* shift_table = good_suffix_shift_table();
379 int* suffix_table = this->suffix_table();
380
381 // Initialize table.
382 for (int i = start; i < pattern_length; i++) {
383 shift_table[i] = length;
384 }
385 shift_table[pattern_length] = 1;
386 suffix_table[pattern_length] = pattern_length + 1;
387
danno@chromium.orgbf0c8202011-12-27 10:09:42 +0000388 if (pattern_length <= start) {
389 return;
390 }
391
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000392 // Find suffixes.
393 PatternChar last_char = pattern[pattern_length - 1];
394 int suffix = pattern_length + 1;
395 {
396 int i = pattern_length;
397 while (i > start) {
398 PatternChar c = pattern[i - 1];
399 while (suffix <= pattern_length && c != pattern[suffix - 1]) {
400 if (shift_table[suffix] == length) {
401 shift_table[suffix] = suffix - i;
402 }
403 suffix = suffix_table[suffix];
404 }
405 suffix_table[--i] = --suffix;
406 if (suffix == pattern_length) {
407 // No suffix to extend, so we check against last_char only.
408 while ((i > start) && (pattern[i - 1] != last_char)) {
409 if (shift_table[pattern_length] == length) {
410 shift_table[pattern_length] = pattern_length - i;
411 }
412 suffix_table[--i] = pattern_length;
413 }
414 if (i > start) {
415 suffix_table[--i] = --suffix;
416 }
417 }
418 }
419 }
420 // Build shift table using suffixes.
421 if (suffix < pattern_length) {
422 for (int i = start; i <= pattern_length; i++) {
423 if (shift_table[i] == length) {
424 shift_table[i] = suffix - start;
425 }
426 if (i == suffix) {
427 suffix = suffix_table[suffix];
428 }
429 }
430 }
431}
432
433//---------------------------------------------------------------------
434// Boyer-Moore-Horspool string search.
435//---------------------------------------------------------------------
436
437template <typename PatternChar, typename SubjectChar>
438int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
439 StringSearch<PatternChar, SubjectChar>* search,
440 Vector<const SubjectChar> subject,
441 int start_index) {
442 Vector<const PatternChar> pattern = search->pattern_;
443 int subject_length = subject.length();
444 int pattern_length = pattern.length();
445 int* char_occurrences = search->bad_char_table();
446 int badness = -pattern_length;
447
448 // How bad we are doing without a good-suffix table.
449 PatternChar last_char = pattern[pattern_length - 1];
450 int last_char_shift = pattern_length - 1 -
451 CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
452 // Perform search
453 int index = start_index; // No matches found prior to this index.
454 while (index <= subject_length - pattern_length) {
455 int j = pattern_length - 1;
456 int subject_char;
457 while (last_char != (subject_char = subject[index + j])) {
458 int bc_occ = CharOccurrence(char_occurrences, subject_char);
459 int shift = j - bc_occ;
460 index += shift;
461 badness += 1 - shift; // at most zero, so badness cannot increase.
462 if (index > subject_length - pattern_length) {
463 return -1;
464 }
465 }
466 j--;
467 while (j >= 0 && pattern[j] == (subject[index + j])) j--;
468 if (j < 0) {
469 return index;
470 } else {
471 index += last_char_shift;
472 // Badness increases by the number of characters we have
473 // checked, and decreases by the number of characters we
474 // can skip by shifting. It's a measure of how we are doing
475 // compared to reading each character exactly once.
476 badness += (pattern_length - j) - last_char_shift;
477 if (badness > 0) {
478 search->PopulateBoyerMooreTable();
479 search->strategy_ = &BoyerMooreSearch;
480 return BoyerMooreSearch(search, subject, index);
481 }
482 }
483 }
484 return -1;
485}
486
487
488template <typename PatternChar, typename SubjectChar>
489void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
490 int pattern_length = pattern_.length();
491
492 int* bad_char_occurrence = bad_char_table();
493
494 // Only preprocess at most kBMMaxShift last characters of pattern.
495 int start = start_;
496 // Run forwards to populate bad_char_table, so that *last* instance
497 // of character equivalence class is the one registered.
498 // Notice: Doesn't include the last character.
499 int table_size = AlphabetSize();
500 if (start == 0) { // All patterns less than kBMMaxShift in length.
501 memset(bad_char_occurrence,
502 -1,
503 table_size * sizeof(*bad_char_occurrence));
504 } else {
505 for (int i = 0; i < table_size; i++) {
506 bad_char_occurrence[i] = start - 1;
507 }
508 }
509 for (int i = start; i < pattern_length - 1; i++) {
510 PatternChar c = pattern_[i];
511 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
512 bad_char_occurrence[bucket] = i;
513 }
514}
515
516//---------------------------------------------------------------------
517// Linear string search with bailout to BMH.
518//---------------------------------------------------------------------
519
520// Simple linear search for short patterns, which bails out if the string
521// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
522template <typename PatternChar, typename SubjectChar>
523int StringSearch<PatternChar, SubjectChar>::InitialSearch(
524 StringSearch<PatternChar, SubjectChar>* search,
525 Vector<const SubjectChar> subject,
526 int index) {
527 Vector<const PatternChar> pattern = search->pattern_;
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000528 int pattern_length = pattern.length();
529 // Badness is a count of how much work we have done. When we have
530 // done enough work we decide it's probably worth switching to a better
531 // algorithm.
532 int badness = -10 - (pattern_length << 2);
533
534 // We know our pattern is at least 2 characters, we cache the first so
535 // the common case of the first character not matching is faster.
536 PatternChar pattern_first_char = pattern[0];
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000537 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000538 badness++;
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000539 if (badness <= 0) {
540 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
541 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
542 memchr(subject.start() + i,
543 pattern_first_char,
544 n - i + 1));
545 if (pos == NULL) {
546 return -1;
547 }
548 i = static_cast<int>(pos - subject.start());
549 } else {
550 if (subject[i] != pattern_first_char) continue;
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000551 }
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000552 int j = 1;
553 do {
554 if (pattern[j] != subject[i + j]) {
555 break;
556 }
557 j++;
558 } while (j < pattern_length);
559 if (j == pattern_length) {
560 return i;
561 }
562 badness += j;
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000563 } else {
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000564 search->PopulateBoyerMooreHorspoolTable();
565 search->strategy_ = &BoyerMooreHorspoolSearch;
566 return BoyerMooreHorspoolSearch(search, subject, i);
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000567 }
568 }
569 return -1;
570}
571
572
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000573// Perform a a single stand-alone search.
574// If searching multiple times for the same pattern, a search
575// object should be constructed once and the Search function then called
576// for each search.
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000577template <typename SubjectChar, typename PatternChar>
mstarzinger@chromium.org1b3afd12011-11-29 14:28:56 +0000578int SearchString(Isolate* isolate,
579 Vector<const SubjectChar> subject,
580 Vector<const PatternChar> pattern,
581 int start_index) {
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000582 StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
kmillikin@chromium.orgf05f2912010-09-30 10:07:24 +0000583 return search.Search(subject, start_index);
sgjesse@chromium.org2ec107f2010-09-13 09:19:46 +0000584}
585
586}} // namespace v8::internal
587
588#endif // V8_STRING_SEARCH_H_