Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 1 | // Copyright 2011 the V8 project authors. All rights reserved. |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 4 | |
| 5 | #ifndef V8_STRING_SEARCH_H_ |
| 6 | #define V8_STRING_SEARCH_H_ |
| 7 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 8 | #include "src/isolate.h" |
| 9 | #include "src/vector.h" |
| 10 | |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 11 | namespace v8 { |
| 12 | namespace internal { |
| 13 | |
| 14 | |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 15 | //--------------------------------------------------------------------- |
| 16 | // String Search object. |
| 17 | //--------------------------------------------------------------------- |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 18 | |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 19 | // Class holding constants and methods that apply to all string search variants, |
| 20 | // independently of subject and pattern char size. |
| 21 | class StringSearchBase { |
| 22 | protected: |
| 23 | // Cap on the maximal shift in the Boyer-Moore implementation. By setting a |
| 24 | // limit, we can fix the size of tables. For a needle longer than this limit, |
| 25 | // search will not be optimal, since we only build tables for a suffix |
| 26 | // of the string, but it is a safe approximation. |
Steve Block | 44f0eee | 2011-05-26 01:26:41 +0100 | [diff] [blame] | 27 | static const int kBMMaxShift = Isolate::kBMMaxShift; |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 28 | |
| 29 | // Reduce alphabet to this size. |
| 30 | // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size |
| 31 | // proportional to the input alphabet. We reduce the alphabet size by |
| 32 | // equating input characters modulo a smaller alphabet size. This gives |
| 33 | // a potentially less efficient searching, but is a safe approximation. |
| 34 | // For needles using only characters in the same Unicode 256-code point page, |
| 35 | // there is no search speed degradation. |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 36 | static const int kLatin1AlphabetSize = 256; |
Steve Block | 44f0eee | 2011-05-26 01:26:41 +0100 | [diff] [blame] | 37 | static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize; |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 38 | |
| 39 | // Bad-char shift table stored in the state. It's length is the alphabet size. |
| 40 | // For patterns below this length, the skip length of Boyer-Moore is too short |
| 41 | // to compensate for the algorithmic overhead compared to simple brute force. |
| 42 | static const int kBMMinPatternLength = 7; |
| 43 | |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 44 | static inline bool IsOneByteString(Vector<const uint8_t> string) { |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 45 | return true; |
| 46 | } |
| 47 | |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 48 | static inline bool IsOneByteString(Vector<const uc16> string) { |
| 49 | return String::IsOneByte(string.start(), string.length()); |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 50 | } |
| 51 | |
Steve Block | 44f0eee | 2011-05-26 01:26:41 +0100 | [diff] [blame] | 52 | friend class Isolate; |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 53 | }; |
| 54 | |
| 55 | |
| 56 | template <typename PatternChar, typename SubjectChar> |
| 57 | class StringSearch : private StringSearchBase { |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 58 | public: |
Steve Block | 44f0eee | 2011-05-26 01:26:41 +0100 | [diff] [blame] | 59 | StringSearch(Isolate* isolate, Vector<const PatternChar> pattern) |
| 60 | : isolate_(isolate), |
| 61 | pattern_(pattern), |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 62 | start_(Max(0, pattern.length() - kBMMaxShift)) { |
| 63 | if (sizeof(PatternChar) > sizeof(SubjectChar)) { |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 64 | if (!IsOneByteString(pattern_)) { |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 65 | strategy_ = &FailSearch; |
| 66 | return; |
| 67 | } |
| 68 | } |
| 69 | int pattern_length = pattern_.length(); |
| 70 | if (pattern_length < kBMMinPatternLength) { |
| 71 | if (pattern_length == 1) { |
| 72 | strategy_ = &SingleCharSearch; |
| 73 | return; |
| 74 | } |
| 75 | strategy_ = &LinearSearch; |
| 76 | return; |
| 77 | } |
| 78 | strategy_ = &InitialSearch; |
| 79 | } |
| 80 | |
| 81 | int Search(Vector<const SubjectChar> subject, int index) { |
| 82 | return strategy_(this, subject, index); |
| 83 | } |
| 84 | |
| 85 | static inline int AlphabetSize() { |
| 86 | if (sizeof(PatternChar) == 1) { |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 87 | // Latin1 needle. |
| 88 | return kLatin1AlphabetSize; |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 89 | } else { |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 90 | DCHECK(sizeof(PatternChar) == 2); |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 91 | // UC16 needle. |
| 92 | return kUC16AlphabetSize; |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 93 | } |
| 94 | } |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 95 | |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 96 | private: |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 97 | typedef int (*SearchFunction)( // NOLINT - it's not a cast! |
| 98 | StringSearch<PatternChar, SubjectChar>*, |
| 99 | Vector<const SubjectChar>, |
| 100 | int); |
| 101 | |
| 102 | static int FailSearch(StringSearch<PatternChar, SubjectChar>*, |
| 103 | Vector<const SubjectChar>, |
| 104 | int) { |
| 105 | return -1; |
| 106 | } |
| 107 | |
| 108 | static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, |
| 109 | Vector<const SubjectChar> subject, |
| 110 | int start_index); |
| 111 | |
| 112 | static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, |
| 113 | Vector<const SubjectChar> subject, |
| 114 | int start_index); |
| 115 | |
| 116 | static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, |
| 117 | Vector<const SubjectChar> subject, |
| 118 | int start_index); |
| 119 | |
| 120 | static int BoyerMooreHorspoolSearch( |
| 121 | StringSearch<PatternChar, SubjectChar>* search, |
| 122 | Vector<const SubjectChar> subject, |
| 123 | int start_index); |
| 124 | |
| 125 | static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, |
| 126 | Vector<const SubjectChar> subject, |
| 127 | int start_index); |
| 128 | |
| 129 | void PopulateBoyerMooreHorspoolTable(); |
| 130 | |
| 131 | void PopulateBoyerMooreTable(); |
| 132 | |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 133 | static inline bool exceedsOneByte(uint8_t c) { |
| 134 | return false; |
| 135 | } |
| 136 | |
| 137 | static inline bool exceedsOneByte(uint16_t c) { |
| 138 | return c > String::kMaxOneByteCharCodeU; |
| 139 | } |
| 140 | |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 141 | static inline int CharOccurrence(int* bad_char_occurrence, |
| 142 | SubjectChar char_code) { |
| 143 | if (sizeof(SubjectChar) == 1) { |
| 144 | return bad_char_occurrence[static_cast<int>(char_code)]; |
| 145 | } |
| 146 | if (sizeof(PatternChar) == 1) { |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 147 | if (exceedsOneByte(char_code)) { |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 148 | return -1; |
| 149 | } |
| 150 | return bad_char_occurrence[static_cast<unsigned int>(char_code)]; |
| 151 | } |
| 152 | // Both pattern and subject are UC16. Reduce character to equivalence class. |
| 153 | int equiv_class = char_code % kUC16AlphabetSize; |
| 154 | return bad_char_occurrence[equiv_class]; |
| 155 | } |
| 156 | |
Steve Block | 44f0eee | 2011-05-26 01:26:41 +0100 | [diff] [blame] | 157 | // The following tables are shared by all searches. |
| 158 | // TODO(lrn): Introduce a way for a pattern to keep its tables |
| 159 | // between searches (e.g., for an Atom RegExp). |
| 160 | |
| 161 | // Store for the BoyerMoore(Horspool) bad char shift table. |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 162 | // Return a table covering the last kBMMaxShift+1 positions of |
| 163 | // pattern. |
| 164 | int* bad_char_table() { |
Steve Block | 44f0eee | 2011-05-26 01:26:41 +0100 | [diff] [blame] | 165 | return isolate_->bad_char_shift_table(); |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 166 | } |
| 167 | |
Steve Block | 44f0eee | 2011-05-26 01:26:41 +0100 | [diff] [blame] | 168 | // Store for the BoyerMoore good suffix shift table. |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 169 | int* good_suffix_shift_table() { |
| 170 | // Return biased pointer that maps the range [start_..pattern_.length() |
| 171 | // to the kGoodSuffixShiftTable array. |
Steve Block | 44f0eee | 2011-05-26 01:26:41 +0100 | [diff] [blame] | 172 | return isolate_->good_suffix_shift_table() - start_; |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 173 | } |
| 174 | |
Steve Block | 44f0eee | 2011-05-26 01:26:41 +0100 | [diff] [blame] | 175 | // Table used temporarily while building the BoyerMoore good suffix |
| 176 | // shift table. |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 177 | int* suffix_table() { |
| 178 | // Return biased pointer that maps the range [start_..pattern_.length() |
| 179 | // to the kSuffixTable array. |
Steve Block | 44f0eee | 2011-05-26 01:26:41 +0100 | [diff] [blame] | 180 | return isolate_->suffix_table() - start_; |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 181 | } |
| 182 | |
Steve Block | 44f0eee | 2011-05-26 01:26:41 +0100 | [diff] [blame] | 183 | Isolate* isolate_; |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 184 | // The pattern to search for. |
| 185 | Vector<const PatternChar> pattern_; |
| 186 | // Pointer to implementation of the search. |
| 187 | SearchFunction strategy_; |
| 188 | // Cache value of Max(0, pattern_length() - kBMMaxShift) |
| 189 | int start_; |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 190 | }; |
| 191 | |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 192 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 193 | template <typename T, typename U> |
| 194 | inline T AlignDown(T value, U alignment) { |
| 195 | return reinterpret_cast<T>( |
| 196 | (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); |
| 197 | } |
| 198 | |
| 199 | |
| 200 | inline uint8_t GetHighestValueByte(uc16 character) { |
| 201 | return Max(static_cast<uint8_t>(character & 0xFF), |
| 202 | static_cast<uint8_t>(character >> 8)); |
| 203 | } |
| 204 | |
| 205 | |
| 206 | inline uint8_t GetHighestValueByte(uint8_t character) { return character; } |
| 207 | |
| 208 | |
| 209 | template <typename PatternChar, typename SubjectChar> |
| 210 | inline int FindFirstCharacter(Vector<const PatternChar> pattern, |
| 211 | Vector<const SubjectChar> subject, int index) { |
| 212 | const PatternChar pattern_first_char = pattern[0]; |
| 213 | const int max_n = (subject.length() - pattern.length() + 1); |
| 214 | |
| 215 | const uint8_t search_byte = GetHighestValueByte(pattern_first_char); |
| 216 | const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); |
| 217 | int pos = index; |
| 218 | do { |
| 219 | DCHECK_GE(max_n - pos, 0); |
| 220 | const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>( |
| 221 | memchr(subject.start() + pos, search_byte, |
| 222 | (max_n - pos) * sizeof(SubjectChar))); |
| 223 | if (char_pos == NULL) return -1; |
| 224 | char_pos = AlignDown(char_pos, sizeof(SubjectChar)); |
| 225 | pos = static_cast<int>(char_pos - subject.start()); |
| 226 | if (subject[pos] == search_char) return pos; |
| 227 | } while (++pos < max_n); |
| 228 | |
| 229 | return -1; |
| 230 | } |
| 231 | |
| 232 | |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 233 | //--------------------------------------------------------------------- |
| 234 | // Single Character Pattern Search Strategy |
| 235 | //--------------------------------------------------------------------- |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 236 | |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 237 | template <typename PatternChar, typename SubjectChar> |
| 238 | int StringSearch<PatternChar, SubjectChar>::SingleCharSearch( |
| 239 | StringSearch<PatternChar, SubjectChar>* search, |
| 240 | Vector<const SubjectChar> subject, |
| 241 | int index) { |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 242 | DCHECK_EQ(1, search->pattern_.length()); |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 243 | PatternChar pattern_first_char = search->pattern_[0]; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 244 | if (sizeof(PatternChar) > sizeof(SubjectChar)) { |
| 245 | if (exceedsOneByte(pattern_first_char)) { |
| 246 | return -1; |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 247 | } |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 248 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 249 | return FindFirstCharacter(search->pattern_, subject, index); |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 250 | } |
| 251 | |
| 252 | //--------------------------------------------------------------------- |
| 253 | // Linear Search Strategy |
| 254 | //--------------------------------------------------------------------- |
| 255 | |
| 256 | |
| 257 | template <typename PatternChar, typename SubjectChar> |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 258 | inline bool CharCompare(const PatternChar* pattern, |
| 259 | const SubjectChar* subject, |
| 260 | int length) { |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 261 | DCHECK(length > 0); |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 262 | int pos = 0; |
| 263 | do { |
| 264 | if (pattern[pos] != subject[pos]) { |
| 265 | return false; |
| 266 | } |
| 267 | pos++; |
| 268 | } while (pos < length); |
| 269 | return true; |
| 270 | } |
| 271 | |
| 272 | |
| 273 | // Simple linear search for short patterns. Never bails out. |
| 274 | template <typename PatternChar, typename SubjectChar> |
| 275 | int StringSearch<PatternChar, SubjectChar>::LinearSearch( |
| 276 | StringSearch<PatternChar, SubjectChar>* search, |
| 277 | Vector<const SubjectChar> subject, |
| 278 | int index) { |
| 279 | Vector<const PatternChar> pattern = search->pattern_; |
Ben Murdoch | b8a8cc1 | 2014-11-26 15:28:44 +0000 | [diff] [blame] | 280 | DCHECK(pattern.length() > 1); |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 281 | int pattern_length = pattern.length(); |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 282 | int i = index; |
| 283 | int n = subject.length() - pattern_length; |
| 284 | while (i <= n) { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 285 | i = FindFirstCharacter(pattern, subject, i); |
| 286 | if (i == -1) return -1; |
| 287 | DCHECK_LE(i, n); |
| 288 | i++; |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 289 | // Loop extracted to separate function to allow using return to do |
| 290 | // a deeper break. |
| 291 | if (CharCompare(pattern.start() + 1, |
| 292 | subject.start() + i, |
| 293 | pattern_length - 1)) { |
| 294 | return i - 1; |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 295 | } |
| 296 | } |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 297 | return -1; |
| 298 | } |
| 299 | |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 300 | //--------------------------------------------------------------------- |
| 301 | // Boyer-Moore string search |
| 302 | //--------------------------------------------------------------------- |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 303 | |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 304 | template <typename PatternChar, typename SubjectChar> |
| 305 | int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( |
| 306 | StringSearch<PatternChar, SubjectChar>* search, |
| 307 | Vector<const SubjectChar> subject, |
| 308 | int start_index) { |
| 309 | Vector<const PatternChar> pattern = search->pattern_; |
| 310 | int subject_length = subject.length(); |
| 311 | int pattern_length = pattern.length(); |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 312 | // Only preprocess at most kBMMaxShift last characters of pattern. |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 313 | int start = search->start_; |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 314 | |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 315 | int* bad_char_occurence = search->bad_char_table(); |
| 316 | int* good_suffix_shift = search->good_suffix_shift_table(); |
| 317 | |
| 318 | PatternChar last_char = pattern[pattern_length - 1]; |
| 319 | int index = start_index; |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 320 | // Continue search from i. |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 321 | while (index <= subject_length - pattern_length) { |
| 322 | int j = pattern_length - 1; |
| 323 | int c; |
| 324 | while (last_char != (c = subject[index + j])) { |
| 325 | int shift = |
| 326 | j - CharOccurrence(bad_char_occurence, c); |
| 327 | index += shift; |
| 328 | if (index > subject_length - pattern_length) { |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 329 | return -1; |
| 330 | } |
| 331 | } |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 332 | while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 333 | if (j < 0) { |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 334 | return index; |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 335 | } else if (j < start) { |
| 336 | // we have matched more than our tables allow us to be smart about. |
| 337 | // Fall back on BMH shift. |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 338 | index += pattern_length - 1 |
| 339 | - CharOccurrence(bad_char_occurence, |
| 340 | static_cast<SubjectChar>(last_char)); |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 341 | } else { |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 342 | int gs_shift = good_suffix_shift[j + 1]; |
| 343 | int bc_occ = |
| 344 | CharOccurrence(bad_char_occurence, c); |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 345 | int shift = j - bc_occ; |
| 346 | if (gs_shift > shift) { |
| 347 | shift = gs_shift; |
| 348 | } |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 349 | index += shift; |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 350 | } |
| 351 | } |
| 352 | |
| 353 | return -1; |
| 354 | } |
| 355 | |
| 356 | |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 357 | template <typename PatternChar, typename SubjectChar> |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 358 | void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { |
| 359 | int pattern_length = pattern_.length(); |
| 360 | const PatternChar* pattern = pattern_.start(); |
| 361 | // Only look at the last kBMMaxShift characters of pattern (from start_ |
| 362 | // to pattern_length). |
| 363 | int start = start_; |
| 364 | int length = pattern_length - start; |
| 365 | |
| 366 | // Biased tables so that we can use pattern indices as table indices, |
| 367 | // even if we only cover the part of the pattern from offset start. |
| 368 | int* shift_table = good_suffix_shift_table(); |
| 369 | int* suffix_table = this->suffix_table(); |
| 370 | |
| 371 | // Initialize table. |
| 372 | for (int i = start; i < pattern_length; i++) { |
| 373 | shift_table[i] = length; |
| 374 | } |
| 375 | shift_table[pattern_length] = 1; |
| 376 | suffix_table[pattern_length] = pattern_length + 1; |
| 377 | |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 378 | if (pattern_length <= start) { |
| 379 | return; |
| 380 | } |
| 381 | |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 382 | // Find suffixes. |
| 383 | PatternChar last_char = pattern[pattern_length - 1]; |
| 384 | int suffix = pattern_length + 1; |
| 385 | { |
| 386 | int i = pattern_length; |
| 387 | while (i > start) { |
| 388 | PatternChar c = pattern[i - 1]; |
| 389 | while (suffix <= pattern_length && c != pattern[suffix - 1]) { |
| 390 | if (shift_table[suffix] == length) { |
| 391 | shift_table[suffix] = suffix - i; |
| 392 | } |
| 393 | suffix = suffix_table[suffix]; |
| 394 | } |
| 395 | suffix_table[--i] = --suffix; |
| 396 | if (suffix == pattern_length) { |
| 397 | // No suffix to extend, so we check against last_char only. |
| 398 | while ((i > start) && (pattern[i - 1] != last_char)) { |
| 399 | if (shift_table[pattern_length] == length) { |
| 400 | shift_table[pattern_length] = pattern_length - i; |
| 401 | } |
| 402 | suffix_table[--i] = pattern_length; |
| 403 | } |
| 404 | if (i > start) { |
| 405 | suffix_table[--i] = --suffix; |
| 406 | } |
| 407 | } |
| 408 | } |
| 409 | } |
| 410 | // Build shift table using suffixes. |
| 411 | if (suffix < pattern_length) { |
| 412 | for (int i = start; i <= pattern_length; i++) { |
| 413 | if (shift_table[i] == length) { |
| 414 | shift_table[i] = suffix - start; |
| 415 | } |
| 416 | if (i == suffix) { |
| 417 | suffix = suffix_table[suffix]; |
| 418 | } |
| 419 | } |
| 420 | } |
| 421 | } |
| 422 | |
| 423 | //--------------------------------------------------------------------- |
| 424 | // Boyer-Moore-Horspool string search. |
| 425 | //--------------------------------------------------------------------- |
| 426 | |
| 427 | template <typename PatternChar, typename SubjectChar> |
| 428 | int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( |
| 429 | StringSearch<PatternChar, SubjectChar>* search, |
| 430 | Vector<const SubjectChar> subject, |
| 431 | int start_index) { |
| 432 | Vector<const PatternChar> pattern = search->pattern_; |
| 433 | int subject_length = subject.length(); |
| 434 | int pattern_length = pattern.length(); |
| 435 | int* char_occurrences = search->bad_char_table(); |
| 436 | int badness = -pattern_length; |
| 437 | |
| 438 | // How bad we are doing without a good-suffix table. |
| 439 | PatternChar last_char = pattern[pattern_length - 1]; |
| 440 | int last_char_shift = pattern_length - 1 - |
| 441 | CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char)); |
| 442 | // Perform search |
| 443 | int index = start_index; // No matches found prior to this index. |
| 444 | while (index <= subject_length - pattern_length) { |
| 445 | int j = pattern_length - 1; |
| 446 | int subject_char; |
| 447 | while (last_char != (subject_char = subject[index + j])) { |
| 448 | int bc_occ = CharOccurrence(char_occurrences, subject_char); |
| 449 | int shift = j - bc_occ; |
| 450 | index += shift; |
| 451 | badness += 1 - shift; // at most zero, so badness cannot increase. |
| 452 | if (index > subject_length - pattern_length) { |
| 453 | return -1; |
| 454 | } |
| 455 | } |
| 456 | j--; |
| 457 | while (j >= 0 && pattern[j] == (subject[index + j])) j--; |
| 458 | if (j < 0) { |
| 459 | return index; |
| 460 | } else { |
| 461 | index += last_char_shift; |
| 462 | // Badness increases by the number of characters we have |
| 463 | // checked, and decreases by the number of characters we |
| 464 | // can skip by shifting. It's a measure of how we are doing |
| 465 | // compared to reading each character exactly once. |
| 466 | badness += (pattern_length - j) - last_char_shift; |
| 467 | if (badness > 0) { |
| 468 | search->PopulateBoyerMooreTable(); |
| 469 | search->strategy_ = &BoyerMooreSearch; |
| 470 | return BoyerMooreSearch(search, subject, index); |
| 471 | } |
| 472 | } |
| 473 | } |
| 474 | return -1; |
| 475 | } |
| 476 | |
| 477 | |
| 478 | template <typename PatternChar, typename SubjectChar> |
| 479 | void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { |
| 480 | int pattern_length = pattern_.length(); |
| 481 | |
| 482 | int* bad_char_occurrence = bad_char_table(); |
| 483 | |
| 484 | // Only preprocess at most kBMMaxShift last characters of pattern. |
| 485 | int start = start_; |
| 486 | // Run forwards to populate bad_char_table, so that *last* instance |
| 487 | // of character equivalence class is the one registered. |
| 488 | // Notice: Doesn't include the last character. |
| 489 | int table_size = AlphabetSize(); |
| 490 | if (start == 0) { // All patterns less than kBMMaxShift in length. |
| 491 | memset(bad_char_occurrence, |
| 492 | -1, |
| 493 | table_size * sizeof(*bad_char_occurrence)); |
| 494 | } else { |
| 495 | for (int i = 0; i < table_size; i++) { |
| 496 | bad_char_occurrence[i] = start - 1; |
| 497 | } |
| 498 | } |
| 499 | for (int i = start; i < pattern_length - 1; i++) { |
| 500 | PatternChar c = pattern_[i]; |
| 501 | int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); |
| 502 | bad_char_occurrence[bucket] = i; |
| 503 | } |
| 504 | } |
| 505 | |
| 506 | //--------------------------------------------------------------------- |
| 507 | // Linear string search with bailout to BMH. |
| 508 | //--------------------------------------------------------------------- |
| 509 | |
| 510 | // Simple linear search for short patterns, which bails out if the string |
| 511 | // isn't found very early in the subject. Upgrades to BoyerMooreHorspool. |
| 512 | template <typename PatternChar, typename SubjectChar> |
| 513 | int StringSearch<PatternChar, SubjectChar>::InitialSearch( |
| 514 | StringSearch<PatternChar, SubjectChar>* search, |
| 515 | Vector<const SubjectChar> subject, |
| 516 | int index) { |
| 517 | Vector<const PatternChar> pattern = search->pattern_; |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 518 | int pattern_length = pattern.length(); |
| 519 | // Badness is a count of how much work we have done. When we have |
| 520 | // done enough work we decide it's probably worth switching to a better |
| 521 | // algorithm. |
| 522 | int badness = -10 - (pattern_length << 2); |
| 523 | |
| 524 | // We know our pattern is at least 2 characters, we cache the first so |
| 525 | // the common case of the first character not matching is faster. |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 526 | for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 527 | badness++; |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 528 | if (badness <= 0) { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 529 | i = FindFirstCharacter(pattern, subject, i); |
| 530 | if (i == -1) return -1; |
| 531 | DCHECK_LE(i, n); |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 532 | int j = 1; |
| 533 | do { |
| 534 | if (pattern[j] != subject[i + j]) { |
| 535 | break; |
| 536 | } |
| 537 | j++; |
| 538 | } while (j < pattern_length); |
| 539 | if (j == pattern_length) { |
| 540 | return i; |
| 541 | } |
| 542 | badness += j; |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 543 | } else { |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 544 | search->PopulateBoyerMooreHorspoolTable(); |
| 545 | search->strategy_ = &BoyerMooreHorspoolSearch; |
| 546 | return BoyerMooreHorspoolSearch(search, subject, i); |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 547 | } |
| 548 | } |
| 549 | return -1; |
| 550 | } |
| 551 | |
| 552 | |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 553 | // Perform a a single stand-alone search. |
| 554 | // If searching multiple times for the same pattern, a search |
| 555 | // object should be constructed once and the Search function then called |
| 556 | // for each search. |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 557 | template <typename SubjectChar, typename PatternChar> |
Ben Murdoch | 3ef787d | 2012-04-12 10:51:47 +0100 | [diff] [blame] | 558 | int SearchString(Isolate* isolate, |
| 559 | Vector<const SubjectChar> subject, |
| 560 | Vector<const PatternChar> pattern, |
| 561 | int start_index) { |
Steve Block | 44f0eee | 2011-05-26 01:26:41 +0100 | [diff] [blame] | 562 | StringSearch<PatternChar, SubjectChar> search(isolate, pattern); |
Ben Murdoch | f87a203 | 2010-10-22 12:50:53 +0100 | [diff] [blame] | 563 | return search.Search(subject, start_index); |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 564 | } |
| 565 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 566 | } // namespace internal |
| 567 | } // namespace v8 |
Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame] | 568 | |
| 569 | #endif // V8_STRING_SEARCH_H_ |