Steve Block | 5915150 | 2010-09-22 15:07:15 +0100 | [diff] [blame^] | 1 | // Copyright 2010 the V8 project authors. All rights reserved. |
| 2 | // Redistribution and use in source and binary forms, with or without |
| 3 | // modification, are permitted provided that the following conditions are |
| 4 | // met: |
| 5 | // |
| 6 | // * Redistributions of source code must retain the above copyright |
| 7 | // notice, this list of conditions and the following disclaimer. |
| 8 | // * Redistributions in binary form must reproduce the above |
| 9 | // copyright notice, this list of conditions and the following |
| 10 | // disclaimer in the documentation and/or other materials provided |
| 11 | // with the distribution. |
| 12 | // * Neither the name of Google Inc. nor the names of its |
| 13 | // contributors may be used to endorse or promote products derived |
| 14 | // from this software without specific prior written permission. |
| 15 | // |
| 16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 27 | |
| 28 | #ifndef V8_STRING_SEARCH_H_ |
| 29 | #define V8_STRING_SEARCH_H_ |
| 30 | |
| 31 | namespace v8 { |
| 32 | namespace internal { |
| 33 | |
| 34 | |
| 35 | // Cap on the maximal shift in the Boyer-Moore implementation. By setting a |
| 36 | // limit, we can fix the size of tables. For a needle longer than this limit, |
| 37 | // search will not be optimal, since we only build tables for a smaller suffix |
| 38 | // of the string, which is a safe approximation. |
| 39 | static const int kBMMaxShift = 250; |
| 40 | // Reduce alphabet to this size. |
| 41 | // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size |
| 42 | // proportional to the input alphabet. We reduce the alphabet size by |
| 43 | // equating input characters modulo a smaller alphabet size. This gives |
| 44 | // a potentially less efficient searching, but is a safe approximation. |
| 45 | // For needles using only characters in the same Unicode 256-code point page, |
| 46 | // there is no search speed degradation. |
| 47 | static const int kBMAlphabetSize = 256; |
| 48 | // For patterns below this length, the skip length of Boyer-Moore is too short |
| 49 | // to compensate for the algorithmic overhead compared to simple brute force. |
| 50 | static const int kBMMinPatternLength = 7; |
| 51 | |
| 52 | // Holds the two buffers used by Boyer-Moore string search's Good Suffix |
| 53 | // shift. Only allows the last kBMMaxShift characters of the needle |
| 54 | // to be indexed. |
| 55 | class BMGoodSuffixBuffers { |
| 56 | public: |
| 57 | BMGoodSuffixBuffers() {} |
| 58 | inline void Initialize(int needle_length) { |
| 59 | ASSERT(needle_length > 1); |
| 60 | int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift; |
| 61 | int len = needle_length - start; |
| 62 | biased_suffixes_ = suffixes_ - start; |
| 63 | biased_good_suffix_shift_ = good_suffix_shift_ - start; |
| 64 | for (int i = 0; i <= len; i++) { |
| 65 | good_suffix_shift_[i] = len; |
| 66 | } |
| 67 | } |
| 68 | inline int& suffix(int index) { |
| 69 | ASSERT(biased_suffixes_ + index >= suffixes_); |
| 70 | return biased_suffixes_[index]; |
| 71 | } |
| 72 | inline int& shift(int index) { |
| 73 | ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_); |
| 74 | return biased_good_suffix_shift_[index]; |
| 75 | } |
| 76 | private: |
| 77 | int suffixes_[kBMMaxShift + 1]; |
| 78 | int good_suffix_shift_[kBMMaxShift + 1]; |
| 79 | int* biased_suffixes_; |
| 80 | int* biased_good_suffix_shift_; |
| 81 | DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers); |
| 82 | }; |
| 83 | |
| 84 | // buffers reused by BoyerMoore |
| 85 | struct BMBuffers { |
| 86 | public: |
| 87 | static int bad_char_occurrence[kBMAlphabetSize]; |
| 88 | static BMGoodSuffixBuffers bmgs_buffers; |
| 89 | }; |
| 90 | |
| 91 | // State of the string match tables. |
| 92 | // SIMPLE: No usable content in the buffers. |
| 93 | // BOYER_MOORE_HORSPOOL: The bad_char_occurence table has been populated. |
| 94 | // BOYER_MOORE: The bmgs_buffers tables have also been populated. |
| 95 | // Whenever starting with a new needle, one should call InitializeStringSearch |
| 96 | // to determine which search strategy to use, and in the case of a long-needle |
| 97 | // strategy, the call also initializes the algorithm to SIMPLE. |
| 98 | enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE }; |
| 99 | static StringSearchAlgorithm algorithm; |
| 100 | |
| 101 | |
| 102 | // Compute the bad-char table for Boyer-Moore in the static buffer. |
| 103 | template <typename PatternChar> |
| 104 | static void BoyerMoorePopulateBadCharTable(Vector<const PatternChar> pattern) { |
| 105 | // Only preprocess at most kBMMaxShift last characters of pattern. |
| 106 | int start = Max(pattern.length() - kBMMaxShift, 0); |
| 107 | // Run forwards to populate bad_char_table, so that *last* instance |
| 108 | // of character equivalence class is the one registered. |
| 109 | // Notice: Doesn't include the last character. |
| 110 | int table_size = (sizeof(PatternChar) == 1) ? String::kMaxAsciiCharCode + 1 |
| 111 | : kBMAlphabetSize; |
| 112 | if (start == 0) { // All patterns less than kBMMaxShift in length. |
| 113 | memset(BMBuffers::bad_char_occurrence, |
| 114 | -1, |
| 115 | table_size * sizeof(*BMBuffers::bad_char_occurrence)); |
| 116 | } else { |
| 117 | for (int i = 0; i < table_size; i++) { |
| 118 | BMBuffers::bad_char_occurrence[i] = start - 1; |
| 119 | } |
| 120 | } |
| 121 | for (int i = start; i < pattern.length() - 1; i++) { |
| 122 | PatternChar c = pattern[i]; |
| 123 | int bucket = (sizeof(PatternChar) ==1) ? c : c % kBMAlphabetSize; |
| 124 | BMBuffers::bad_char_occurrence[bucket] = i; |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | |
| 129 | template <typename PatternChar> |
| 130 | static void BoyerMoorePopulateGoodSuffixTable( |
| 131 | Vector<const PatternChar> pattern) { |
| 132 | int m = pattern.length(); |
| 133 | int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; |
| 134 | int len = m - start; |
| 135 | // Compute Good Suffix tables. |
| 136 | BMBuffers::bmgs_buffers.Initialize(m); |
| 137 | |
| 138 | BMBuffers::bmgs_buffers.shift(m-1) = 1; |
| 139 | BMBuffers::bmgs_buffers.suffix(m) = m + 1; |
| 140 | PatternChar last_char = pattern[m - 1]; |
| 141 | int suffix = m + 1; |
| 142 | { |
| 143 | int i = m; |
| 144 | while (i > start) { |
| 145 | PatternChar c = pattern[i - 1]; |
| 146 | while (suffix <= m && c != pattern[suffix - 1]) { |
| 147 | if (BMBuffers::bmgs_buffers.shift(suffix) == len) { |
| 148 | BMBuffers::bmgs_buffers.shift(suffix) = suffix - i; |
| 149 | } |
| 150 | suffix = BMBuffers::bmgs_buffers.suffix(suffix); |
| 151 | } |
| 152 | BMBuffers::bmgs_buffers.suffix(--i) = --suffix; |
| 153 | if (suffix == m) { |
| 154 | // No suffix to extend, so we check against last_char only. |
| 155 | while ((i > start) && (pattern[i - 1] != last_char)) { |
| 156 | if (BMBuffers::bmgs_buffers.shift(m) == len) { |
| 157 | BMBuffers::bmgs_buffers.shift(m) = m - i; |
| 158 | } |
| 159 | BMBuffers::bmgs_buffers.suffix(--i) = m; |
| 160 | } |
| 161 | if (i > start) { |
| 162 | BMBuffers::bmgs_buffers.suffix(--i) = --suffix; |
| 163 | } |
| 164 | } |
| 165 | } |
| 166 | } |
| 167 | if (suffix < m) { |
| 168 | for (int i = start; i <= m; i++) { |
| 169 | if (BMBuffers::bmgs_buffers.shift(i) == len) { |
| 170 | BMBuffers::bmgs_buffers.shift(i) = suffix - start; |
| 171 | } |
| 172 | if (i == suffix) { |
| 173 | suffix = BMBuffers::bmgs_buffers.suffix(suffix); |
| 174 | } |
| 175 | } |
| 176 | } |
| 177 | } |
| 178 | |
| 179 | |
| 180 | template <typename SubjectChar, typename PatternChar> |
| 181 | static inline int CharOccurrence(int char_code) { |
| 182 | if (sizeof(SubjectChar) == 1) { |
| 183 | return BMBuffers::bad_char_occurrence[char_code]; |
| 184 | } |
| 185 | if (sizeof(PatternChar) == 1) { |
| 186 | if (char_code > String::kMaxAsciiCharCode) { |
| 187 | return -1; |
| 188 | } |
| 189 | return BMBuffers::bad_char_occurrence[char_code]; |
| 190 | } |
| 191 | return BMBuffers::bad_char_occurrence[char_code % kBMAlphabetSize]; |
| 192 | } |
| 193 | |
| 194 | |
| 195 | // Restricted simplified Boyer-Moore string matching. |
| 196 | // Uses only the bad-shift table of Boyer-Moore and only uses it |
| 197 | // for the character compared to the last character of the needle. |
| 198 | template <typename SubjectChar, typename PatternChar> |
| 199 | static int BoyerMooreHorspool(Vector<const SubjectChar> subject, |
| 200 | Vector<const PatternChar> pattern, |
| 201 | int start_index, |
| 202 | bool* complete) { |
| 203 | ASSERT(algorithm <= BOYER_MOORE_HORSPOOL); |
| 204 | int n = subject.length(); |
| 205 | int m = pattern.length(); |
| 206 | |
| 207 | int badness = -m; |
| 208 | |
| 209 | // How bad we are doing without a good-suffix table. |
| 210 | int idx; // No matches found prior to this index. |
| 211 | PatternChar last_char = pattern[m - 1]; |
| 212 | int last_char_shift = |
| 213 | m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char); |
| 214 | // Perform search |
| 215 | for (idx = start_index; idx <= n - m;) { |
| 216 | int j = m - 1; |
| 217 | int c; |
| 218 | while (last_char != (c = subject[idx + j])) { |
| 219 | int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c); |
| 220 | int shift = j - bc_occ; |
| 221 | idx += shift; |
| 222 | badness += 1 - shift; // at most zero, so badness cannot increase. |
| 223 | if (idx > n - m) { |
| 224 | *complete = true; |
| 225 | return -1; |
| 226 | } |
| 227 | } |
| 228 | j--; |
| 229 | while (j >= 0 && pattern[j] == (subject[idx + j])) j--; |
| 230 | if (j < 0) { |
| 231 | *complete = true; |
| 232 | return idx; |
| 233 | } else { |
| 234 | idx += last_char_shift; |
| 235 | // Badness increases by the number of characters we have |
| 236 | // checked, and decreases by the number of characters we |
| 237 | // can skip by shifting. It's a measure of how we are doing |
| 238 | // compared to reading each character exactly once. |
| 239 | badness += (m - j) - last_char_shift; |
| 240 | if (badness > 0) { |
| 241 | *complete = false; |
| 242 | return idx; |
| 243 | } |
| 244 | } |
| 245 | } |
| 246 | *complete = true; |
| 247 | return -1; |
| 248 | } |
| 249 | |
| 250 | |
| 251 | template <typename SubjectChar, typename PatternChar> |
| 252 | static int BoyerMooreIndexOf(Vector<const SubjectChar> subject, |
| 253 | Vector<const PatternChar> pattern, |
| 254 | int idx) { |
| 255 | ASSERT(algorithm <= BOYER_MOORE); |
| 256 | int n = subject.length(); |
| 257 | int m = pattern.length(); |
| 258 | // Only preprocess at most kBMMaxShift last characters of pattern. |
| 259 | int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; |
| 260 | |
| 261 | PatternChar last_char = pattern[m - 1]; |
| 262 | // Continue search from i. |
| 263 | while (idx <= n - m) { |
| 264 | int j = m - 1; |
| 265 | SubjectChar c; |
| 266 | while (last_char != (c = subject[idx + j])) { |
| 267 | int shift = j - CharOccurrence<SubjectChar, PatternChar>(c); |
| 268 | idx += shift; |
| 269 | if (idx > n - m) { |
| 270 | return -1; |
| 271 | } |
| 272 | } |
| 273 | while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--; |
| 274 | if (j < 0) { |
| 275 | return idx; |
| 276 | } else if (j < start) { |
| 277 | // we have matched more than our tables allow us to be smart about. |
| 278 | // Fall back on BMH shift. |
| 279 | idx += m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char); |
| 280 | } else { |
| 281 | int gs_shift = BMBuffers::bmgs_buffers.shift(j + 1); |
| 282 | int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c); |
| 283 | int shift = j - bc_occ; |
| 284 | if (gs_shift > shift) { |
| 285 | shift = gs_shift; |
| 286 | } |
| 287 | idx += shift; |
| 288 | } |
| 289 | } |
| 290 | |
| 291 | return -1; |
| 292 | } |
| 293 | |
| 294 | |
| 295 | // Trivial string search for shorter strings. |
| 296 | // On return, if "complete" is set to true, the return value is the |
| 297 | // final result of searching for the patter in the subject. |
| 298 | // If "complete" is set to false, the return value is the index where |
| 299 | // further checking should start, i.e., it's guaranteed that the pattern |
| 300 | // does not occur at a position prior to the returned index. |
| 301 | template <typename PatternChar, typename SubjectChar> |
| 302 | static int SimpleIndexOf(Vector<const SubjectChar> subject, |
| 303 | Vector<const PatternChar> pattern, |
| 304 | int idx, |
| 305 | bool* complete) { |
| 306 | ASSERT(pattern.length() > 1); |
| 307 | int pattern_length = pattern.length(); |
| 308 | // Badness is a count of how much work we have done. When we have |
| 309 | // done enough work we decide it's probably worth switching to a better |
| 310 | // algorithm. |
| 311 | int badness = -10 - (pattern_length << 2); |
| 312 | |
| 313 | // We know our pattern is at least 2 characters, we cache the first so |
| 314 | // the common case of the first character not matching is faster. |
| 315 | PatternChar pattern_first_char = pattern[0]; |
| 316 | for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) { |
| 317 | badness++; |
| 318 | if (badness > 0) { |
| 319 | *complete = false; |
| 320 | return i; |
| 321 | } |
| 322 | if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { |
| 323 | const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( |
| 324 | memchr(subject.start() + i, |
| 325 | pattern_first_char, |
| 326 | n - i + 1)); |
| 327 | if (pos == NULL) { |
| 328 | *complete = true; |
| 329 | return -1; |
| 330 | } |
| 331 | i = static_cast<int>(pos - subject.start()); |
| 332 | } else { |
| 333 | if (subject[i] != pattern_first_char) continue; |
| 334 | } |
| 335 | int j = 1; |
| 336 | do { |
| 337 | if (pattern[j] != subject[i+j]) { |
| 338 | break; |
| 339 | } |
| 340 | j++; |
| 341 | } while (j < pattern_length); |
| 342 | if (j == pattern_length) { |
| 343 | *complete = true; |
| 344 | return i; |
| 345 | } |
| 346 | badness += j; |
| 347 | } |
| 348 | *complete = true; |
| 349 | return -1; |
| 350 | } |
| 351 | |
| 352 | // Simple indexOf that never bails out. For short patterns only. |
| 353 | template <typename PatternChar, typename SubjectChar> |
| 354 | static int SimpleIndexOf(Vector<const SubjectChar> subject, |
| 355 | Vector<const PatternChar> pattern, |
| 356 | int idx) { |
| 357 | int pattern_length = pattern.length(); |
| 358 | PatternChar pattern_first_char = pattern[0]; |
| 359 | for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) { |
| 360 | if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { |
| 361 | const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( |
| 362 | memchr(subject.start() + i, |
| 363 | pattern_first_char, |
| 364 | n - i + 1)); |
| 365 | if (pos == NULL) return -1; |
| 366 | i = static_cast<int>(pos - subject.start()); |
| 367 | } else { |
| 368 | if (subject[i] != pattern_first_char) continue; |
| 369 | } |
| 370 | int j = 1; |
| 371 | while (j < pattern_length) { |
| 372 | if (pattern[j] != subject[i+j]) { |
| 373 | break; |
| 374 | } |
| 375 | j++; |
| 376 | } |
| 377 | if (j == pattern_length) { |
| 378 | return i; |
| 379 | } |
| 380 | } |
| 381 | return -1; |
| 382 | } |
| 383 | |
| 384 | |
| 385 | // Strategy for searching for a string in another string. |
| 386 | enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG }; |
| 387 | |
| 388 | |
| 389 | template <typename PatternChar> |
| 390 | static inline StringSearchStrategy InitializeStringSearch( |
| 391 | Vector<const PatternChar> pat, bool ascii_subject) { |
| 392 | // We have an ASCII haystack and a non-ASCII needle. Check if there |
| 393 | // really is a non-ASCII character in the needle and bail out if there |
| 394 | // is. |
| 395 | if (ascii_subject && sizeof(PatternChar) > 1) { |
| 396 | for (int i = 0; i < pat.length(); i++) { |
| 397 | uc16 c = pat[i]; |
| 398 | if (c > String::kMaxAsciiCharCode) { |
| 399 | return SEARCH_FAIL; |
| 400 | } |
| 401 | } |
| 402 | } |
| 403 | if (pat.length() < kBMMinPatternLength) { |
| 404 | return SEARCH_SHORT; |
| 405 | } |
| 406 | algorithm = SIMPLE_SEARCH; |
| 407 | return SEARCH_LONG; |
| 408 | } |
| 409 | |
| 410 | |
| 411 | // Dispatch long needle searches to different algorithms. |
| 412 | template <typename SubjectChar, typename PatternChar> |
| 413 | static int ComplexIndexOf(Vector<const SubjectChar> sub, |
| 414 | Vector<const PatternChar> pat, |
| 415 | int start_index) { |
| 416 | ASSERT(pat.length() >= kBMMinPatternLength); |
| 417 | // Try algorithms in order of increasing setup cost and expected performance. |
| 418 | bool complete; |
| 419 | int idx = start_index; |
| 420 | switch (algorithm) { |
| 421 | case SIMPLE_SEARCH: |
| 422 | idx = SimpleIndexOf(sub, pat, idx, &complete); |
| 423 | if (complete) return idx; |
| 424 | BoyerMoorePopulateBadCharTable(pat); |
| 425 | algorithm = BOYER_MOORE_HORSPOOL; |
| 426 | // FALLTHROUGH. |
| 427 | case BOYER_MOORE_HORSPOOL: |
| 428 | idx = BoyerMooreHorspool(sub, pat, idx, &complete); |
| 429 | if (complete) return idx; |
| 430 | // Build the Good Suffix table and continue searching. |
| 431 | BoyerMoorePopulateGoodSuffixTable(pat); |
| 432 | algorithm = BOYER_MOORE; |
| 433 | // FALLTHROUGH. |
| 434 | case BOYER_MOORE: |
| 435 | return BoyerMooreIndexOf(sub, pat, idx); |
| 436 | } |
| 437 | UNREACHABLE(); |
| 438 | return -1; |
| 439 | } |
| 440 | |
| 441 | |
| 442 | // Dispatch to different search strategies for a single search. |
| 443 | // If searching multiple times on the same needle, the search |
| 444 | // strategy should only be computed once and then dispatch to different |
| 445 | // loops. |
| 446 | template <typename SubjectChar, typename PatternChar> |
| 447 | static int StringSearch(Vector<const SubjectChar> sub, |
| 448 | Vector<const PatternChar> pat, |
| 449 | int start_index) { |
| 450 | bool ascii_subject = (sizeof(SubjectChar) == 1); |
| 451 | StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject); |
| 452 | switch (strategy) { |
| 453 | case SEARCH_FAIL: return -1; |
| 454 | case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index); |
| 455 | case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index); |
| 456 | } |
| 457 | UNREACHABLE(); |
| 458 | return -1; |
| 459 | } |
| 460 | |
| 461 | }} // namespace v8::internal |
| 462 | |
| 463 | #endif // V8_STRING_SEARCH_H_ |