| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 1 | /* stringlib: fastsearch implementation */ | 
|  | 2 |  | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 3 | #define STRINGLIB_FASTSEARCH_H | 
|  | 4 |  | 
|  | 5 | /* fast search/count implementation, based on a mix between boyer- | 
|  | 6 | moore and horspool, with a few more bells and whistles on the top. | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 7 | for some more background, see: http://effbot.org/zone/stringlib.htm */ | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 8 |  | 
|  | 9 | /* note: fastsearch may access s[n], which isn't a problem when using | 
|  | 10 | Python's ordinary string types, but may cause problems if you're | 
|  | 11 | using this code in other contexts.  also, the count mode returns -1 | 
|  | 12 | if there cannot possible be a match in the target string, and 0 if | 
|  | 13 | it has actually checked for matches, but didn't find any.  callers | 
|  | 14 | beware! */ | 
|  | 15 |  | 
|  | 16 | #define FAST_COUNT 0 | 
|  | 17 | #define FAST_SEARCH 1 | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 18 | #define FAST_RSEARCH 2 | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 19 |  | 
| Antoine Pitrou | f068f94 | 2010-01-13 14:19:12 +0000 | [diff] [blame] | 20 | #if LONG_BIT >= 128 | 
|  | 21 | #define STRINGLIB_BLOOM_WIDTH 128 | 
|  | 22 | #elif LONG_BIT >= 64 | 
|  | 23 | #define STRINGLIB_BLOOM_WIDTH 64 | 
|  | 24 | #elif LONG_BIT >= 32 | 
|  | 25 | #define STRINGLIB_BLOOM_WIDTH 32 | 
|  | 26 | #else | 
|  | 27 | #error "LONG_BIT is smaller than 32" | 
|  | 28 | #endif | 
|  | 29 |  | 
|  | 30 | #define STRINGLIB_BLOOM_ADD(mask, ch) \ | 
|  | 31 | ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) | 
|  | 32 | #define STRINGLIB_BLOOM(mask, ch)     \ | 
|  | 33 | ((mask &  (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) | 
| Antoine Pitrou | f2c5484 | 2010-01-13 08:07:53 +0000 | [diff] [blame] | 34 |  | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 35 | Py_LOCAL_INLINE(Py_ssize_t) | 
| Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 36 | FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 37 | const STRINGLIB_CHAR* p, Py_ssize_t m, | 
| Antoine Pitrou | f2c5484 | 2010-01-13 08:07:53 +0000 | [diff] [blame] | 38 | Py_ssize_t maxcount, int mode) | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 39 | { | 
| Antoine Pitrou | f068f94 | 2010-01-13 14:19:12 +0000 | [diff] [blame] | 40 | unsigned long mask; | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 41 | Py_ssize_t skip, count = 0; | 
|  | 42 | Py_ssize_t i, j, mlast, w; | 
|  | 43 |  | 
|  | 44 | w = n - m; | 
|  | 45 |  | 
| Antoine Pitrou | f2c5484 | 2010-01-13 08:07:53 +0000 | [diff] [blame] | 46 | if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 47 | return -1; | 
|  | 48 |  | 
|  | 49 | /* look for special cases */ | 
|  | 50 | if (m <= 1) { | 
|  | 51 | if (m <= 0) | 
|  | 52 | return -1; | 
|  | 53 | /* use special case for 1-character strings */ | 
|  | 54 | if (mode == FAST_COUNT) { | 
|  | 55 | for (i = 0; i < n; i++) | 
| Antoine Pitrou | f2c5484 | 2010-01-13 08:07:53 +0000 | [diff] [blame] | 56 | if (s[i] == p[0]) { | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 57 | count++; | 
| Antoine Pitrou | f2c5484 | 2010-01-13 08:07:53 +0000 | [diff] [blame] | 58 | if (count == maxcount) | 
|  | 59 | return maxcount; | 
|  | 60 | } | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 61 | return count; | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 62 | } else if (mode == FAST_SEARCH) { | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 63 | for (i = 0; i < n; i++) | 
|  | 64 | if (s[i] == p[0]) | 
|  | 65 | return i; | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 66 | } else {    /* FAST_RSEARCH */ | 
|  | 67 | for (i = n - 1; i > -1; i--) | 
|  | 68 | if (s[i] == p[0]) | 
|  | 69 | return i; | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 70 | } | 
|  | 71 | return -1; | 
|  | 72 | } | 
|  | 73 |  | 
|  | 74 | mlast = m - 1; | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 75 | skip = mlast - 1; | 
| Antoine Pitrou | f2c5484 | 2010-01-13 08:07:53 +0000 | [diff] [blame] | 76 | mask = 0; | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 77 |  | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 78 | if (mode != FAST_RSEARCH) { | 
|  | 79 |  | 
|  | 80 | /* create compressed boyer-moore delta 1 table */ | 
|  | 81 |  | 
|  | 82 | /* process pattern[:-1] */ | 
| Antoine Pitrou | f2c5484 | 2010-01-13 08:07:53 +0000 | [diff] [blame] | 83 | for (i = 0; i < mlast; i++) { | 
| Antoine Pitrou | f068f94 | 2010-01-13 14:19:12 +0000 | [diff] [blame] | 84 | STRINGLIB_BLOOM_ADD(mask, p[i]); | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 85 | if (p[i] == p[mlast]) | 
|  | 86 | skip = mlast - i - 1; | 
|  | 87 | } | 
|  | 88 | /* process pattern[-1] outside the loop */ | 
| Antoine Pitrou | f068f94 | 2010-01-13 14:19:12 +0000 | [diff] [blame] | 89 | STRINGLIB_BLOOM_ADD(mask, p[mlast]); | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 90 |  | 
|  | 91 | for (i = 0; i <= w; i++) { | 
|  | 92 | /* note: using mlast in the skip path slows things down on x86 */ | 
|  | 93 | if (s[i+m-1] == p[m-1]) { | 
|  | 94 | /* candidate match */ | 
|  | 95 | for (j = 0; j < mlast; j++) | 
|  | 96 | if (s[i+j] != p[j]) | 
|  | 97 | break; | 
|  | 98 | if (j == mlast) { | 
|  | 99 | /* got a match! */ | 
|  | 100 | if (mode != FAST_COUNT) | 
|  | 101 | return i; | 
|  | 102 | count++; | 
| Antoine Pitrou | f2c5484 | 2010-01-13 08:07:53 +0000 | [diff] [blame] | 103 | if (count == maxcount) | 
|  | 104 | return maxcount; | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 105 | i = i + mlast; | 
|  | 106 | continue; | 
|  | 107 | } | 
|  | 108 | /* miss: check if next character is part of pattern */ | 
| Antoine Pitrou | f068f94 | 2010-01-13 14:19:12 +0000 | [diff] [blame] | 109 | if (!STRINGLIB_BLOOM(mask, s[i+m])) | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 110 | i = i + m; | 
|  | 111 | else | 
|  | 112 | i = i + skip; | 
|  | 113 | } else { | 
|  | 114 | /* skip: check if next character is part of pattern */ | 
| Antoine Pitrou | f068f94 | 2010-01-13 14:19:12 +0000 | [diff] [blame] | 115 | if (!STRINGLIB_BLOOM(mask, s[i+m])) | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 116 | i = i + m; | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 117 | } | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 118 | } | 
|  | 119 | } else {    /* FAST_RSEARCH */ | 
|  | 120 |  | 
|  | 121 | /* create compressed boyer-moore delta 1 table */ | 
|  | 122 |  | 
|  | 123 | /* process pattern[0] outside the loop */ | 
| Antoine Pitrou | f068f94 | 2010-01-13 14:19:12 +0000 | [diff] [blame] | 124 | STRINGLIB_BLOOM_ADD(mask, p[0]); | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 125 | /* process pattern[:0:-1] */ | 
|  | 126 | for (i = mlast; i > 0; i--) { | 
| Antoine Pitrou | f068f94 | 2010-01-13 14:19:12 +0000 | [diff] [blame] | 127 | STRINGLIB_BLOOM_ADD(mask, p[i]); | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 128 | if (p[i] == p[0]) | 
|  | 129 | skip = i - 1; | 
|  | 130 | } | 
|  | 131 |  | 
|  | 132 | for (i = w; i >= 0; i--) { | 
|  | 133 | if (s[i] == p[0]) { | 
|  | 134 | /* candidate match */ | 
|  | 135 | for (j = mlast; j > 0; j--) | 
|  | 136 | if (s[i+j] != p[j]) | 
|  | 137 | break; | 
|  | 138 | if (j == 0) | 
|  | 139 | /* got a match! */ | 
|  | 140 | return i; | 
|  | 141 | /* miss: check if previous character is part of pattern */ | 
| Florent Xicluna | eb6f3ea | 2010-08-08 22:07:16 +0000 | [diff] [blame] | 142 | if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 143 | i = i - m; | 
|  | 144 | else | 
|  | 145 | i = i - skip; | 
|  | 146 | } else { | 
|  | 147 | /* skip: check if previous character is part of pattern */ | 
| Florent Xicluna | eb6f3ea | 2010-08-08 22:07:16 +0000 | [diff] [blame] | 148 | if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) | 
| Antoine Pitrou | da2ecaf | 2010-01-02 21:40:36 +0000 | [diff] [blame] | 149 | i = i - m; | 
|  | 150 | } | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 151 | } | 
|  | 152 | } | 
|  | 153 |  | 
|  | 154 | if (mode != FAST_COUNT) | 
|  | 155 | return -1; | 
|  | 156 | return count; | 
|  | 157 | } | 
|  | 158 |  |