blob: 7525951c065084e5d0f626af4e92daa3da06fdf2 [file] [log] [blame]
Thomas Wouters477c8d52006-05-27 19:21:47 +00001/* stringlib: fastsearch implementation */
2
3#ifndef STRINGLIB_FASTSEARCH_H
4#define STRINGLIB_FASTSEARCH_H
5
6/* fast search/count implementation, based on a mix between boyer-
7 moore and horspool, with a few more bells and whistles on the top.
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +00008 for some more background, see: http://effbot.org/zone/stringlib.htm */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009
10/* note: fastsearch may access s[n], which isn't a problem when using
11 Python's ordinary string types, but may cause problems if you're
12 using this code in other contexts. also, the count mode returns -1
13 if there cannot possible be a match in the target string, and 0 if
14 it has actually checked for matches, but didn't find any. callers
15 beware! */
16
17#define FAST_COUNT 0
18#define FAST_SEARCH 1
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +000019#define FAST_RSEARCH 2
Thomas Wouters477c8d52006-05-27 19:21:47 +000020
Antoine Pitrouf068f942010-01-13 14:19:12 +000021#if LONG_BIT >= 128
22#define STRINGLIB_BLOOM_WIDTH 128
23#elif LONG_BIT >= 64
24#define STRINGLIB_BLOOM_WIDTH 64
25#elif LONG_BIT >= 32
26#define STRINGLIB_BLOOM_WIDTH 32
27#else
28#error "LONG_BIT is smaller than 32"
29#endif
30
31#define STRINGLIB_BLOOM_ADD(mask, ch) \
32 ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
33#define STRINGLIB_BLOOM(mask, ch) \
34 ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
Antoine Pitrouf2c54842010-01-13 08:07:53 +000035
Thomas Wouters477c8d52006-05-27 19:21:47 +000036Py_LOCAL_INLINE(Py_ssize_t)
37fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
38 const STRINGLIB_CHAR* p, Py_ssize_t m,
Antoine Pitrouf2c54842010-01-13 08:07:53 +000039 Py_ssize_t maxcount, int mode)
Thomas Wouters477c8d52006-05-27 19:21:47 +000040{
Antoine Pitrouf068f942010-01-13 14:19:12 +000041 unsigned long mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +000042 Py_ssize_t skip, count = 0;
43 Py_ssize_t i, j, mlast, w;
44
45 w = n - m;
46
Antoine Pitrouf2c54842010-01-13 08:07:53 +000047 if (w < 0 || (mode == FAST_COUNT && maxcount == 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +000048 return -1;
49
50 /* look for special cases */
51 if (m <= 1) {
52 if (m <= 0)
53 return -1;
54 /* use special case for 1-character strings */
55 if (mode == FAST_COUNT) {
56 for (i = 0; i < n; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000057 if (s[i] == p[0]) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000058 count++;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000059 if (count == maxcount)
60 return maxcount;
61 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000062 return count;
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +000063 } else if (mode == FAST_SEARCH) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000064 for (i = 0; i < n; i++)
65 if (s[i] == p[0])
66 return i;
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +000067 } else { /* FAST_RSEARCH */
68 for (i = n - 1; i > -1; i--)
69 if (s[i] == p[0])
70 return i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000071 }
72 return -1;
73 }
74
75 mlast = m - 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000076 skip = mlast - 1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000077 mask = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000078
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +000079 if (mode != FAST_RSEARCH) {
80
81 /* create compressed boyer-moore delta 1 table */
82
83 /* process pattern[:-1] */
Antoine Pitrouf2c54842010-01-13 08:07:53 +000084 for (i = 0; i < mlast; i++) {
Antoine Pitrouf068f942010-01-13 14:19:12 +000085 STRINGLIB_BLOOM_ADD(mask, p[i]);
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +000086 if (p[i] == p[mlast])
87 skip = mlast - i - 1;
88 }
89 /* process pattern[-1] outside the loop */
Antoine Pitrouf068f942010-01-13 14:19:12 +000090 STRINGLIB_BLOOM_ADD(mask, p[mlast]);
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +000091
92 for (i = 0; i <= w; i++) {
93 /* note: using mlast in the skip path slows things down on x86 */
94 if (s[i+m-1] == p[m-1]) {
95 /* candidate match */
96 for (j = 0; j < mlast; j++)
97 if (s[i+j] != p[j])
98 break;
99 if (j == mlast) {
100 /* got a match! */
101 if (mode != FAST_COUNT)
102 return i;
103 count++;
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000104 if (count == maxcount)
105 return maxcount;
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000106 i = i + mlast;
107 continue;
108 }
109 /* miss: check if next character is part of pattern */
Antoine Pitrouf068f942010-01-13 14:19:12 +0000110 if (!STRINGLIB_BLOOM(mask, s[i+m]))
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000111 i = i + m;
112 else
113 i = i + skip;
114 } else {
115 /* skip: check if next character is part of pattern */
Antoine Pitrouf068f942010-01-13 14:19:12 +0000116 if (!STRINGLIB_BLOOM(mask, s[i+m]))
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000117 i = i + m;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000118 }
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000119 }
120 } else { /* FAST_RSEARCH */
121
122 /* create compressed boyer-moore delta 1 table */
123
124 /* process pattern[0] outside the loop */
Antoine Pitrouf068f942010-01-13 14:19:12 +0000125 STRINGLIB_BLOOM_ADD(mask, p[0]);
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000126 /* process pattern[:0:-1] */
127 for (i = mlast; i > 0; i--) {
Antoine Pitrouf068f942010-01-13 14:19:12 +0000128 STRINGLIB_BLOOM_ADD(mask, p[i]);
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000129 if (p[i] == p[0])
130 skip = i - 1;
131 }
132
133 for (i = w; i >= 0; i--) {
134 if (s[i] == p[0]) {
135 /* candidate match */
136 for (j = mlast; j > 0; j--)
137 if (s[i+j] != p[j])
138 break;
139 if (j == 0)
140 /* got a match! */
141 return i;
142 /* miss: check if previous character is part of pattern */
Antoine Pitrouf068f942010-01-13 14:19:12 +0000143 if (!STRINGLIB_BLOOM(mask, s[i-1]))
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000144 i = i - m;
145 else
146 i = i - skip;
147 } else {
148 /* skip: check if previous character is part of pattern */
Antoine Pitrouf068f942010-01-13 14:19:12 +0000149 if (!STRINGLIB_BLOOM(mask, s[i-1]))
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000150 i = i - m;
151 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000152 }
153 }
154
155 if (mode != FAST_COUNT)
156 return -1;
157 return count;
158}
159
160#endif