blob: d35cba31c10bfb5b04c9eb88c283f4bf40278e3b [file] [log] [blame]
Thomas Wouters477c8d52006-05-27 19:21:47 +00001/* stringlib: fastsearch implementation */
2
Thomas Wouters477c8d52006-05-27 19:21:47 +00003#define STRINGLIB_FASTSEARCH_H
4
5/* fast search/count implementation, based on a mix between boyer-
6 moore and horspool, with a few more bells and whistles on the top.
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +00007 for some more background, see: http://effbot.org/zone/stringlib.htm */
Thomas Wouters477c8d52006-05-27 19:21:47 +00008
9/* note: fastsearch may access s[n], which isn't a problem when using
10 Python's ordinary string types, but may cause problems if you're
11 using this code in other contexts. also, the count mode returns -1
12 if there cannot possible be a match in the target string, and 0 if
13 it has actually checked for matches, but didn't find any. callers
14 beware! */
15
16#define FAST_COUNT 0
17#define FAST_SEARCH 1
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +000018#define FAST_RSEARCH 2
Thomas Wouters477c8d52006-05-27 19:21:47 +000019
Antoine Pitrouf068f942010-01-13 14:19:12 +000020#if LONG_BIT >= 128
21#define STRINGLIB_BLOOM_WIDTH 128
22#elif LONG_BIT >= 64
23#define STRINGLIB_BLOOM_WIDTH 64
24#elif LONG_BIT >= 32
25#define STRINGLIB_BLOOM_WIDTH 32
26#else
27#error "LONG_BIT is smaller than 32"
28#endif
29
30#define STRINGLIB_BLOOM_ADD(mask, ch) \
31 ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
32#define STRINGLIB_BLOOM(mask, ch) \
33 ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
Antoine Pitrouf2c54842010-01-13 08:07:53 +000034
Thomas Wouters477c8d52006-05-27 19:21:47 +000035Py_LOCAL_INLINE(Py_ssize_t)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020036FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
Thomas Wouters477c8d52006-05-27 19:21:47 +000037 const STRINGLIB_CHAR* p, Py_ssize_t m,
Antoine Pitrouf2c54842010-01-13 08:07:53 +000038 Py_ssize_t maxcount, int mode)
Thomas Wouters477c8d52006-05-27 19:21:47 +000039{
Antoine Pitrouf068f942010-01-13 14:19:12 +000040 unsigned long mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +000041 Py_ssize_t skip, count = 0;
42 Py_ssize_t i, j, mlast, w;
43
44 w = n - m;
45
Antoine Pitrouf2c54842010-01-13 08:07:53 +000046 if (w < 0 || (mode == FAST_COUNT && maxcount == 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +000047 return -1;
48
49 /* look for special cases */
50 if (m <= 1) {
51 if (m <= 0)
52 return -1;
53 /* use special case for 1-character strings */
54 if (mode == FAST_COUNT) {
55 for (i = 0; i < n; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000056 if (s[i] == p[0]) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000057 count++;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000058 if (count == maxcount)
59 return maxcount;
60 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000061 return count;
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +000062 } else if (mode == FAST_SEARCH) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000063 for (i = 0; i < n; i++)
64 if (s[i] == p[0])
65 return i;
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +000066 } else { /* FAST_RSEARCH */
67 for (i = n - 1; i > -1; i--)
68 if (s[i] == p[0])
69 return i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000070 }
71 return -1;
72 }
73
74 mlast = m - 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000075 skip = mlast - 1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000076 mask = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000077
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +000078 if (mode != FAST_RSEARCH) {
79
80 /* create compressed boyer-moore delta 1 table */
81
82 /* process pattern[:-1] */
Antoine Pitrouf2c54842010-01-13 08:07:53 +000083 for (i = 0; i < mlast; i++) {
Antoine Pitrouf068f942010-01-13 14:19:12 +000084 STRINGLIB_BLOOM_ADD(mask, p[i]);
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +000085 if (p[i] == p[mlast])
86 skip = mlast - i - 1;
87 }
88 /* process pattern[-1] outside the loop */
Antoine Pitrouf068f942010-01-13 14:19:12 +000089 STRINGLIB_BLOOM_ADD(mask, p[mlast]);
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +000090
91 for (i = 0; i <= w; i++) {
92 /* note: using mlast in the skip path slows things down on x86 */
93 if (s[i+m-1] == p[m-1]) {
94 /* candidate match */
95 for (j = 0; j < mlast; j++)
96 if (s[i+j] != p[j])
97 break;
98 if (j == mlast) {
99 /* got a match! */
100 if (mode != FAST_COUNT)
101 return i;
102 count++;
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000103 if (count == maxcount)
104 return maxcount;
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000105 i = i + mlast;
106 continue;
107 }
108 /* miss: check if next character is part of pattern */
Antoine Pitrouf068f942010-01-13 14:19:12 +0000109 if (!STRINGLIB_BLOOM(mask, s[i+m]))
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000110 i = i + m;
111 else
112 i = i + skip;
113 } else {
114 /* skip: check if next character is part of pattern */
Antoine Pitrouf068f942010-01-13 14:19:12 +0000115 if (!STRINGLIB_BLOOM(mask, s[i+m]))
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000116 i = i + m;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000117 }
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000118 }
119 } else { /* FAST_RSEARCH */
120
121 /* create compressed boyer-moore delta 1 table */
122
123 /* process pattern[0] outside the loop */
Antoine Pitrouf068f942010-01-13 14:19:12 +0000124 STRINGLIB_BLOOM_ADD(mask, p[0]);
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000125 /* process pattern[:0:-1] */
126 for (i = mlast; i > 0; i--) {
Antoine Pitrouf068f942010-01-13 14:19:12 +0000127 STRINGLIB_BLOOM_ADD(mask, p[i]);
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000128 if (p[i] == p[0])
129 skip = i - 1;
130 }
131
132 for (i = w; i >= 0; i--) {
133 if (s[i] == p[0]) {
134 /* candidate match */
135 for (j = mlast; j > 0; j--)
136 if (s[i+j] != p[j])
137 break;
138 if (j == 0)
139 /* got a match! */
140 return i;
141 /* miss: check if previous character is part of pattern */
Florent Xiclunaeb6f3ea2010-08-08 22:07:16 +0000142 if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000143 i = i - m;
144 else
145 i = i - skip;
146 } else {
147 /* skip: check if previous character is part of pattern */
Florent Xiclunaeb6f3ea2010-08-08 22:07:16 +0000148 if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
Antoine Pitrouda2ecaf2010-01-02 21:40:36 +0000149 i = i - m;
150 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000151 }
152 }
153
154 if (mode != FAST_COUNT)
155 return -1;
156 return count;
157}
158