blob: e231c587e4764fe281861759d76de7a8da9ec891 [file] [log] [blame]
Fredrik Lundha50d2012006-05-26 17:04:58 +00001/* stringlib: fastsearch implementation */
2
3#ifndef STRINGLIB_FASTSEARCH_H
4#define STRINGLIB_FASTSEARCH_H
5
6/* fast search/count implementation, based on a mix between boyer-
7 moore and horspool, with a few more bells and whistles on the top.
Antoine Pitrou5b7139a2010-01-02 21:12:58 +00008 for some more background, see: http://effbot.org/zone/stringlib.htm */
Fredrik Lundha50d2012006-05-26 17:04:58 +00009
10/* note: fastsearch may access s[n], which isn't a problem when using
11 Python's ordinary string types, but may cause problems if you're
12 using this code in other contexts. also, the count mode returns -1
13 if there cannot possible be a match in the target string, and 0 if
14 it has actually checked for matches, but didn't find any. callers
15 beware! */
16
17#define FAST_COUNT 0
18#define FAST_SEARCH 1
Antoine Pitrou5b7139a2010-01-02 21:12:58 +000019#define FAST_RSEARCH 2
Fredrik Lundha50d2012006-05-26 17:04:58 +000020
Antoine Pitrou10042922010-01-13 14:01:26 +000021#if LONG_BIT >= 128
22#define STRINGLIB_BLOOM_WIDTH 128
23#elif LONG_BIT >= 64
24#define STRINGLIB_BLOOM_WIDTH 64
25#elif LONG_BIT >= 32
26#define STRINGLIB_BLOOM_WIDTH 32
27#else
28#error "LONG_BIT is smaller than 32"
29#endif
30
31#define STRINGLIB_BLOOM_ADD(mask, ch) \
32 ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
33#define STRINGLIB_BLOOM(mask, ch) \
34 ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
Antoine Pitrou64672132010-01-13 07:55:48 +000035
Fredrik Lundhc2d29c52006-05-27 14:58:20 +000036Py_LOCAL_INLINE(Py_ssize_t)
Fredrik Lundha50d2012006-05-26 17:04:58 +000037fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
38 const STRINGLIB_CHAR* p, Py_ssize_t m,
Antoine Pitrou64672132010-01-13 07:55:48 +000039 Py_ssize_t maxcount, int mode)
Fredrik Lundha50d2012006-05-26 17:04:58 +000040{
Antoine Pitrou10042922010-01-13 14:01:26 +000041 unsigned long mask;
Fredrik Lundha50d2012006-05-26 17:04:58 +000042 Py_ssize_t skip, count = 0;
43 Py_ssize_t i, j, mlast, w;
44
45 w = n - m;
46
Antoine Pitrou64672132010-01-13 07:55:48 +000047 if (w < 0 || (mode == FAST_COUNT && maxcount == 0))
Fredrik Lundha50d2012006-05-26 17:04:58 +000048 return -1;
49
50 /* look for special cases */
51 if (m <= 1) {
52 if (m <= 0)
53 return -1;
54 /* use special case for 1-character strings */
55 if (mode == FAST_COUNT) {
56 for (i = 0; i < n; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +000057 if (s[i] == p[0]) {
Fredrik Lundha50d2012006-05-26 17:04:58 +000058 count++;
Antoine Pitrou64672132010-01-13 07:55:48 +000059 if (count == maxcount)
60 return maxcount;
61 }
Fredrik Lundha50d2012006-05-26 17:04:58 +000062 return count;
Antoine Pitrou5b7139a2010-01-02 21:12:58 +000063 } else if (mode == FAST_SEARCH) {
Fredrik Lundha50d2012006-05-26 17:04:58 +000064 for (i = 0; i < n; i++)
65 if (s[i] == p[0])
66 return i;
Antoine Pitrou5b7139a2010-01-02 21:12:58 +000067 } else { /* FAST_RSEARCH */
68 for (i = n - 1; i > -1; i--)
69 if (s[i] == p[0])
70 return i;
Fredrik Lundha50d2012006-05-26 17:04:58 +000071 }
72 return -1;
73 }
74
75 mlast = m - 1;
Fredrik Lundha50d2012006-05-26 17:04:58 +000076 skip = mlast - 1;
Antoine Pitrou64672132010-01-13 07:55:48 +000077 mask = 0;
Fredrik Lundha50d2012006-05-26 17:04:58 +000078
Antoine Pitrou5b7139a2010-01-02 21:12:58 +000079 if (mode != FAST_RSEARCH) {
80
81 /* create compressed boyer-moore delta 1 table */
82
83 /* process pattern[:-1] */
Antoine Pitrou64672132010-01-13 07:55:48 +000084 for (i = 0; i < mlast; i++) {
Antoine Pitrou10042922010-01-13 14:01:26 +000085 STRINGLIB_BLOOM_ADD(mask, p[i]);
Antoine Pitrou5b7139a2010-01-02 21:12:58 +000086 if (p[i] == p[mlast])
87 skip = mlast - i - 1;
88 }
89 /* process pattern[-1] outside the loop */
Antoine Pitrou10042922010-01-13 14:01:26 +000090 STRINGLIB_BLOOM_ADD(mask, p[mlast]);
Antoine Pitrou5b7139a2010-01-02 21:12:58 +000091
92 for (i = 0; i <= w; i++) {
93 /* note: using mlast in the skip path slows things down on x86 */
94 if (s[i+m-1] == p[m-1]) {
95 /* candidate match */
96 for (j = 0; j < mlast; j++)
97 if (s[i+j] != p[j])
98 break;
99 if (j == mlast) {
100 /* got a match! */
101 if (mode != FAST_COUNT)
102 return i;
103 count++;
Antoine Pitrou64672132010-01-13 07:55:48 +0000104 if (count == maxcount)
105 return maxcount;
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000106 i = i + mlast;
107 continue;
108 }
109 /* miss: check if next character is part of pattern */
Antoine Pitrou10042922010-01-13 14:01:26 +0000110 if (!STRINGLIB_BLOOM(mask, s[i+m]))
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000111 i = i + m;
112 else
113 i = i + skip;
114 } else {
115 /* skip: check if next character is part of pattern */
Antoine Pitrou10042922010-01-13 14:01:26 +0000116 if (!STRINGLIB_BLOOM(mask, s[i+m]))
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000117 i = i + m;
Fredrik Lundha50d2012006-05-26 17:04:58 +0000118 }
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000119 }
120 } else { /* FAST_RSEARCH */
121
122 /* create compressed boyer-moore delta 1 table */
123
124 /* process pattern[0] outside the loop */
Antoine Pitrou10042922010-01-13 14:01:26 +0000125 STRINGLIB_BLOOM_ADD(mask, p[0]);
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000126 /* process pattern[:0:-1] */
127 for (i = mlast; i > 0; i--) {
Antoine Pitrou10042922010-01-13 14:01:26 +0000128 STRINGLIB_BLOOM_ADD(mask, p[i]);
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000129 if (p[i] == p[0])
130 skip = i - 1;
131 }
132
133 for (i = w; i >= 0; i--) {
134 if (s[i] == p[0]) {
135 /* candidate match */
136 for (j = mlast; j > 0; j--)
137 if (s[i+j] != p[j])
138 break;
139 if (j == 0)
140 /* got a match! */
141 return i;
142 /* miss: check if previous character is part of pattern */
Florent Xicluna172e15f2010-08-09 20:02:00 +0000143 if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000144 i = i - m;
145 else
146 i = i - skip;
147 } else {
148 /* skip: check if previous character is part of pattern */
Florent Xicluna172e15f2010-08-09 20:02:00 +0000149 if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
Antoine Pitrou5b7139a2010-01-02 21:12:58 +0000150 i = i - m;
151 }
Fredrik Lundha50d2012006-05-26 17:04:58 +0000152 }
153 }
154
155 if (mode != FAST_COUNT)
156 return -1;
157 return count;
158}
159
160#endif