blob: 82f050a0da7fd4d41f280c8c92d4232339fff98f [file] [log] [blame]
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -07001use std::mem;
2
3use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
Joel Galenson38748082021-05-19 16:51:51 -07004use memchr::{memchr, memchr2, memchr3, memmem};
5use regex_syntax::hir::literal::{Literal, Literals};
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -07006
7/// A prefix extracted from a compiled regular expression.
8///
9/// A regex prefix is a set of literal strings that *must* be matched at the
10/// beginning of a regex in order for the entire regex to match. Similarly
11/// for a regex suffix.
12#[derive(Clone, Debug)]
13pub struct LiteralSearcher {
14 complete: bool,
Joel Galenson38748082021-05-19 16:51:51 -070015 lcp: Memmem,
16 lcs: Memmem,
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -070017 matcher: Matcher,
18}
19
20#[derive(Clone, Debug)]
21enum Matcher {
22 /// No literals. (Never advances through the input.)
23 Empty,
24 /// A set of four or more single byte literals.
25 Bytes(SingleByteSet),
Joel Galenson38748082021-05-19 16:51:51 -070026 /// A single substring, using vector accelerated routines when available.
27 Memmem(Memmem),
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -070028 /// An Aho-Corasick automaton.
29 AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
30 /// A packed multiple substring searcher, using SIMD.
31 ///
32 /// Note that Aho-Corasick will actually use this packed searcher
33 /// internally automatically, however, there is some overhead associated
34 /// with going through the Aho-Corasick machinery. So using the packed
35 /// searcher directly results in some gains.
36 Packed { s: packed::Searcher, lits: Vec<Literal> },
37}
38
39impl LiteralSearcher {
40 /// Returns a matcher that never matches and never advances the input.
41 pub fn empty() -> Self {
42 Self::new(Literals::empty(), Matcher::Empty)
43 }
44
45 /// Returns a matcher for literal prefixes from the given set.
46 pub fn prefixes(lits: Literals) -> Self {
47 let matcher = Matcher::prefixes(&lits);
48 Self::new(lits, matcher)
49 }
50
51 /// Returns a matcher for literal suffixes from the given set.
52 pub fn suffixes(lits: Literals) -> Self {
53 let matcher = Matcher::suffixes(&lits);
54 Self::new(lits, matcher)
55 }
56
57 fn new(lits: Literals, matcher: Matcher) -> Self {
58 let complete = lits.all_complete();
59 LiteralSearcher {
60 complete: complete,
Joel Galenson38748082021-05-19 16:51:51 -070061 lcp: Memmem::new(lits.longest_common_prefix()),
62 lcs: Memmem::new(lits.longest_common_suffix()),
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -070063 matcher: matcher,
64 }
65 }
66
67 /// Returns true if all matches comprise the entire regular expression.
68 ///
69 /// This does not necessarily mean that a literal match implies a match
Haibo Huang47619dd2021-01-08 17:05:43 -080070 /// of the regular expression. For example, the regular expression `^a`
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -070071 /// is comprised of a single complete literal `a`, but the regular
72 /// expression demands that it only match at the beginning of a string.
73 pub fn complete(&self) -> bool {
74 self.complete && !self.is_empty()
75 }
76
77 /// Find the position of a literal in `haystack` if it exists.
78 #[cfg_attr(feature = "perf-inline", inline(always))]
79 pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
80 use self::Matcher::*;
81 match self.matcher {
82 Empty => Some((0, 0)),
83 Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
Joel Galenson38748082021-05-19 16:51:51 -070084 Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -070085 AC { ref ac, .. } => {
86 ac.find(haystack).map(|m| (m.start(), m.end()))
87 }
88 Packed { ref s, .. } => {
89 s.find(haystack).map(|m| (m.start(), m.end()))
90 }
91 }
92 }
93
94 /// Like find, except matches must start at index `0`.
95 pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
96 for lit in self.iter() {
97 if lit.len() > haystack.len() {
98 continue;
99 }
100 if lit == &haystack[0..lit.len()] {
101 return Some((0, lit.len()));
102 }
103 }
104 None
105 }
106
107 /// Like find, except matches must end at index `haystack.len()`.
108 pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
109 for lit in self.iter() {
110 if lit.len() > haystack.len() {
111 continue;
112 }
113 if lit == &haystack[haystack.len() - lit.len()..] {
114 return Some((haystack.len() - lit.len(), haystack.len()));
115 }
116 }
117 None
118 }
119
120 /// Returns an iterator over all literals to be matched.
Joel Galenson38748082021-05-19 16:51:51 -0700121 pub fn iter(&self) -> LiteralIter<'_> {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700122 match self.matcher {
123 Matcher::Empty => LiteralIter::Empty,
124 Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
Joel Galenson38748082021-05-19 16:51:51 -0700125 Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700126 Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
127 Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
128 }
129 }
130
131 /// Returns a matcher for the longest common prefix of this matcher.
Joel Galenson38748082021-05-19 16:51:51 -0700132 pub fn lcp(&self) -> &Memmem {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700133 &self.lcp
134 }
135
136 /// Returns a matcher for the longest common suffix of this matcher.
Joel Galenson38748082021-05-19 16:51:51 -0700137 pub fn lcs(&self) -> &Memmem {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700138 &self.lcs
139 }
140
141 /// Returns true iff this prefix is empty.
142 pub fn is_empty(&self) -> bool {
143 self.len() == 0
144 }
145
146 /// Returns the number of prefixes in this machine.
147 pub fn len(&self) -> usize {
148 use self::Matcher::*;
149 match self.matcher {
150 Empty => 0,
151 Bytes(ref sset) => sset.dense.len(),
Joel Galenson38748082021-05-19 16:51:51 -0700152 Memmem(_) => 1,
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700153 AC { ref ac, .. } => ac.pattern_count(),
154 Packed { ref lits, .. } => lits.len(),
155 }
156 }
157
158 /// Return the approximate heap usage of literals in bytes.
159 pub fn approximate_size(&self) -> usize {
160 use self::Matcher::*;
161 match self.matcher {
162 Empty => 0,
163 Bytes(ref sset) => sset.approximate_size(),
Joel Galenson38748082021-05-19 16:51:51 -0700164 Memmem(ref single) => single.approximate_size(),
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700165 AC { ref ac, .. } => ac.heap_bytes(),
166 Packed { ref s, .. } => s.heap_bytes(),
167 }
168 }
169}
170
171impl Matcher {
172 fn prefixes(lits: &Literals) -> Self {
173 let sset = SingleByteSet::prefixes(lits);
174 Matcher::new(lits, sset)
175 }
176
177 fn suffixes(lits: &Literals) -> Self {
178 let sset = SingleByteSet::suffixes(lits);
179 Matcher::new(lits, sset)
180 }
181
182 fn new(lits: &Literals, sset: SingleByteSet) -> Self {
183 if lits.literals().is_empty() {
184 return Matcher::Empty;
185 }
186 if sset.dense.len() >= 26 {
187 // Avoid trying to match a large number of single bytes.
188 // This is *very* sensitive to a frequency analysis comparison
189 // between the bytes in sset and the composition of the haystack.
190 // No matter the size of sset, if its members all are rare in the
191 // haystack, then it'd be worth using it. How to tune this... IDK.
192 // ---AG
193 return Matcher::Empty;
194 }
195 if sset.complete {
196 return Matcher::Bytes(sset);
197 }
198 if lits.literals().len() == 1 {
Joel Galenson38748082021-05-19 16:51:51 -0700199 return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700200 }
201
202 let pats = lits.literals().to_owned();
203 let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
204 if lits.literals().len() <= 100 && !is_aho_corasick_fast {
205 let mut builder = packed::Config::new()
206 .match_kind(packed::MatchKind::LeftmostFirst)
207 .builder();
208 if let Some(s) = builder.extend(&pats).build() {
209 return Matcher::Packed { s, lits: pats };
210 }
211 }
212 let ac = AhoCorasickBuilder::new()
213 .match_kind(aho_corasick::MatchKind::LeftmostFirst)
214 .dfa(true)
215 .build_with_size::<u32, _, _>(&pats)
216 .unwrap();
217 Matcher::AC { ac, lits: pats }
218 }
219}
220
Haibo Huang47619dd2021-01-08 17:05:43 -0800221#[derive(Debug)]
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700222pub enum LiteralIter<'a> {
223 Empty,
224 Bytes(&'a [u8]),
225 Single(&'a [u8]),
226 AC(&'a [Literal]),
227 Packed(&'a [Literal]),
228}
229
230impl<'a> Iterator for LiteralIter<'a> {
231 type Item = &'a [u8];
232
233 fn next(&mut self) -> Option<Self::Item> {
234 match *self {
235 LiteralIter::Empty => None,
236 LiteralIter::Bytes(ref mut many) => {
237 if many.is_empty() {
238 None
239 } else {
240 let next = &many[0..1];
241 *many = &many[1..];
242 Some(next)
243 }
244 }
245 LiteralIter::Single(ref mut one) => {
246 if one.is_empty() {
247 None
248 } else {
249 let next = &one[..];
250 *one = &[];
251 Some(next)
252 }
253 }
254 LiteralIter::AC(ref mut lits) => {
255 if lits.is_empty() {
256 None
257 } else {
258 let next = &lits[0];
259 *lits = &lits[1..];
260 Some(&**next)
261 }
262 }
263 LiteralIter::Packed(ref mut lits) => {
264 if lits.is_empty() {
265 None
266 } else {
267 let next = &lits[0];
268 *lits = &lits[1..];
269 Some(&**next)
270 }
271 }
272 }
273 }
274}
275
276#[derive(Clone, Debug)]
277struct SingleByteSet {
278 sparse: Vec<bool>,
279 dense: Vec<u8>,
280 complete: bool,
281 all_ascii: bool,
282}
283
284impl SingleByteSet {
285 fn new() -> SingleByteSet {
286 SingleByteSet {
287 sparse: vec![false; 256],
288 dense: vec![],
289 complete: true,
290 all_ascii: true,
291 }
292 }
293
294 fn prefixes(lits: &Literals) -> SingleByteSet {
295 let mut sset = SingleByteSet::new();
296 for lit in lits.literals() {
297 sset.complete = sset.complete && lit.len() == 1;
298 if let Some(&b) = lit.get(0) {
299 if !sset.sparse[b as usize] {
300 if b > 0x7F {
301 sset.all_ascii = false;
302 }
303 sset.dense.push(b);
304 sset.sparse[b as usize] = true;
305 }
306 }
307 }
308 sset
309 }
310
311 fn suffixes(lits: &Literals) -> SingleByteSet {
312 let mut sset = SingleByteSet::new();
313 for lit in lits.literals() {
314 sset.complete = sset.complete && lit.len() == 1;
315 if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
316 if !sset.sparse[b as usize] {
317 if b > 0x7F {
318 sset.all_ascii = false;
319 }
320 sset.dense.push(b);
321 sset.sparse[b as usize] = true;
322 }
323 }
324 }
325 sset
326 }
327
328 /// Faster find that special cases certain sizes to use memchr.
329 #[cfg_attr(feature = "perf-inline", inline(always))]
330 fn find(&self, text: &[u8]) -> Option<usize> {
331 match self.dense.len() {
332 0 => None,
333 1 => memchr(self.dense[0], text),
334 2 => memchr2(self.dense[0], self.dense[1], text),
335 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
336 _ => self._find(text),
337 }
338 }
339
340 /// Generic find that works on any sized set.
341 fn _find(&self, haystack: &[u8]) -> Option<usize> {
342 for (i, &b) in haystack.iter().enumerate() {
343 if self.sparse[b as usize] {
344 return Some(i);
345 }
346 }
347 None
348 }
349
350 fn approximate_size(&self) -> usize {
351 (self.dense.len() * mem::size_of::<u8>())
352 + (self.sparse.len() * mem::size_of::<bool>())
353 }
354}
355
Joel Galenson38748082021-05-19 16:51:51 -0700356/// A simple wrapper around the memchr crate's memmem implementation.
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700357///
Joel Galenson38748082021-05-19 16:51:51 -0700358/// The API this exposes mirrors the API of previous substring searchers that
359/// this supplanted.
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700360#[derive(Clone, Debug)]
Joel Galenson38748082021-05-19 16:51:51 -0700361pub struct Memmem {
362 finder: memmem::Finder<'static>,
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700363 char_len: usize,
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700364}
365
Joel Galenson38748082021-05-19 16:51:51 -0700366impl Memmem {
367 fn new(pat: &[u8]) -> Memmem {
368 Memmem {
369 finder: memmem::Finder::new(pat).into_owned(),
370 char_len: char_len_lossy(pat),
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700371 }
372 }
373
374 #[cfg_attr(feature = "perf-inline", inline(always))]
375 pub fn find(&self, haystack: &[u8]) -> Option<usize> {
Joel Galenson38748082021-05-19 16:51:51 -0700376 self.finder.find(haystack)
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700377 }
378
379 #[cfg_attr(feature = "perf-inline", inline(always))]
380 pub fn is_suffix(&self, text: &[u8]) -> bool {
381 if text.len() < self.len() {
382 return false;
383 }
Joel Galenson38748082021-05-19 16:51:51 -0700384 &text[text.len() - self.len()..] == self.finder.needle()
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700385 }
386
387 pub fn len(&self) -> usize {
Joel Galenson38748082021-05-19 16:51:51 -0700388 self.finder.needle().len()
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700389 }
390
391 pub fn char_len(&self) -> usize {
392 self.char_len
393 }
394
395 fn approximate_size(&self) -> usize {
Joel Galenson38748082021-05-19 16:51:51 -0700396 self.finder.needle().len() * mem::size_of::<u8>()
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700397 }
398}
399
400fn char_len_lossy(bytes: &[u8]) -> usize {
401 String::from_utf8_lossy(bytes).chars().count()
402}