Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 1 | use std::mem; |
| 2 | |
| 3 | use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 4 | use memchr::{memchr, memchr2, memchr3, memmem}; |
| 5 | use regex_syntax::hir::literal::{Literal, Literals}; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 6 | |
| 7 | /// A prefix extracted from a compiled regular expression. |
| 8 | /// |
| 9 | /// A regex prefix is a set of literal strings that *must* be matched at the |
| 10 | /// beginning of a regex in order for the entire regex to match. Similarly |
| 11 | /// for a regex suffix. |
| 12 | #[derive(Clone, Debug)] |
| 13 | pub struct LiteralSearcher { |
| 14 | complete: bool, |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 15 | lcp: Memmem, |
| 16 | lcs: Memmem, |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 17 | matcher: Matcher, |
| 18 | } |
| 19 | |
| 20 | #[derive(Clone, Debug)] |
| 21 | enum Matcher { |
| 22 | /// No literals. (Never advances through the input.) |
| 23 | Empty, |
| 24 | /// A set of four or more single byte literals. |
| 25 | Bytes(SingleByteSet), |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 26 | /// A single substring, using vector accelerated routines when available. |
| 27 | Memmem(Memmem), |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 28 | /// An Aho-Corasick automaton. |
| 29 | AC { ac: AhoCorasick<u32>, lits: Vec<Literal> }, |
| 30 | /// A packed multiple substring searcher, using SIMD. |
| 31 | /// |
| 32 | /// Note that Aho-Corasick will actually use this packed searcher |
| 33 | /// internally automatically, however, there is some overhead associated |
| 34 | /// with going through the Aho-Corasick machinery. So using the packed |
| 35 | /// searcher directly results in some gains. |
| 36 | Packed { s: packed::Searcher, lits: Vec<Literal> }, |
| 37 | } |
| 38 | |
| 39 | impl LiteralSearcher { |
| 40 | /// Returns a matcher that never matches and never advances the input. |
| 41 | pub fn empty() -> Self { |
| 42 | Self::new(Literals::empty(), Matcher::Empty) |
| 43 | } |
| 44 | |
| 45 | /// Returns a matcher for literal prefixes from the given set. |
| 46 | pub fn prefixes(lits: Literals) -> Self { |
| 47 | let matcher = Matcher::prefixes(&lits); |
| 48 | Self::new(lits, matcher) |
| 49 | } |
| 50 | |
| 51 | /// Returns a matcher for literal suffixes from the given set. |
| 52 | pub fn suffixes(lits: Literals) -> Self { |
| 53 | let matcher = Matcher::suffixes(&lits); |
| 54 | Self::new(lits, matcher) |
| 55 | } |
| 56 | |
| 57 | fn new(lits: Literals, matcher: Matcher) -> Self { |
| 58 | let complete = lits.all_complete(); |
| 59 | LiteralSearcher { |
| 60 | complete: complete, |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 61 | lcp: Memmem::new(lits.longest_common_prefix()), |
| 62 | lcs: Memmem::new(lits.longest_common_suffix()), |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 63 | matcher: matcher, |
| 64 | } |
| 65 | } |
| 66 | |
| 67 | /// Returns true if all matches comprise the entire regular expression. |
| 68 | /// |
| 69 | /// This does not necessarily mean that a literal match implies a match |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 70 | /// of the regular expression. For example, the regular expression `^a` |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 71 | /// is comprised of a single complete literal `a`, but the regular |
| 72 | /// expression demands that it only match at the beginning of a string. |
| 73 | pub fn complete(&self) -> bool { |
| 74 | self.complete && !self.is_empty() |
| 75 | } |
| 76 | |
| 77 | /// Find the position of a literal in `haystack` if it exists. |
| 78 | #[cfg_attr(feature = "perf-inline", inline(always))] |
| 79 | pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> { |
| 80 | use self::Matcher::*; |
| 81 | match self.matcher { |
| 82 | Empty => Some((0, 0)), |
| 83 | Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 84 | Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())), |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 85 | AC { ref ac, .. } => { |
| 86 | ac.find(haystack).map(|m| (m.start(), m.end())) |
| 87 | } |
| 88 | Packed { ref s, .. } => { |
| 89 | s.find(haystack).map(|m| (m.start(), m.end())) |
| 90 | } |
| 91 | } |
| 92 | } |
| 93 | |
| 94 | /// Like find, except matches must start at index `0`. |
| 95 | pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> { |
| 96 | for lit in self.iter() { |
| 97 | if lit.len() > haystack.len() { |
| 98 | continue; |
| 99 | } |
| 100 | if lit == &haystack[0..lit.len()] { |
| 101 | return Some((0, lit.len())); |
| 102 | } |
| 103 | } |
| 104 | None |
| 105 | } |
| 106 | |
| 107 | /// Like find, except matches must end at index `haystack.len()`. |
| 108 | pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> { |
| 109 | for lit in self.iter() { |
| 110 | if lit.len() > haystack.len() { |
| 111 | continue; |
| 112 | } |
| 113 | if lit == &haystack[haystack.len() - lit.len()..] { |
| 114 | return Some((haystack.len() - lit.len(), haystack.len())); |
| 115 | } |
| 116 | } |
| 117 | None |
| 118 | } |
| 119 | |
| 120 | /// Returns an iterator over all literals to be matched. |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 121 | pub fn iter(&self) -> LiteralIter<'_> { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 122 | match self.matcher { |
| 123 | Matcher::Empty => LiteralIter::Empty, |
| 124 | Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 125 | Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()), |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 126 | Matcher::AC { ref lits, .. } => LiteralIter::AC(lits), |
| 127 | Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits), |
| 128 | } |
| 129 | } |
| 130 | |
| 131 | /// Returns a matcher for the longest common prefix of this matcher. |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 132 | pub fn lcp(&self) -> &Memmem { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 133 | &self.lcp |
| 134 | } |
| 135 | |
| 136 | /// Returns a matcher for the longest common suffix of this matcher. |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 137 | pub fn lcs(&self) -> &Memmem { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 138 | &self.lcs |
| 139 | } |
| 140 | |
| 141 | /// Returns true iff this prefix is empty. |
| 142 | pub fn is_empty(&self) -> bool { |
| 143 | self.len() == 0 |
| 144 | } |
| 145 | |
| 146 | /// Returns the number of prefixes in this machine. |
| 147 | pub fn len(&self) -> usize { |
| 148 | use self::Matcher::*; |
| 149 | match self.matcher { |
| 150 | Empty => 0, |
| 151 | Bytes(ref sset) => sset.dense.len(), |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 152 | Memmem(_) => 1, |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 153 | AC { ref ac, .. } => ac.pattern_count(), |
| 154 | Packed { ref lits, .. } => lits.len(), |
| 155 | } |
| 156 | } |
| 157 | |
| 158 | /// Return the approximate heap usage of literals in bytes. |
| 159 | pub fn approximate_size(&self) -> usize { |
| 160 | use self::Matcher::*; |
| 161 | match self.matcher { |
| 162 | Empty => 0, |
| 163 | Bytes(ref sset) => sset.approximate_size(), |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 164 | Memmem(ref single) => single.approximate_size(), |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 165 | AC { ref ac, .. } => ac.heap_bytes(), |
| 166 | Packed { ref s, .. } => s.heap_bytes(), |
| 167 | } |
| 168 | } |
| 169 | } |
| 170 | |
| 171 | impl Matcher { |
| 172 | fn prefixes(lits: &Literals) -> Self { |
| 173 | let sset = SingleByteSet::prefixes(lits); |
| 174 | Matcher::new(lits, sset) |
| 175 | } |
| 176 | |
| 177 | fn suffixes(lits: &Literals) -> Self { |
| 178 | let sset = SingleByteSet::suffixes(lits); |
| 179 | Matcher::new(lits, sset) |
| 180 | } |
| 181 | |
| 182 | fn new(lits: &Literals, sset: SingleByteSet) -> Self { |
| 183 | if lits.literals().is_empty() { |
| 184 | return Matcher::Empty; |
| 185 | } |
| 186 | if sset.dense.len() >= 26 { |
| 187 | // Avoid trying to match a large number of single bytes. |
| 188 | // This is *very* sensitive to a frequency analysis comparison |
| 189 | // between the bytes in sset and the composition of the haystack. |
| 190 | // No matter the size of sset, if its members all are rare in the |
| 191 | // haystack, then it'd be worth using it. How to tune this... IDK. |
| 192 | // ---AG |
| 193 | return Matcher::Empty; |
| 194 | } |
| 195 | if sset.complete { |
| 196 | return Matcher::Bytes(sset); |
| 197 | } |
| 198 | if lits.literals().len() == 1 { |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 199 | return Matcher::Memmem(Memmem::new(&lits.literals()[0])); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 200 | } |
| 201 | |
| 202 | let pats = lits.literals().to_owned(); |
| 203 | let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; |
| 204 | if lits.literals().len() <= 100 && !is_aho_corasick_fast { |
| 205 | let mut builder = packed::Config::new() |
| 206 | .match_kind(packed::MatchKind::LeftmostFirst) |
| 207 | .builder(); |
| 208 | if let Some(s) = builder.extend(&pats).build() { |
| 209 | return Matcher::Packed { s, lits: pats }; |
| 210 | } |
| 211 | } |
| 212 | let ac = AhoCorasickBuilder::new() |
| 213 | .match_kind(aho_corasick::MatchKind::LeftmostFirst) |
| 214 | .dfa(true) |
| 215 | .build_with_size::<u32, _, _>(&pats) |
| 216 | .unwrap(); |
| 217 | Matcher::AC { ac, lits: pats } |
| 218 | } |
| 219 | } |
| 220 | |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 221 | #[derive(Debug)] |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 222 | pub enum LiteralIter<'a> { |
| 223 | Empty, |
| 224 | Bytes(&'a [u8]), |
| 225 | Single(&'a [u8]), |
| 226 | AC(&'a [Literal]), |
| 227 | Packed(&'a [Literal]), |
| 228 | } |
| 229 | |
| 230 | impl<'a> Iterator for LiteralIter<'a> { |
| 231 | type Item = &'a [u8]; |
| 232 | |
| 233 | fn next(&mut self) -> Option<Self::Item> { |
| 234 | match *self { |
| 235 | LiteralIter::Empty => None, |
| 236 | LiteralIter::Bytes(ref mut many) => { |
| 237 | if many.is_empty() { |
| 238 | None |
| 239 | } else { |
| 240 | let next = &many[0..1]; |
| 241 | *many = &many[1..]; |
| 242 | Some(next) |
| 243 | } |
| 244 | } |
| 245 | LiteralIter::Single(ref mut one) => { |
| 246 | if one.is_empty() { |
| 247 | None |
| 248 | } else { |
| 249 | let next = &one[..]; |
| 250 | *one = &[]; |
| 251 | Some(next) |
| 252 | } |
| 253 | } |
| 254 | LiteralIter::AC(ref mut lits) => { |
| 255 | if lits.is_empty() { |
| 256 | None |
| 257 | } else { |
| 258 | let next = &lits[0]; |
| 259 | *lits = &lits[1..]; |
| 260 | Some(&**next) |
| 261 | } |
| 262 | } |
| 263 | LiteralIter::Packed(ref mut lits) => { |
| 264 | if lits.is_empty() { |
| 265 | None |
| 266 | } else { |
| 267 | let next = &lits[0]; |
| 268 | *lits = &lits[1..]; |
| 269 | Some(&**next) |
| 270 | } |
| 271 | } |
| 272 | } |
| 273 | } |
| 274 | } |
| 275 | |
| 276 | #[derive(Clone, Debug)] |
| 277 | struct SingleByteSet { |
| 278 | sparse: Vec<bool>, |
| 279 | dense: Vec<u8>, |
| 280 | complete: bool, |
| 281 | all_ascii: bool, |
| 282 | } |
| 283 | |
| 284 | impl SingleByteSet { |
| 285 | fn new() -> SingleByteSet { |
| 286 | SingleByteSet { |
| 287 | sparse: vec![false; 256], |
| 288 | dense: vec![], |
| 289 | complete: true, |
| 290 | all_ascii: true, |
| 291 | } |
| 292 | } |
| 293 | |
| 294 | fn prefixes(lits: &Literals) -> SingleByteSet { |
| 295 | let mut sset = SingleByteSet::new(); |
| 296 | for lit in lits.literals() { |
| 297 | sset.complete = sset.complete && lit.len() == 1; |
| 298 | if let Some(&b) = lit.get(0) { |
| 299 | if !sset.sparse[b as usize] { |
| 300 | if b > 0x7F { |
| 301 | sset.all_ascii = false; |
| 302 | } |
| 303 | sset.dense.push(b); |
| 304 | sset.sparse[b as usize] = true; |
| 305 | } |
| 306 | } |
| 307 | } |
| 308 | sset |
| 309 | } |
| 310 | |
| 311 | fn suffixes(lits: &Literals) -> SingleByteSet { |
| 312 | let mut sset = SingleByteSet::new(); |
| 313 | for lit in lits.literals() { |
| 314 | sset.complete = sset.complete && lit.len() == 1; |
| 315 | if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) { |
| 316 | if !sset.sparse[b as usize] { |
| 317 | if b > 0x7F { |
| 318 | sset.all_ascii = false; |
| 319 | } |
| 320 | sset.dense.push(b); |
| 321 | sset.sparse[b as usize] = true; |
| 322 | } |
| 323 | } |
| 324 | } |
| 325 | sset |
| 326 | } |
| 327 | |
| 328 | /// Faster find that special cases certain sizes to use memchr. |
| 329 | #[cfg_attr(feature = "perf-inline", inline(always))] |
| 330 | fn find(&self, text: &[u8]) -> Option<usize> { |
| 331 | match self.dense.len() { |
| 332 | 0 => None, |
| 333 | 1 => memchr(self.dense[0], text), |
| 334 | 2 => memchr2(self.dense[0], self.dense[1], text), |
| 335 | 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text), |
| 336 | _ => self._find(text), |
| 337 | } |
| 338 | } |
| 339 | |
| 340 | /// Generic find that works on any sized set. |
| 341 | fn _find(&self, haystack: &[u8]) -> Option<usize> { |
| 342 | for (i, &b) in haystack.iter().enumerate() { |
| 343 | if self.sparse[b as usize] { |
| 344 | return Some(i); |
| 345 | } |
| 346 | } |
| 347 | None |
| 348 | } |
| 349 | |
| 350 | fn approximate_size(&self) -> usize { |
| 351 | (self.dense.len() * mem::size_of::<u8>()) |
| 352 | + (self.sparse.len() * mem::size_of::<bool>()) |
| 353 | } |
| 354 | } |
| 355 | |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 356 | /// A simple wrapper around the memchr crate's memmem implementation. |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 357 | /// |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 358 | /// The API this exposes mirrors the API of previous substring searchers that |
| 359 | /// this supplanted. |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 360 | #[derive(Clone, Debug)] |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 361 | pub struct Memmem { |
| 362 | finder: memmem::Finder<'static>, |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 363 | char_len: usize, |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 364 | } |
| 365 | |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 366 | impl Memmem { |
| 367 | fn new(pat: &[u8]) -> Memmem { |
| 368 | Memmem { |
| 369 | finder: memmem::Finder::new(pat).into_owned(), |
| 370 | char_len: char_len_lossy(pat), |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 371 | } |
| 372 | } |
| 373 | |
| 374 | #[cfg_attr(feature = "perf-inline", inline(always))] |
| 375 | pub fn find(&self, haystack: &[u8]) -> Option<usize> { |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 376 | self.finder.find(haystack) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 377 | } |
| 378 | |
| 379 | #[cfg_attr(feature = "perf-inline", inline(always))] |
| 380 | pub fn is_suffix(&self, text: &[u8]) -> bool { |
| 381 | if text.len() < self.len() { |
| 382 | return false; |
| 383 | } |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 384 | &text[text.len() - self.len()..] == self.finder.needle() |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 385 | } |
| 386 | |
| 387 | pub fn len(&self) -> usize { |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 388 | self.finder.needle().len() |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 389 | } |
| 390 | |
| 391 | pub fn char_len(&self) -> usize { |
| 392 | self.char_len |
| 393 | } |
| 394 | |
| 395 | fn approximate_size(&self) -> usize { |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 396 | self.finder.needle().len() * mem::size_of::<u8>() |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 397 | } |
| 398 | } |
| 399 | |
| 400 | fn char_len_lossy(bytes: &[u8]) -> usize { |
| 401 | String::from_utf8_lossy(bytes).chars().count() |
| 402 | } |