| use std::cell::RefCell; |
| use std::collections::HashMap; |
| use std::sync::Arc; |
| |
| #[cfg(feature = "perf-literal")] |
| use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; |
| use syntax::hir::literal::Literals; |
| use syntax::hir::Hir; |
| use syntax::ParserBuilder; |
| |
| use backtrack; |
| use cache::{Cached, CachedGuard}; |
| use compile::Compiler; |
| #[cfg(feature = "perf-dfa")] |
| use dfa; |
| use error::Error; |
| use input::{ByteInput, CharInput}; |
| use literal::LiteralSearcher; |
| use pikevm; |
| use prog::Program; |
| use re_builder::RegexOptions; |
| use re_bytes; |
| use re_set; |
| use re_trait::{Locations, RegularExpression, Slot}; |
| use re_unicode; |
| use utf8::next_utf8; |
| |
| /// `Exec` manages the execution of a regular expression. |
| /// |
| /// In particular, this manages the various compiled forms of a single regular |
| /// expression and the choice of which matching engine to use to execute a |
| /// regular expression. |
| pub struct Exec { |
| /// All read only state. |
| ro: Arc<ExecReadOnly>, |
| /// Caches for the various matching engines. |
| cache: Cached<ProgramCache>, |
| } |
| |
| /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This |
| /// means it is no longer Sync, but we can now avoid the overhead of |
| /// synchronization to fetch the cache. |
| #[derive(Debug)] |
| pub struct ExecNoSync<'c> { |
| /// All read only state. |
| ro: &'c Arc<ExecReadOnly>, |
| /// Caches for the various matching engines. |
| cache: CachedGuard<'c, ProgramCache>, |
| } |
| |
| /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8]. |
| pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); |
| |
| /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such |
| /// state is determined at compile time and never changes during search. |
| #[derive(Debug)] |
| struct ExecReadOnly { |
| /// The original regular expressions given by the caller to compile. |
| res: Vec<String>, |
| /// A compiled program that is used in the NFA simulation and backtracking. |
| /// It can be byte-based or Unicode codepoint based. |
| /// |
| /// N.B. It is not possibly to make this byte-based from the public API. |
| /// It is only used for testing byte based programs in the NFA simulations. |
| nfa: Program, |
| /// A compiled byte based program for DFA execution. This is only used |
| /// if a DFA can be executed. (Currently, only word boundary assertions are |
| /// not supported.) Note that this program contains an embedded `.*?` |
| /// preceding the first capture group, unless the regex is anchored at the |
| /// beginning. |
| dfa: Program, |
| /// The same as above, except the program is reversed (and there is no |
| /// preceding `.*?`). This is used by the DFA to find the starting location |
| /// of matches. |
| dfa_reverse: Program, |
| /// A set of suffix literals extracted from the regex. |
| /// |
| /// Prefix literals are stored on the `Program`, since they are used inside |
| /// the matching engines. |
| suffixes: LiteralSearcher, |
| /// An Aho-Corasick automaton with leftmost-first match semantics. |
| /// |
| /// This is only set when the entire regex is a simple unanchored |
| /// alternation of literals. We could probably use it more circumstances, |
| /// but this is already hacky enough in this architecture. |
| /// |
| /// N.B. We use u32 as a state ID representation under the assumption that |
| /// if we were to exhaust the ID space, we probably would have long |
| /// surpassed the compilation size limit. |
| #[cfg(feature = "perf-literal")] |
| ac: Option<AhoCorasick<u32>>, |
| /// match_type encodes as much upfront knowledge about how we're going to |
| /// execute a search as possible. |
| match_type: MatchType, |
| } |
| |
| /// Facilitates the construction of an executor by exposing various knobs |
| /// to control how a regex is executed and what kinds of resources it's |
| /// permitted to use. |
| pub struct ExecBuilder { |
| options: RegexOptions, |
| match_type: Option<MatchType>, |
| bytes: bool, |
| only_utf8: bool, |
| } |
| |
| /// Parsed represents a set of parsed regular expressions and their detected |
| /// literals. |
| struct Parsed { |
| exprs: Vec<Hir>, |
| prefixes: Literals, |
| suffixes: Literals, |
| bytes: bool, |
| } |
| |
| impl ExecBuilder { |
| /// Create a regex execution builder. |
| /// |
| /// This uses default settings for everything except the regex itself, |
| /// which must be provided. Further knobs can be set by calling methods, |
| /// and then finally, `build` to actually create the executor. |
| pub fn new(re: &str) -> Self { |
| Self::new_many(&[re]) |
| } |
| |
| /// Like new, but compiles the union of the given regular expressions. |
| /// |
| /// Note that when compiling 2 or more regular expressions, capture groups |
| /// are completely unsupported. (This means both `find` and `captures` |
| /// wont work.) |
| pub fn new_many<I, S>(res: I) -> Self |
| where |
| S: AsRef<str>, |
| I: IntoIterator<Item = S>, |
| { |
| let mut opts = RegexOptions::default(); |
| opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect(); |
| Self::new_options(opts) |
| } |
| |
| /// Create a regex execution builder. |
| pub fn new_options(opts: RegexOptions) -> Self { |
| ExecBuilder { |
| options: opts, |
| match_type: None, |
| bytes: false, |
| only_utf8: true, |
| } |
| } |
| |
| /// Set the matching engine to be automatically determined. |
| /// |
| /// This is the default state and will apply whatever optimizations are |
| /// possible, such as running a DFA. |
| /// |
| /// This overrides whatever was previously set via the `nfa` or |
| /// `bounded_backtracking` methods. |
| pub fn automatic(mut self) -> Self { |
| self.match_type = None; |
| self |
| } |
| |
| /// Sets the matching engine to use the NFA algorithm no matter what |
| /// optimizations are possible. |
| /// |
| /// This overrides whatever was previously set via the `automatic` or |
| /// `bounded_backtracking` methods. |
| pub fn nfa(mut self) -> Self { |
| self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM)); |
| self |
| } |
| |
| /// Sets the matching engine to use a bounded backtracking engine no |
| /// matter what optimizations are possible. |
| /// |
| /// One must use this with care, since the bounded backtracking engine |
| /// uses memory proportion to `len(regex) * len(text)`. |
| /// |
| /// This overrides whatever was previously set via the `automatic` or |
| /// `nfa` methods. |
| pub fn bounded_backtracking(mut self) -> Self { |
| self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack)); |
| self |
| } |
| |
| /// Compiles byte based programs for use with the NFA matching engines. |
| /// |
| /// By default, the NFA engines match on Unicode scalar values. They can |
| /// be made to use byte based programs instead. In general, the byte based |
| /// programs are slower because of a less efficient encoding of character |
| /// classes. |
| /// |
| /// Note that this does not impact DFA matching engines, which always |
| /// execute on bytes. |
| pub fn bytes(mut self, yes: bool) -> Self { |
| self.bytes = yes; |
| self |
| } |
| |
| /// When disabled, the program compiled may match arbitrary bytes. |
| /// |
| /// When enabled (the default), all compiled programs exclusively match |
| /// valid UTF-8 bytes. |
| pub fn only_utf8(mut self, yes: bool) -> Self { |
| self.only_utf8 = yes; |
| self |
| } |
| |
| /// Set the Unicode flag. |
| pub fn unicode(mut self, yes: bool) -> Self { |
| self.options.unicode = yes; |
| self |
| } |
| |
| /// Parse the current set of patterns into their AST and extract literals. |
| fn parse(&self) -> Result<Parsed, Error> { |
| let mut exprs = Vec::with_capacity(self.options.pats.len()); |
| let mut prefixes = Some(Literals::empty()); |
| let mut suffixes = Some(Literals::empty()); |
| let mut bytes = false; |
| let is_set = self.options.pats.len() > 1; |
| // If we're compiling a regex set and that set has any anchored |
| // expressions, then disable all literal optimizations. |
| for pat in &self.options.pats { |
| let mut parser = ParserBuilder::new() |
| .octal(self.options.octal) |
| .case_insensitive(self.options.case_insensitive) |
| .multi_line(self.options.multi_line) |
| .dot_matches_new_line(self.options.dot_matches_new_line) |
| .swap_greed(self.options.swap_greed) |
| .ignore_whitespace(self.options.ignore_whitespace) |
| .unicode(self.options.unicode) |
| .allow_invalid_utf8(!self.only_utf8) |
| .nest_limit(self.options.nest_limit) |
| .build(); |
| let expr = |
| parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; |
| bytes = bytes || !expr.is_always_utf8(); |
| |
| if cfg!(feature = "perf-literal") { |
| if !expr.is_anchored_start() && expr.is_any_anchored_start() { |
| // Partial anchors unfortunately make it hard to use |
| // prefixes, so disable them. |
| prefixes = None; |
| } else if is_set && expr.is_anchored_start() { |
| // Regex sets with anchors do not go well with literal |
| // optimizations. |
| prefixes = None; |
| } |
| prefixes = prefixes.and_then(|mut prefixes| { |
| if !prefixes.union_prefixes(&expr) { |
| None |
| } else { |
| Some(prefixes) |
| } |
| }); |
| |
| if !expr.is_anchored_end() && expr.is_any_anchored_end() { |
| // Partial anchors unfortunately make it hard to use |
| // suffixes, so disable them. |
| suffixes = None; |
| } else if is_set && expr.is_anchored_end() { |
| // Regex sets with anchors do not go well with literal |
| // optimizations. |
| suffixes = None; |
| } |
| suffixes = suffixes.and_then(|mut suffixes| { |
| if !suffixes.union_suffixes(&expr) { |
| None |
| } else { |
| Some(suffixes) |
| } |
| }); |
| } |
| exprs.push(expr); |
| } |
| Ok(Parsed { |
| exprs: exprs, |
| prefixes: prefixes.unwrap_or_else(Literals::empty), |
| suffixes: suffixes.unwrap_or_else(Literals::empty), |
| bytes: bytes, |
| }) |
| } |
| |
| /// Build an executor that can run a regular expression. |
| pub fn build(self) -> Result<Exec, Error> { |
| // Special case when we have no patterns to compile. |
| // This can happen when compiling a regex set. |
| if self.options.pats.is_empty() { |
| let ro = Arc::new(ExecReadOnly { |
| res: vec![], |
| nfa: Program::new(), |
| dfa: Program::new(), |
| dfa_reverse: Program::new(), |
| suffixes: LiteralSearcher::empty(), |
| #[cfg(feature = "perf-literal")] |
| ac: None, |
| match_type: MatchType::Nothing, |
| }); |
| return Ok(Exec { ro: ro, cache: Cached::new() }); |
| } |
| let parsed = self.parse()?; |
| let mut nfa = Compiler::new() |
| .size_limit(self.options.size_limit) |
| .bytes(self.bytes || parsed.bytes) |
| .only_utf8(self.only_utf8) |
| .compile(&parsed.exprs)?; |
| let mut dfa = Compiler::new() |
| .size_limit(self.options.size_limit) |
| .dfa(true) |
| .only_utf8(self.only_utf8) |
| .compile(&parsed.exprs)?; |
| let mut dfa_reverse = Compiler::new() |
| .size_limit(self.options.size_limit) |
| .dfa(true) |
| .only_utf8(self.only_utf8) |
| .reverse(true) |
| .compile(&parsed.exprs)?; |
| |
| #[cfg(feature = "perf-literal")] |
| let ac = self.build_aho_corasick(&parsed); |
| nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes); |
| dfa.prefixes = nfa.prefixes.clone(); |
| dfa.dfa_size_limit = self.options.dfa_size_limit; |
| dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; |
| |
| let mut ro = ExecReadOnly { |
| res: self.options.pats, |
| nfa: nfa, |
| dfa: dfa, |
| dfa_reverse: dfa_reverse, |
| suffixes: LiteralSearcher::suffixes(parsed.suffixes), |
| #[cfg(feature = "perf-literal")] |
| ac: ac, |
| match_type: MatchType::Nothing, |
| }; |
| ro.match_type = ro.choose_match_type(self.match_type); |
| |
| let ro = Arc::new(ro); |
| Ok(Exec { ro: ro, cache: Cached::new() }) |
| } |
| |
| #[cfg(feature = "perf-literal")] |
| fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> { |
| if parsed.exprs.len() != 1 { |
| return None; |
| } |
| let lits = match alternation_literals(&parsed.exprs[0]) { |
| None => return None, |
| Some(lits) => lits, |
| }; |
| // If we have a small number of literals, then let Teddy handle |
| // things (see literal/mod.rs). |
| if lits.len() <= 32 { |
| return None; |
| } |
| Some( |
| AhoCorasickBuilder::new() |
| .match_kind(MatchKind::LeftmostFirst) |
| .auto_configure(&lits) |
| // We always want this to reduce size, regardless |
| // of what auto-configure does. |
| .byte_classes(true) |
| .build_with_size::<u32, _, _>(&lits) |
| // This should never happen because we'd long exceed the |
| // compilation limit for regexes first. |
| .expect("AC automaton too big"), |
| ) |
| } |
| } |
| |
| impl<'c> RegularExpression for ExecNoSyncStr<'c> { |
| type Text = str; |
| |
| fn slots_len(&self) -> usize { |
| self.0.slots_len() |
| } |
| |
| fn next_after_empty(&self, text: &str, i: usize) -> usize { |
| next_utf8(text.as_bytes(), i) |
| } |
| |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> { |
| self.0.shortest_match_at(text.as_bytes(), start) |
| } |
| |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn is_match_at(&self, text: &str, start: usize) -> bool { |
| self.0.is_match_at(text.as_bytes(), start) |
| } |
| |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { |
| self.0.find_at(text.as_bytes(), start) |
| } |
| |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn captures_read_at( |
| &self, |
| locs: &mut Locations, |
| text: &str, |
| start: usize, |
| ) -> Option<(usize, usize)> { |
| self.0.captures_read_at(locs, text.as_bytes(), start) |
| } |
| } |
| |
| impl<'c> RegularExpression for ExecNoSync<'c> { |
| type Text = [u8]; |
| |
| /// Returns the number of capture slots in the regular expression. (There |
| /// are two slots for every capture group, corresponding to possibly empty |
| /// start and end locations of the capture.) |
| fn slots_len(&self) -> usize { |
| self.ro.nfa.captures.len() * 2 |
| } |
| |
| fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { |
| i + 1 |
| } |
| |
| /// Returns the end of a match location, possibly occurring before the |
| /// end location of the correct leftmost-first match. |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> { |
| if !self.is_anchor_end_match(text) { |
| return None; |
| } |
| match self.ro.match_type { |
| #[cfg(feature = "perf-literal")] |
| MatchType::Literal(ty) => { |
| self.find_literals(ty, text, start).map(|(_, e)| e) |
| } |
| #[cfg(feature = "perf-dfa")] |
| MatchType::Dfa | MatchType::DfaMany => { |
| match self.shortest_dfa(text, start) { |
| dfa::Result::Match(end) => Some(end), |
| dfa::Result::NoMatch(_) => None, |
| dfa::Result::Quit => self.shortest_nfa(text, start), |
| } |
| } |
| #[cfg(feature = "perf-dfa")] |
| MatchType::DfaAnchoredReverse => { |
| match dfa::Fsm::reverse( |
| &self.ro.dfa_reverse, |
| self.cache.value(), |
| true, |
| &text[start..], |
| text.len(), |
| ) { |
| dfa::Result::Match(_) => Some(text.len()), |
| dfa::Result::NoMatch(_) => None, |
| dfa::Result::Quit => self.shortest_nfa(text, start), |
| } |
| } |
| #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
| MatchType::DfaSuffix => { |
| match self.shortest_dfa_reverse_suffix(text, start) { |
| dfa::Result::Match(e) => Some(e), |
| dfa::Result::NoMatch(_) => None, |
| dfa::Result::Quit => self.shortest_nfa(text, start), |
| } |
| } |
| MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), |
| MatchType::Nothing => None, |
| } |
| } |
| |
| /// Returns true if and only if the regex matches text. |
| /// |
| /// For single regular expressions, this is equivalent to calling |
| /// shortest_match(...).is_some(). |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn is_match_at(&self, text: &[u8], start: usize) -> bool { |
| if !self.is_anchor_end_match(text) { |
| return false; |
| } |
| // We need to do this dance because shortest_match relies on the NFA |
| // filling in captures[1], but a RegexSet has no captures. In other |
| // words, a RegexSet can't (currently) use shortest_match. ---AG |
| match self.ro.match_type { |
| #[cfg(feature = "perf-literal")] |
| MatchType::Literal(ty) => { |
| self.find_literals(ty, text, start).is_some() |
| } |
| #[cfg(feature = "perf-dfa")] |
| MatchType::Dfa | MatchType::DfaMany => { |
| match self.shortest_dfa(text, start) { |
| dfa::Result::Match(_) => true, |
| dfa::Result::NoMatch(_) => false, |
| dfa::Result::Quit => self.match_nfa(text, start), |
| } |
| } |
| #[cfg(feature = "perf-dfa")] |
| MatchType::DfaAnchoredReverse => { |
| match dfa::Fsm::reverse( |
| &self.ro.dfa_reverse, |
| self.cache.value(), |
| true, |
| &text[start..], |
| text.len(), |
| ) { |
| dfa::Result::Match(_) => true, |
| dfa::Result::NoMatch(_) => false, |
| dfa::Result::Quit => self.match_nfa(text, start), |
| } |
| } |
| #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
| MatchType::DfaSuffix => { |
| match self.shortest_dfa_reverse_suffix(text, start) { |
| dfa::Result::Match(_) => true, |
| dfa::Result::NoMatch(_) => false, |
| dfa::Result::Quit => self.match_nfa(text, start), |
| } |
| } |
| MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), |
| MatchType::Nothing => false, |
| } |
| } |
| |
| /// Finds the start and end location of the leftmost-first match, starting |
| /// at the given location. |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { |
| if !self.is_anchor_end_match(text) { |
| return None; |
| } |
| match self.ro.match_type { |
| #[cfg(feature = "perf-literal")] |
| MatchType::Literal(ty) => self.find_literals(ty, text, start), |
| #[cfg(feature = "perf-dfa")] |
| MatchType::Dfa => match self.find_dfa_forward(text, start) { |
| dfa::Result::Match((s, e)) => Some((s, e)), |
| dfa::Result::NoMatch(_) => None, |
| dfa::Result::Quit => { |
| self.find_nfa(MatchNfaType::Auto, text, start) |
| } |
| }, |
| #[cfg(feature = "perf-dfa")] |
| MatchType::DfaAnchoredReverse => { |
| match self.find_dfa_anchored_reverse(text, start) { |
| dfa::Result::Match((s, e)) => Some((s, e)), |
| dfa::Result::NoMatch(_) => None, |
| dfa::Result::Quit => { |
| self.find_nfa(MatchNfaType::Auto, text, start) |
| } |
| } |
| } |
| #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
| MatchType::DfaSuffix => { |
| match self.find_dfa_reverse_suffix(text, start) { |
| dfa::Result::Match((s, e)) => Some((s, e)), |
| dfa::Result::NoMatch(_) => None, |
| dfa::Result::Quit => { |
| self.find_nfa(MatchNfaType::Auto, text, start) |
| } |
| } |
| } |
| MatchType::Nfa(ty) => self.find_nfa(ty, text, start), |
| MatchType::Nothing => None, |
| #[cfg(feature = "perf-dfa")] |
| MatchType::DfaMany => { |
| unreachable!("BUG: RegexSet cannot be used with find") |
| } |
| } |
| } |
| |
| /// Finds the start and end location of the leftmost-first match and also |
| /// fills in all matching capture groups. |
| /// |
| /// The number of capture slots given should be equal to the total number |
| /// of capture slots in the compiled program. |
| /// |
| /// Note that the first two slots always correspond to the start and end |
| /// locations of the overall match. |
| fn captures_read_at( |
| &self, |
| locs: &mut Locations, |
| text: &[u8], |
| start: usize, |
| ) -> Option<(usize, usize)> { |
| let slots = locs.as_slots(); |
| for slot in slots.iter_mut() { |
| *slot = None; |
| } |
| // If the caller unnecessarily uses this, then we try to save them |
| // from themselves. |
| match slots.len() { |
| 0 => return self.find_at(text, start), |
| 2 => { |
| return self.find_at(text, start).map(|(s, e)| { |
| slots[0] = Some(s); |
| slots[1] = Some(e); |
| (s, e) |
| }); |
| } |
| _ => {} // fallthrough |
| } |
| if !self.is_anchor_end_match(text) { |
| return None; |
| } |
| match self.ro.match_type { |
| #[cfg(feature = "perf-literal")] |
| MatchType::Literal(ty) => { |
| self.find_literals(ty, text, start).and_then(|(s, e)| { |
| self.captures_nfa_type( |
| MatchNfaType::Auto, |
| slots, |
| text, |
| s, |
| e, |
| ) |
| }) |
| } |
| #[cfg(feature = "perf-dfa")] |
| MatchType::Dfa => { |
| if self.ro.nfa.is_anchored_start { |
| self.captures_nfa(slots, text, start) |
| } else { |
| match self.find_dfa_forward(text, start) { |
| dfa::Result::Match((s, e)) => self.captures_nfa_type( |
| MatchNfaType::Auto, |
| slots, |
| text, |
| s, |
| e, |
| ), |
| dfa::Result::NoMatch(_) => None, |
| dfa::Result::Quit => { |
| self.captures_nfa(slots, text, start) |
| } |
| } |
| } |
| } |
| #[cfg(feature = "perf-dfa")] |
| MatchType::DfaAnchoredReverse => { |
| match self.find_dfa_anchored_reverse(text, start) { |
| dfa::Result::Match((s, e)) => self.captures_nfa_type( |
| MatchNfaType::Auto, |
| slots, |
| text, |
| s, |
| e, |
| ), |
| dfa::Result::NoMatch(_) => None, |
| dfa::Result::Quit => self.captures_nfa(slots, text, start), |
| } |
| } |
| #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
| MatchType::DfaSuffix => { |
| match self.find_dfa_reverse_suffix(text, start) { |
| dfa::Result::Match((s, e)) => self.captures_nfa_type( |
| MatchNfaType::Auto, |
| slots, |
| text, |
| s, |
| e, |
| ), |
| dfa::Result::NoMatch(_) => None, |
| dfa::Result::Quit => self.captures_nfa(slots, text, start), |
| } |
| } |
| MatchType::Nfa(ty) => { |
| self.captures_nfa_type(ty, slots, text, start, text.len()) |
| } |
| MatchType::Nothing => None, |
| #[cfg(feature = "perf-dfa")] |
| MatchType::DfaMany => { |
| unreachable!("BUG: RegexSet cannot be used with captures") |
| } |
| } |
| } |
| } |
| |
| impl<'c> ExecNoSync<'c> { |
| /// Finds the leftmost-first match using only literal search. |
| #[cfg(feature = "perf-literal")] |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn find_literals( |
| &self, |
| ty: MatchLiteralType, |
| text: &[u8], |
| start: usize, |
| ) -> Option<(usize, usize)> { |
| use self::MatchLiteralType::*; |
| match ty { |
| Unanchored => { |
| let lits = &self.ro.nfa.prefixes; |
| lits.find(&text[start..]).map(|(s, e)| (start + s, start + e)) |
| } |
| AnchoredStart => { |
| let lits = &self.ro.nfa.prefixes; |
| if start == 0 || !self.ro.nfa.is_anchored_start { |
| lits.find_start(&text[start..]) |
| .map(|(s, e)| (start + s, start + e)) |
| } else { |
| None |
| } |
| } |
| AnchoredEnd => { |
| let lits = &self.ro.suffixes; |
| lits.find_end(&text[start..]) |
| .map(|(s, e)| (start + s, start + e)) |
| } |
| AhoCorasick => self |
| .ro |
| .ac |
| .as_ref() |
| .unwrap() |
| .find(&text[start..]) |
| .map(|m| (start + m.start(), start + m.end())), |
| } |
| } |
| |
| /// Finds the leftmost-first match (start and end) using only the DFA. |
| /// |
| /// If the result returned indicates that the DFA quit, then another |
| /// matching engine should be used. |
| #[cfg(feature = "perf-dfa")] |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn find_dfa_forward( |
| &self, |
| text: &[u8], |
| start: usize, |
| ) -> dfa::Result<(usize, usize)> { |
| use dfa::Result::*; |
| let end = match dfa::Fsm::forward( |
| &self.ro.dfa, |
| self.cache.value(), |
| false, |
| text, |
| start, |
| ) { |
| NoMatch(i) => return NoMatch(i), |
| Quit => return Quit, |
| Match(end) if start == end => return Match((start, start)), |
| Match(end) => end, |
| }; |
| // Now run the DFA in reverse to find the start of the match. |
| match dfa::Fsm::reverse( |
| &self.ro.dfa_reverse, |
| self.cache.value(), |
| false, |
| &text[start..], |
| end - start, |
| ) { |
| Match(s) => Match((start + s, end)), |
| NoMatch(i) => NoMatch(i), |
| Quit => Quit, |
| } |
| } |
| |
| /// Finds the leftmost-first match (start and end) using only the DFA, |
| /// but assumes the regex is anchored at the end and therefore starts at |
| /// the end of the regex and matches in reverse. |
| /// |
| /// If the result returned indicates that the DFA quit, then another |
| /// matching engine should be used. |
| #[cfg(feature = "perf-dfa")] |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn find_dfa_anchored_reverse( |
| &self, |
| text: &[u8], |
| start: usize, |
| ) -> dfa::Result<(usize, usize)> { |
| use dfa::Result::*; |
| match dfa::Fsm::reverse( |
| &self.ro.dfa_reverse, |
| self.cache.value(), |
| false, |
| &text[start..], |
| text.len() - start, |
| ) { |
| Match(s) => Match((start + s, text.len())), |
| NoMatch(i) => NoMatch(i), |
| Quit => Quit, |
| } |
| } |
| |
| /// Finds the end of the shortest match using only the DFA. |
| #[cfg(feature = "perf-dfa")] |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> { |
| dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start) |
| } |
| |
| /// Finds the end of the shortest match using only the DFA by scanning for |
| /// suffix literals. |
| #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn shortest_dfa_reverse_suffix( |
| &self, |
| text: &[u8], |
| start: usize, |
| ) -> dfa::Result<usize> { |
| match self.exec_dfa_reverse_suffix(text, start) { |
| None => self.shortest_dfa(text, start), |
| Some(r) => r.map(|(_, end)| end), |
| } |
| } |
| |
| /// Finds the end of the shortest match using only the DFA by scanning for |
| /// suffix literals. It also reports the start of the match. |
| /// |
| /// Note that if None is returned, then the optimization gave up to avoid |
| /// worst case quadratic behavior. A forward scanning DFA should be tried |
| /// next. |
| /// |
| /// If a match is returned and the full leftmost-first match is desired, |
| /// then a forward scan starting from the beginning of the match must be |
| /// done. |
| /// |
| /// If the result returned indicates that the DFA quit, then another |
| /// matching engine should be used. |
| #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn exec_dfa_reverse_suffix( |
| &self, |
| text: &[u8], |
| original_start: usize, |
| ) -> Option<dfa::Result<(usize, usize)>> { |
| use dfa::Result::*; |
| |
| let lcs = self.ro.suffixes.lcs(); |
| debug_assert!(lcs.len() >= 1); |
| let mut start = original_start; |
| let mut end = start; |
| let mut last_literal = start; |
| while end <= text.len() { |
| last_literal += match lcs.find(&text[last_literal..]) { |
| None => return Some(NoMatch(text.len())), |
| Some(i) => i, |
| }; |
| end = last_literal + lcs.len(); |
| match dfa::Fsm::reverse( |
| &self.ro.dfa_reverse, |
| self.cache.value(), |
| false, |
| &text[start..end], |
| end - start, |
| ) { |
| Match(0) | NoMatch(0) => return None, |
| Match(i) => return Some(Match((start + i, end))), |
| NoMatch(i) => { |
| start += i; |
| last_literal += 1; |
| continue; |
| } |
| Quit => return Some(Quit), |
| }; |
| } |
| Some(NoMatch(text.len())) |
| } |
| |
| /// Finds the leftmost-first match (start and end) using only the DFA |
| /// by scanning for suffix literals. |
| /// |
| /// If the result returned indicates that the DFA quit, then another |
| /// matching engine should be used. |
| #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn find_dfa_reverse_suffix( |
| &self, |
| text: &[u8], |
| start: usize, |
| ) -> dfa::Result<(usize, usize)> { |
| use dfa::Result::*; |
| |
| let match_start = match self.exec_dfa_reverse_suffix(text, start) { |
| None => return self.find_dfa_forward(text, start), |
| Some(Match((start, _))) => start, |
| Some(r) => return r, |
| }; |
| // At this point, we've found a match. The only way to quit now |
| // without a match is if the DFA gives up (seems unlikely). |
| // |
| // Now run the DFA forwards to find the proper end of the match. |
| // (The suffix literal match can only indicate the earliest |
| // possible end location, which may appear before the end of the |
| // leftmost-first match.) |
| match dfa::Fsm::forward( |
| &self.ro.dfa, |
| self.cache.value(), |
| false, |
| text, |
| match_start, |
| ) { |
| NoMatch(_) => panic!("BUG: reverse match implies forward match"), |
| Quit => Quit, |
| Match(e) => Match((match_start, e)), |
| } |
| } |
| |
| /// Executes the NFA engine to return whether there is a match or not. |
| /// |
| /// Ideally, we could use shortest_nfa(...).is_some() and get the same |
| /// performance characteristics, but regex sets don't have captures, which |
| /// shortest_nfa depends on. |
| #[cfg(feature = "perf-dfa")] |
| fn match_nfa(&self, text: &[u8], start: usize) -> bool { |
| self.match_nfa_type(MatchNfaType::Auto, text, start) |
| } |
| |
| /// Like match_nfa, but allows specification of the type of NFA engine. |
| fn match_nfa_type( |
| &self, |
| ty: MatchNfaType, |
| text: &[u8], |
| start: usize, |
| ) -> bool { |
| self.exec_nfa( |
| ty, |
| &mut [false], |
| &mut [], |
| true, |
| false, |
| text, |
| start, |
| text.len(), |
| ) |
| } |
| |
| /// Finds the shortest match using an NFA. |
| #[cfg(feature = "perf-dfa")] |
| fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> { |
| self.shortest_nfa_type(MatchNfaType::Auto, text, start) |
| } |
| |
| /// Like shortest_nfa, but allows specification of the type of NFA engine. |
| fn shortest_nfa_type( |
| &self, |
| ty: MatchNfaType, |
| text: &[u8], |
| start: usize, |
| ) -> Option<usize> { |
| let mut slots = [None, None]; |
| if self.exec_nfa( |
| ty, |
| &mut [false], |
| &mut slots, |
| true, |
| true, |
| text, |
| start, |
| text.len(), |
| ) { |
| slots[1] |
| } else { |
| None |
| } |
| } |
| |
| /// Like find, but executes an NFA engine. |
| fn find_nfa( |
| &self, |
| ty: MatchNfaType, |
| text: &[u8], |
| start: usize, |
| ) -> Option<(usize, usize)> { |
| let mut slots = [None, None]; |
| if self.exec_nfa( |
| ty, |
| &mut [false], |
| &mut slots, |
| false, |
| false, |
| text, |
| start, |
| text.len(), |
| ) { |
| match (slots[0], slots[1]) { |
| (Some(s), Some(e)) => Some((s, e)), |
| _ => None, |
| } |
| } else { |
| None |
| } |
| } |
| |
| /// Like find_nfa, but fills in captures. |
| /// |
| /// `slots` should have length equal to `2 * nfa.captures.len()`. |
| #[cfg(feature = "perf-dfa")] |
| fn captures_nfa( |
| &self, |
| slots: &mut [Slot], |
| text: &[u8], |
| start: usize, |
| ) -> Option<(usize, usize)> { |
| self.captures_nfa_type( |
| MatchNfaType::Auto, |
| slots, |
| text, |
| start, |
| text.len(), |
| ) |
| } |
| |
| /// Like captures_nfa, but allows specification of type of NFA engine. |
| fn captures_nfa_type( |
| &self, |
| ty: MatchNfaType, |
| slots: &mut [Slot], |
| text: &[u8], |
| start: usize, |
| end: usize, |
| ) -> Option<(usize, usize)> { |
| if self.exec_nfa( |
| ty, |
| &mut [false], |
| slots, |
| false, |
| false, |
| text, |
| start, |
| end, |
| ) { |
| match (slots[0], slots[1]) { |
| (Some(s), Some(e)) => Some((s, e)), |
| _ => None, |
| } |
| } else { |
| None |
| } |
| } |
| |
| fn exec_nfa( |
| &self, |
| mut ty: MatchNfaType, |
| matches: &mut [bool], |
| slots: &mut [Slot], |
| quit_after_match: bool, |
| quit_after_match_with_pos: bool, |
| text: &[u8], |
| start: usize, |
| end: usize, |
| ) -> bool { |
| use self::MatchNfaType::*; |
| if let Auto = ty { |
| if backtrack::should_exec(self.ro.nfa.len(), text.len()) { |
| ty = Backtrack; |
| } else { |
| ty = PikeVM; |
| } |
| } |
| // The backtracker can't return the shortest match position as it is |
| // implemented today. So if someone calls `shortest_match` and we need |
| // to run an NFA, then use the PikeVM. |
| if quit_after_match_with_pos || ty == PikeVM { |
| self.exec_pikevm( |
| matches, |
| slots, |
| quit_after_match, |
| text, |
| start, |
| end, |
| ) |
| } else { |
| self.exec_backtrack(matches, slots, text, start, end) |
| } |
| } |
| |
| /// Always run the NFA algorithm. |
| fn exec_pikevm( |
| &self, |
| matches: &mut [bool], |
| slots: &mut [Slot], |
| quit_after_match: bool, |
| text: &[u8], |
| start: usize, |
| end: usize, |
| ) -> bool { |
| if self.ro.nfa.uses_bytes() { |
| pikevm::Fsm::exec( |
| &self.ro.nfa, |
| self.cache.value(), |
| matches, |
| slots, |
| quit_after_match, |
| ByteInput::new(text, self.ro.nfa.only_utf8), |
| start, |
| end, |
| ) |
| } else { |
| pikevm::Fsm::exec( |
| &self.ro.nfa, |
| self.cache.value(), |
| matches, |
| slots, |
| quit_after_match, |
| CharInput::new(text), |
| start, |
| end, |
| ) |
| } |
| } |
| |
| /// Always runs the NFA using bounded backtracking. |
| fn exec_backtrack( |
| &self, |
| matches: &mut [bool], |
| slots: &mut [Slot], |
| text: &[u8], |
| start: usize, |
| end: usize, |
| ) -> bool { |
| if self.ro.nfa.uses_bytes() { |
| backtrack::Bounded::exec( |
| &self.ro.nfa, |
| self.cache.value(), |
| matches, |
| slots, |
| ByteInput::new(text, self.ro.nfa.only_utf8), |
| start, |
| end, |
| ) |
| } else { |
| backtrack::Bounded::exec( |
| &self.ro.nfa, |
| self.cache.value(), |
| matches, |
| slots, |
| CharInput::new(text), |
| start, |
| end, |
| ) |
| } |
| } |
| |
| /// Finds which regular expressions match the given text. |
| /// |
| /// `matches` should have length equal to the number of regexes being |
| /// searched. |
| /// |
| /// This is only useful when one wants to know which regexes in a set |
| /// match some text. |
| pub fn many_matches_at( |
| &self, |
| matches: &mut [bool], |
| text: &[u8], |
| start: usize, |
| ) -> bool { |
| use self::MatchType::*; |
| if !self.is_anchor_end_match(text) { |
| return false; |
| } |
| match self.ro.match_type { |
| #[cfg(feature = "perf-literal")] |
| Literal(ty) => { |
| debug_assert_eq!(matches.len(), 1); |
| matches[0] = self.find_literals(ty, text, start).is_some(); |
| matches[0] |
| } |
| #[cfg(feature = "perf-dfa")] |
| Dfa | DfaAnchoredReverse | DfaMany => { |
| match dfa::Fsm::forward_many( |
| &self.ro.dfa, |
| self.cache.value(), |
| matches, |
| text, |
| start, |
| ) { |
| dfa::Result::Match(_) => true, |
| dfa::Result::NoMatch(_) => false, |
| dfa::Result::Quit => self.exec_nfa( |
| MatchNfaType::Auto, |
| matches, |
| &mut [], |
| false, |
| false, |
| text, |
| start, |
| text.len(), |
| ), |
| } |
| } |
| #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
| DfaSuffix => { |
| match dfa::Fsm::forward_many( |
| &self.ro.dfa, |
| self.cache.value(), |
| matches, |
| text, |
| start, |
| ) { |
| dfa::Result::Match(_) => true, |
| dfa::Result::NoMatch(_) => false, |
| dfa::Result::Quit => self.exec_nfa( |
| MatchNfaType::Auto, |
| matches, |
| &mut [], |
| false, |
| false, |
| text, |
| start, |
| text.len(), |
| ), |
| } |
| } |
| Nfa(ty) => self.exec_nfa( |
| ty, |
| matches, |
| &mut [], |
| false, |
| false, |
| text, |
| start, |
| text.len(), |
| ), |
| Nothing => false, |
| } |
| } |
| |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn is_anchor_end_match(&self, text: &[u8]) -> bool { |
| #[cfg(not(feature = "perf-literal"))] |
| fn imp(_: &ExecReadOnly, _: &[u8]) -> bool { |
| true |
| } |
| |
| #[cfg(feature = "perf-literal")] |
| fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool { |
| // Only do this check if the haystack is big (>1MB). |
| if text.len() > (1 << 20) && ro.nfa.is_anchored_end { |
| let lcs = ro.suffixes.lcs(); |
| if lcs.len() >= 1 && !lcs.is_suffix(text) { |
| return false; |
| } |
| } |
| true |
| } |
| |
| imp(&self.ro, text) |
| } |
| |
| pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { |
| &self.ro.nfa.capture_name_idx |
| } |
| } |
| |
| impl<'c> ExecNoSyncStr<'c> { |
| pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { |
| self.0.capture_name_idx() |
| } |
| } |
| |
| impl Exec { |
| /// Get a searcher that isn't Sync. |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| pub fn searcher(&self) -> ExecNoSync { |
| let create = || RefCell::new(ProgramCacheInner::new(&self.ro)); |
| ExecNoSync { |
| ro: &self.ro, // a clone is too expensive here! (and not needed) |
| cache: self.cache.get_or(create), |
| } |
| } |
| |
| /// Get a searcher that isn't Sync and can match on &str. |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| pub fn searcher_str(&self) -> ExecNoSyncStr { |
| ExecNoSyncStr(self.searcher()) |
| } |
| |
| /// Build a Regex from this executor. |
| pub fn into_regex(self) -> re_unicode::Regex { |
| re_unicode::Regex::from(self) |
| } |
| |
| /// Build a RegexSet from this executor. |
| pub fn into_regex_set(self) -> re_set::unicode::RegexSet { |
| re_set::unicode::RegexSet::from(self) |
| } |
| |
| /// Build a Regex from this executor that can match arbitrary bytes. |
| pub fn into_byte_regex(self) -> re_bytes::Regex { |
| re_bytes::Regex::from(self) |
| } |
| |
| /// Build a RegexSet from this executor that can match arbitrary bytes. |
| pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet { |
| re_set::bytes::RegexSet::from(self) |
| } |
| |
| /// The original regular expressions given by the caller that were |
| /// compiled. |
| pub fn regex_strings(&self) -> &[String] { |
| &self.ro.res |
| } |
| |
| /// Return a slice of capture names. |
| /// |
| /// Any capture that isn't named is None. |
| pub fn capture_names(&self) -> &[Option<String>] { |
| &self.ro.nfa.captures |
| } |
| |
| /// Return a reference to named groups mapping (from group name to |
| /// group position). |
| pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { |
| &self.ro.nfa.capture_name_idx |
| } |
| } |
| |
| impl Clone for Exec { |
| fn clone(&self) -> Exec { |
| Exec { ro: self.ro.clone(), cache: Cached::new() } |
| } |
| } |
| |
| impl ExecReadOnly { |
| fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType { |
| if let Some(MatchType::Nfa(_)) = hint { |
| return hint.unwrap(); |
| } |
| // If the NFA is empty, then we'll never match anything. |
| if self.nfa.insts.is_empty() { |
| return MatchType::Nothing; |
| } |
| if let Some(literalty) = self.choose_literal_match_type() { |
| return literalty; |
| } |
| if let Some(dfaty) = self.choose_dfa_match_type() { |
| return dfaty; |
| } |
| // We're so totally hosed. |
| MatchType::Nfa(MatchNfaType::Auto) |
| } |
| |
| /// If a plain literal scan can be used, then a corresponding literal |
| /// search type is returned. |
| fn choose_literal_match_type(&self) -> Option<MatchType> { |
| #[cfg(not(feature = "perf-literal"))] |
| fn imp(_: &ExecReadOnly) -> Option<MatchType> { |
| None |
| } |
| |
| #[cfg(feature = "perf-literal")] |
| fn imp(ro: &ExecReadOnly) -> Option<MatchType> { |
| // If our set of prefixes is complete, then we can use it to find |
| // a match in lieu of a regex engine. This doesn't quite work well |
| // in the presence of multiple regexes, so only do it when there's |
| // one. |
| // |
| // TODO(burntsushi): Also, don't try to match literals if the regex |
| // is partially anchored. We could technically do it, but we'd need |
| // to create two sets of literals: all of them and then the subset |
| // that aren't anchored. We would then only search for all of them |
| // when at the beginning of the input and use the subset in all |
| // other cases. |
| if ro.res.len() != 1 { |
| return None; |
| } |
| if ro.ac.is_some() { |
| return Some(MatchType::Literal( |
| MatchLiteralType::AhoCorasick, |
| )); |
| } |
| if ro.nfa.prefixes.complete() { |
| return if ro.nfa.is_anchored_start { |
| Some(MatchType::Literal(MatchLiteralType::AnchoredStart)) |
| } else { |
| Some(MatchType::Literal(MatchLiteralType::Unanchored)) |
| }; |
| } |
| if ro.suffixes.complete() { |
| return if ro.nfa.is_anchored_end { |
| Some(MatchType::Literal(MatchLiteralType::AnchoredEnd)) |
| } else { |
| // This case shouldn't happen. When the regex isn't |
| // anchored, then complete prefixes should imply complete |
| // suffixes. |
| Some(MatchType::Literal(MatchLiteralType::Unanchored)) |
| }; |
| } |
| None |
| } |
| |
| imp(self) |
| } |
| |
| /// If a DFA scan can be used, then choose the appropriate DFA strategy. |
| fn choose_dfa_match_type(&self) -> Option<MatchType> { |
| #[cfg(not(feature = "perf-dfa"))] |
| fn imp(_: &ExecReadOnly) -> Option<MatchType> { |
| None |
| } |
| |
| #[cfg(feature = "perf-dfa")] |
| fn imp(ro: &ExecReadOnly) -> Option<MatchType> { |
| if !dfa::can_exec(&ro.dfa) { |
| return None; |
| } |
| // Regex sets require a slightly specialized path. |
| if ro.res.len() >= 2 { |
| return Some(MatchType::DfaMany); |
| } |
| // If the regex is anchored at the end but not the start, then |
| // just match in reverse from the end of the haystack. |
| if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end { |
| return Some(MatchType::DfaAnchoredReverse); |
| } |
| #[cfg(feature = "perf-literal")] |
| { |
| // If there's a longish suffix literal, then it might be faster |
| // to look for that first. |
| if ro.should_suffix_scan() { |
| return Some(MatchType::DfaSuffix); |
| } |
| } |
| // Fall back to your garden variety forward searching lazy DFA. |
| Some(MatchType::Dfa) |
| } |
| |
| imp(self) |
| } |
| |
| /// Returns true if the program is amenable to suffix scanning. |
| /// |
| /// When this is true, as a heuristic, we assume it is OK to quickly scan |
| /// for suffix literals and then do a *reverse* DFA match from any matches |
| /// produced by the literal scan. (And then followed by a forward DFA |
| /// search, since the previously found suffix literal maybe not actually be |
| /// the end of a match.) |
| /// |
| /// This is a bit of a specialized optimization, but can result in pretty |
| /// big performance wins if 1) there are no prefix literals and 2) the |
| /// suffix literals are pretty rare in the text. (1) is obviously easy to |
| /// account for but (2) is harder. As a proxy, we assume that longer |
| /// strings are generally rarer, so we only enable this optimization when |
| /// we have a meaty suffix. |
| #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
| fn should_suffix_scan(&self) -> bool { |
| if self.suffixes.is_empty() { |
| return false; |
| } |
| let lcs_len = self.suffixes.lcs().char_len(); |
| lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() |
| } |
| } |
| |
| #[derive(Clone, Copy, Debug)] |
| enum MatchType { |
| /// A single or multiple literal search. This is only used when the regex |
| /// can be decomposed into a literal search. |
| #[cfg(feature = "perf-literal")] |
| Literal(MatchLiteralType), |
| /// A normal DFA search. |
| #[cfg(feature = "perf-dfa")] |
| Dfa, |
| /// A reverse DFA search starting from the end of a haystack. |
| #[cfg(feature = "perf-dfa")] |
| DfaAnchoredReverse, |
| /// A reverse DFA search with suffix literal scanning. |
| #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] |
| DfaSuffix, |
| /// Use the DFA on two or more regular expressions. |
| #[cfg(feature = "perf-dfa")] |
| DfaMany, |
| /// An NFA variant. |
| Nfa(MatchNfaType), |
| /// No match is ever possible, so don't ever try to search. |
| Nothing, |
| } |
| |
| #[derive(Clone, Copy, Debug)] |
| #[cfg(feature = "perf-literal")] |
| enum MatchLiteralType { |
| /// Match literals anywhere in text. |
| Unanchored, |
| /// Match literals only at the start of text. |
| AnchoredStart, |
| /// Match literals only at the end of text. |
| AnchoredEnd, |
| /// Use an Aho-Corasick automaton. This requires `ac` to be Some on |
| /// ExecReadOnly. |
| AhoCorasick, |
| } |
| |
| #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
| enum MatchNfaType { |
| /// Choose between Backtrack and PikeVM. |
| Auto, |
| /// NFA bounded backtracking. |
| /// |
| /// (This is only set by tests, since it never makes sense to always want |
| /// backtracking.) |
| Backtrack, |
| /// The Pike VM. |
| /// |
| /// (This is only set by tests, since it never makes sense to always want |
| /// the Pike VM.) |
| PikeVM, |
| } |
| |
| /// `ProgramCache` maintains reusable allocations for each matching engine |
| /// available to a particular program. |
| pub type ProgramCache = RefCell<ProgramCacheInner>; |
| |
| #[derive(Debug)] |
| pub struct ProgramCacheInner { |
| pub pikevm: pikevm::Cache, |
| pub backtrack: backtrack::Cache, |
| #[cfg(feature = "perf-dfa")] |
| pub dfa: dfa::Cache, |
| #[cfg(feature = "perf-dfa")] |
| pub dfa_reverse: dfa::Cache, |
| } |
| |
| impl ProgramCacheInner { |
| fn new(ro: &ExecReadOnly) -> Self { |
| ProgramCacheInner { |
| pikevm: pikevm::Cache::new(&ro.nfa), |
| backtrack: backtrack::Cache::new(&ro.nfa), |
| #[cfg(feature = "perf-dfa")] |
| dfa: dfa::Cache::new(&ro.dfa), |
| #[cfg(feature = "perf-dfa")] |
| dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), |
| } |
| } |
| } |
| |
| /// Alternation literals checks if the given HIR is a simple alternation of |
| /// literals, and if so, returns them. Otherwise, this returns None. |
| #[cfg(feature = "perf-literal")] |
| fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { |
| use syntax::hir::{HirKind, Literal}; |
| |
| // This is pretty hacky, but basically, if `is_alternation_literal` is |
| // true, then we can make several assumptions about the structure of our |
| // HIR. This is what justifies the `unreachable!` statements below. |
| // |
| // This code should be refactored once we overhaul this crate's |
| // optimization pipeline, because this is a terribly inflexible way to go |
| // about things. |
| |
| if !expr.is_alternation_literal() { |
| return None; |
| } |
| let alts = match *expr.kind() { |
| HirKind::Alternation(ref alts) => alts, |
| _ => return None, // one literal isn't worth it |
| }; |
| |
| let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit { |
| Literal::Unicode(c) => { |
| let mut buf = [0; 4]; |
| dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes()); |
| } |
| Literal::Byte(b) => { |
| dst.push(b); |
| } |
| }; |
| |
| let mut lits = vec![]; |
| for alt in alts { |
| let mut lit = vec![]; |
| match *alt.kind() { |
| HirKind::Literal(ref x) => extendlit(x, &mut lit), |
| HirKind::Concat(ref exprs) => { |
| for e in exprs { |
| match *e.kind() { |
| HirKind::Literal(ref x) => extendlit(x, &mut lit), |
| _ => unreachable!("expected literal, got {:?}", e), |
| } |
| } |
| } |
| _ => unreachable!("expected literal or concat, got {:?}", alt), |
| } |
| lits.push(lit); |
| } |
| Some(lits) |
| } |
| |
| #[cfg(test)] |
| mod test { |
| #[test] |
| fn uppercut_s_backtracking_bytes_default_bytes_mismatch() { |
| use internal::ExecBuilder; |
| |
| let backtrack_bytes_re = ExecBuilder::new("^S") |
| .bounded_backtracking() |
| .only_utf8(false) |
| .build() |
| .map(|exec| exec.into_byte_regex()) |
| .map_err(|err| format!("{}", err)) |
| .unwrap(); |
| |
| let default_bytes_re = ExecBuilder::new("^S") |
| .only_utf8(false) |
| .build() |
| .map(|exec| exec.into_byte_regex()) |
| .map_err(|err| format!("{}", err)) |
| .unwrap(); |
| |
| let input = vec![83, 83]; |
| |
| let s1 = backtrack_bytes_re.split(&input); |
| let s2 = default_bytes_re.split(&input); |
| for (chunk1, chunk2) in s1.zip(s2) { |
| assert_eq!(chunk1, chunk2); |
| } |
| } |
| |
| #[test] |
| fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() { |
| use internal::ExecBuilder; |
| |
| let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)") |
| .bounded_backtracking() |
| .bytes(true) |
| .build() |
| .map(|exec| exec.into_regex()) |
| .map_err(|err| format!("{}", err)) |
| .unwrap(); |
| |
| let default_bytes_re = ExecBuilder::new(r"^(?u:\*)") |
| .bytes(true) |
| .build() |
| .map(|exec| exec.into_regex()) |
| .map_err(|err| format!("{}", err)) |
| .unwrap(); |
| |
| let input = "**"; |
| |
| let s1 = backtrack_bytes_re.split(input); |
| let s2 = default_bytes_re.split(input); |
| for (chunk1, chunk2) in s1.zip(s2) { |
| assert_eq!(chunk1, chunk2); |
| } |
| } |
| } |