| use std::cmp::Ordering; |
| use std::collections::HashMap; |
| use std::fmt; |
| use std::mem; |
| use std::ops::Deref; |
| use std::slice; |
| use std::sync::Arc; |
| |
| use input::Char; |
| use literal::LiteralSearcher; |
| |
| /// `InstPtr` represents the index of an instruction in a regex program. |
| pub type InstPtr = usize; |
| |
| /// Program is a sequence of instructions and various facts about thos |
| /// instructions. |
| #[derive(Clone)] |
| pub struct Program { |
| /// A sequence of instructions that represents an NFA. |
| pub insts: Vec<Inst>, |
| /// Pointers to each Match instruction in the sequence. |
| /// |
| /// This is always length 1 unless this program represents a regex set. |
| pub matches: Vec<InstPtr>, |
| /// The ordered sequence of all capture groups extracted from the AST. |
| /// Unnamed groups are `None`. |
| pub captures: Vec<Option<String>>, |
| /// Pointers to all named capture groups into `captures`. |
| pub capture_name_idx: Arc<HashMap<String, usize>>, |
| /// A pointer to the start instruction. This can vary depending on how |
| /// the program was compiled. For example, programs for use with the DFA |
| /// engine have a `.*?` inserted at the beginning of unanchored regular |
| /// expressions. The actual starting point of the program is after the |
| /// `.*?`. |
| pub start: InstPtr, |
| /// A set of equivalence classes for discriminating bytes in the compiled |
| /// program. |
| pub byte_classes: Vec<u8>, |
| /// When true, this program can only match valid UTF-8. |
| pub only_utf8: bool, |
| /// When true, this program uses byte range instructions instead of Unicode |
| /// range instructions. |
| pub is_bytes: bool, |
| /// When true, the program is compiled for DFA matching. For example, this |
| /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored |
| /// regexes. |
| pub is_dfa: bool, |
| /// When true, the program matches text in reverse (for use only in the |
| /// DFA). |
| pub is_reverse: bool, |
| /// Whether the regex must match from the start of the input. |
| pub is_anchored_start: bool, |
| /// Whether the regex must match at the end of the input. |
| pub is_anchored_end: bool, |
| /// Whether this program contains a Unicode word boundary instruction. |
| pub has_unicode_word_boundary: bool, |
| /// A possibly empty machine for very quickly matching prefix literals. |
| pub prefixes: LiteralSearcher, |
| /// A limit on the size of the cache that the DFA is allowed to use while |
| /// matching. |
| /// |
| /// The cache limit specifies approximately how much space we're willing to |
| /// give to the state cache. Once the state cache exceeds the size, it is |
| /// wiped and all states must be re-computed. |
| /// |
| /// Note that this value does not impact correctness. It can be set to 0 |
| /// and the DFA will run just fine. (It will only ever store exactly one |
| /// state in the cache, and will likely run very slowly, but it will work.) |
| /// |
| /// Also note that this limit is *per thread of execution*. That is, |
| /// if the same regex is used to search text across multiple threads |
| /// simultaneously, then the DFA cache is not shared. Instead, copies are |
| /// made. |
| pub dfa_size_limit: usize, |
| } |
| |
| impl Program { |
| /// Creates an empty instruction sequence. Fields are given default |
| /// values. |
| pub fn new() -> Self { |
| Program { |
| insts: vec![], |
| matches: vec![], |
| captures: vec![], |
| capture_name_idx: Arc::new(HashMap::new()), |
| start: 0, |
| byte_classes: vec![0; 256], |
| only_utf8: true, |
| is_bytes: false, |
| is_dfa: false, |
| is_reverse: false, |
| is_anchored_start: false, |
| is_anchored_end: false, |
| has_unicode_word_boundary: false, |
| prefixes: LiteralSearcher::empty(), |
| dfa_size_limit: 2 * (1 << 20), |
| } |
| } |
| |
| /// If pc is an index to a no-op instruction (like Save), then return the |
| /// next pc that is not a no-op instruction. |
| pub fn skip(&self, mut pc: usize) -> usize { |
| loop { |
| match self[pc] { |
| Inst::Save(ref i) => pc = i.goto, |
| _ => return pc, |
| } |
| } |
| } |
| |
| /// Return true if and only if an execution engine at instruction `pc` will |
| /// always lead to a match. |
| pub fn leads_to_match(&self, pc: usize) -> bool { |
| if self.matches.len() > 1 { |
| // If we have a regex set, then we have more than one ending |
| // state, so leading to one of those states is generally |
| // meaningless. |
| return false; |
| } |
| match self[self.skip(pc)] { |
| Inst::Match(_) => true, |
| _ => false, |
| } |
| } |
| |
| /// Returns true if the current configuration demands that an implicit |
| /// `.*?` be prepended to the instruction sequence. |
| pub fn needs_dotstar(&self) -> bool { |
| self.is_dfa && !self.is_reverse && !self.is_anchored_start |
| } |
| |
| /// Returns true if this program uses Byte instructions instead of |
| /// Char/Range instructions. |
| pub fn uses_bytes(&self) -> bool { |
| self.is_bytes || self.is_dfa |
| } |
| |
| /// Returns true if this program exclusively matches valid UTF-8 bytes. |
| /// |
| /// That is, if an invalid UTF-8 byte is seen, then no match is possible. |
| pub fn only_utf8(&self) -> bool { |
| self.only_utf8 |
| } |
| |
| /// Return the approximate heap usage of this instruction sequence in |
| /// bytes. |
| pub fn approximate_size(&self) -> usize { |
| // The only instruction that uses heap space is Ranges (for |
| // Unicode codepoint programs) to store non-overlapping codepoint |
| // ranges. To keep this operation constant time, we ignore them. |
| (self.len() * mem::size_of::<Inst>()) |
| + (self.matches.len() * mem::size_of::<InstPtr>()) |
| + (self.captures.len() * mem::size_of::<Option<String>>()) |
| + (self.capture_name_idx.len() |
| * (mem::size_of::<String>() + mem::size_of::<usize>())) |
| + (self.byte_classes.len() * mem::size_of::<u8>()) |
| + self.prefixes.approximate_size() |
| } |
| } |
| |
| impl Deref for Program { |
| type Target = [Inst]; |
| |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn deref(&self) -> &Self::Target { |
| &*self.insts |
| } |
| } |
| |
| impl fmt::Debug for Program { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| use self::Inst::*; |
| |
| fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { |
| if goto == cur + 1 { |
| fmtd |
| } else { |
| format!("{} (goto: {})", fmtd, goto) |
| } |
| } |
| |
| fn visible_byte(b: u8) -> String { |
| use std::ascii::escape_default; |
| let escaped = escape_default(b).collect::<Vec<u8>>(); |
| String::from_utf8_lossy(&escaped).into_owned() |
| } |
| |
| for (pc, inst) in self.iter().enumerate() { |
| match *inst { |
| Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?, |
| Save(ref inst) => { |
| let s = format!("{:04} Save({})", pc, inst.slot); |
| write!(f, "{}", with_goto(pc, inst.goto, s))?; |
| } |
| Split(ref inst) => { |
| write!( |
| f, |
| "{:04} Split({}, {})", |
| pc, inst.goto1, inst.goto2 |
| )?; |
| } |
| EmptyLook(ref inst) => { |
| let s = format!("{:?}", inst.look); |
| write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; |
| } |
| Char(ref inst) => { |
| let s = format!("{:?}", inst.c); |
| write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; |
| } |
| Ranges(ref inst) => { |
| let ranges = inst |
| .ranges |
| .iter() |
| .map(|r| format!("{:?}-{:?}", r.0, r.1)) |
| .collect::<Vec<String>>() |
| .join(", "); |
| write!( |
| f, |
| "{:04} {}", |
| pc, |
| with_goto(pc, inst.goto, ranges) |
| )?; |
| } |
| Bytes(ref inst) => { |
| let s = format!( |
| "Bytes({}, {})", |
| visible_byte(inst.start), |
| visible_byte(inst.end) |
| ); |
| write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; |
| } |
| } |
| if pc == self.start { |
| write!(f, " (start)")?; |
| } |
| write!(f, "\n")?; |
| } |
| Ok(()) |
| } |
| } |
| |
| impl<'a> IntoIterator for &'a Program { |
| type Item = &'a Inst; |
| type IntoIter = slice::Iter<'a, Inst>; |
| fn into_iter(self) -> Self::IntoIter { |
| self.iter() |
| } |
| } |
| |
| /// Inst is an instruction code in a Regex program. |
| /// |
| /// Regrettably, a regex program either contains Unicode codepoint |
| /// instructions (Char and Ranges) or it contains byte instructions (Bytes). |
| /// A regex program can never contain both. |
| /// |
| /// It would be worth investigating splitting this into two distinct types and |
| /// then figuring out how to make the matching engines polymorphic over those |
| /// types without sacrificing performance. |
| /// |
| /// Other than the benefit of moving invariants into the type system, another |
| /// benefit is the decreased size. If we remove the `Char` and `Ranges` |
| /// instructions from the `Inst` enum, then its size shrinks from 40 bytes to |
| /// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges` |
| /// variant.) Given that byte based machines are typically much bigger than |
| /// their Unicode analogues (because they can decode UTF-8 directly), this ends |
| /// up being a pretty significant savings. |
| #[derive(Clone, Debug)] |
| pub enum Inst { |
| /// Match indicates that the program has reached a match state. |
| /// |
| /// The number in the match corresponds to the Nth logical regular |
| /// expression in this program. This index is always 0 for normal regex |
| /// programs. Values greater than 0 appear when compiling regex sets, and |
| /// each match instruction gets its own unique value. The value corresponds |
| /// to the Nth regex in the set. |
| Match(usize), |
| /// Save causes the program to save the current location of the input in |
| /// the slot indicated by InstSave. |
| Save(InstSave), |
| /// Split causes the program to diverge to one of two paths in the |
| /// program, preferring goto1 in InstSplit. |
| Split(InstSplit), |
| /// EmptyLook represents a zero-width assertion in a regex program. A |
| /// zero-width assertion does not consume any of the input text. |
| EmptyLook(InstEmptyLook), |
| /// Char requires the regex program to match the character in InstChar at |
| /// the current position in the input. |
| Char(InstChar), |
| /// Ranges requires the regex program to match the character at the current |
| /// position in the input with one of the ranges specified in InstRanges. |
| Ranges(InstRanges), |
| /// Bytes is like Ranges, except it expresses a single byte range. It is |
| /// used in conjunction with Split instructions to implement multi-byte |
| /// character classes. |
| Bytes(InstBytes), |
| } |
| |
| impl Inst { |
| /// Returns true if and only if this is a match instruction. |
| pub fn is_match(&self) -> bool { |
| match *self { |
| Inst::Match(_) => true, |
| _ => false, |
| } |
| } |
| } |
| |
| /// Representation of the Save instruction. |
| #[derive(Clone, Debug)] |
| pub struct InstSave { |
| /// The next location to execute in the program. |
| pub goto: InstPtr, |
| /// The capture slot (there are two slots for every capture in a regex, |
| /// including the zeroth capture for the entire match). |
| pub slot: usize, |
| } |
| |
| /// Representation of the Split instruction. |
| #[derive(Clone, Debug)] |
| pub struct InstSplit { |
| /// The first instruction to try. A match resulting from following goto1 |
| /// has precedence over a match resulting from following goto2. |
| pub goto1: InstPtr, |
| /// The second instruction to try. A match resulting from following goto1 |
| /// has precedence over a match resulting from following goto2. |
| pub goto2: InstPtr, |
| } |
| |
| /// Representation of the `EmptyLook` instruction. |
| #[derive(Clone, Debug)] |
| pub struct InstEmptyLook { |
| /// The next location to execute in the program if this instruction |
| /// succeeds. |
| pub goto: InstPtr, |
| /// The type of zero-width assertion to check. |
| pub look: EmptyLook, |
| } |
| |
| /// The set of zero-width match instructions. |
| #[derive(Clone, Copy, Debug, PartialEq, Eq)] |
| pub enum EmptyLook { |
| /// Start of line or input. |
| StartLine, |
| /// End of line or input. |
| EndLine, |
| /// Start of input. |
| StartText, |
| /// End of input. |
| EndText, |
| /// Word character on one side and non-word character on other. |
| WordBoundary, |
| /// Word character on both sides or non-word character on both sides. |
| NotWordBoundary, |
| /// ASCII word boundary. |
| WordBoundaryAscii, |
| /// Not ASCII word boundary. |
| NotWordBoundaryAscii, |
| } |
| |
| /// Representation of the Char instruction. |
| #[derive(Clone, Debug)] |
| pub struct InstChar { |
| /// The next location to execute in the program if this instruction |
| /// succeeds. |
| pub goto: InstPtr, |
| /// The character to test. |
| pub c: char, |
| } |
| |
| /// Representation of the Ranges instruction. |
| #[derive(Clone, Debug)] |
| pub struct InstRanges { |
| /// The next location to execute in the program if this instruction |
| /// succeeds. |
| pub goto: InstPtr, |
| /// The set of Unicode scalar value ranges to test. |
| pub ranges: Vec<(char, char)>, |
| } |
| |
| impl InstRanges { |
| /// Tests whether the given input character matches this instruction. |
| pub fn matches(&self, c: Char) -> bool { |
| // This speeds up the `match_class_unicode` benchmark by checking |
| // some common cases quickly without binary search. e.g., Matching |
| // a Unicode class on predominantly ASCII text. |
| for r in self.ranges.iter().take(4) { |
| if c < r.0 { |
| return false; |
| } |
| if c <= r.1 { |
| return true; |
| } |
| } |
| self.ranges |
| .binary_search_by(|r| { |
| if r.1 < c { |
| Ordering::Less |
| } else if r.0 > c { |
| Ordering::Greater |
| } else { |
| Ordering::Equal |
| } |
| }) |
| .is_ok() |
| } |
| |
| /// Return the number of distinct characters represented by all of the |
| /// ranges. |
| pub fn num_chars(&self) -> usize { |
| self.ranges |
| .iter() |
| .map(|&(s, e)| 1 + (e as u32) - (s as u32)) |
| .sum::<u32>() as usize |
| } |
| } |
| |
| /// Representation of the Bytes instruction. |
| #[derive(Clone, Debug)] |
| pub struct InstBytes { |
| /// The next location to execute in the program if this instruction |
| /// succeeds. |
| pub goto: InstPtr, |
| /// The start (inclusive) of this byte range. |
| pub start: u8, |
| /// The end (inclusive) of this byte range. |
| pub end: u8, |
| } |
| |
| impl InstBytes { |
| /// Returns true if and only if the given byte is in this range. |
| pub fn matches(&self, byte: u8) -> bool { |
| self.start <= byte && byte <= self.end |
| } |
| } |