Import 'regex' package vesion 1.3.6 * Add OWNERS * No Android.bp yet Bug: 152884384 Test: make Change-Id: I455caf7833b6c437c1c133bc7b2f47b83da9cbce

commit: e42c505f54ac2e7b2ca7d0197304cac1b4f605e9 [log] [tgz]
author: Chih-Hung Hsieh <chh@google.com> Thu Apr 16 10:44:21 2020 -0700
committer: Chih-Hung Hsieh <chh@google.com> Thu Apr 16 10:45:53 2020 -0700
tree: 7ca22ebdbdb7d0c9dc5c66d31f564aeaf51cece4
parent: 815d0a2344b5321fd47f0cb7dba96e4cb4e84615 [diff] [blame]
diff --git a/src/prog.rs b/src/prog.rs
new file mode 100644
index 0000000..74e5f2f
--- /dev/null
+++ b/src/prog.rs

@@ -0,0 +1,434 @@
+use std::cmp::Ordering;
+use std::collections::HashMap;
+use std::fmt;
+use std::mem;
+use std::ops::Deref;
+use std::slice;
+use std::sync::Arc;
+
+use input::Char;
+use literal::LiteralSearcher;
+
+/// `InstPtr` represents the index of an instruction in a regex program.
+pub type InstPtr = usize;
+
+/// Program is a sequence of instructions and various facts about thos
+/// instructions.
+#[derive(Clone)]
+pub struct Program {
+    /// A sequence of instructions that represents an NFA.
+    pub insts: Vec<Inst>,
+    /// Pointers to each Match instruction in the sequence.
+    ///
+    /// This is always length 1 unless this program represents a regex set.
+    pub matches: Vec<InstPtr>,
+    /// The ordered sequence of all capture groups extracted from the AST.
+    /// Unnamed groups are `None`.
+    pub captures: Vec<Option<String>>,
+    /// Pointers to all named capture groups into `captures`.
+    pub capture_name_idx: Arc<HashMap<String, usize>>,
+    /// A pointer to the start instruction. This can vary depending on how
+    /// the program was compiled. For example, programs for use with the DFA
+    /// engine have a `.*?` inserted at the beginning of unanchored regular
+    /// expressions. The actual starting point of the program is after the
+    /// `.*?`.
+    pub start: InstPtr,
+    /// A set of equivalence classes for discriminating bytes in the compiled
+    /// program.
+    pub byte_classes: Vec<u8>,
+    /// When true, this program can only match valid UTF-8.
+    pub only_utf8: bool,
+    /// When true, this program uses byte range instructions instead of Unicode
+    /// range instructions.
+    pub is_bytes: bool,
+    /// When true, the program is compiled for DFA matching. For example, this
+    /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
+    /// regexes.
+    pub is_dfa: bool,
+    /// When true, the program matches text in reverse (for use only in the
+    /// DFA).
+    pub is_reverse: bool,
+    /// Whether the regex must match from the start of the input.
+    pub is_anchored_start: bool,
+    /// Whether the regex must match at the end of the input.
+    pub is_anchored_end: bool,
+    /// Whether this program contains a Unicode word boundary instruction.
+    pub has_unicode_word_boundary: bool,
+    /// A possibly empty machine for very quickly matching prefix literals.
+    pub prefixes: LiteralSearcher,
+    /// A limit on the size of the cache that the DFA is allowed to use while
+    /// matching.
+    ///
+    /// The cache limit specifies approximately how much space we're willing to
+    /// give to the state cache. Once the state cache exceeds the size, it is
+    /// wiped and all states must be re-computed.
+    ///
+    /// Note that this value does not impact correctness. It can be set to 0
+    /// and the DFA will run just fine. (It will only ever store exactly one
+    /// state in the cache, and will likely run very slowly, but it will work.)
+    ///
+    /// Also note that this limit is *per thread of execution*. That is,
+    /// if the same regex is used to search text across multiple threads
+    /// simultaneously, then the DFA cache is not shared. Instead, copies are
+    /// made.
+    pub dfa_size_limit: usize,
+}
+
+impl Program {
+    /// Creates an empty instruction sequence. Fields are given default
+    /// values.
+    pub fn new() -> Self {
+        Program {
+            insts: vec![],
+            matches: vec![],
+            captures: vec![],
+            capture_name_idx: Arc::new(HashMap::new()),
+            start: 0,
+            byte_classes: vec![0; 256],
+            only_utf8: true,
+            is_bytes: false,
+            is_dfa: false,
+            is_reverse: false,
+            is_anchored_start: false,
+            is_anchored_end: false,
+            has_unicode_word_boundary: false,
+            prefixes: LiteralSearcher::empty(),
+            dfa_size_limit: 2 * (1 << 20),
+        }
+    }
+
+    /// If pc is an index to a no-op instruction (like Save), then return the
+    /// next pc that is not a no-op instruction.
+    pub fn skip(&self, mut pc: usize) -> usize {
+        loop {
+            match self[pc] {
+                Inst::Save(ref i) => pc = i.goto,
+                _ => return pc,
+            }
+        }
+    }
+
+    /// Return true if and only if an execution engine at instruction `pc` will
+    /// always lead to a match.
+    pub fn leads_to_match(&self, pc: usize) -> bool {
+        if self.matches.len() > 1 {
+            // If we have a regex set, then we have more than one ending
+            // state, so leading to one of those states is generally
+            // meaningless.
+            return false;
+        }
+        match self[self.skip(pc)] {
+            Inst::Match(_) => true,
+            _ => false,
+        }
+    }
+
+    /// Returns true if the current configuration demands that an implicit
+    /// `.*?` be prepended to the instruction sequence.
+    pub fn needs_dotstar(&self) -> bool {
+        self.is_dfa && !self.is_reverse && !self.is_anchored_start
+    }
+
+    /// Returns true if this program uses Byte instructions instead of
+    /// Char/Range instructions.
+    pub fn uses_bytes(&self) -> bool {
+        self.is_bytes || self.is_dfa
+    }
+
+    /// Returns true if this program exclusively matches valid UTF-8 bytes.
+    ///
+    /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
+    pub fn only_utf8(&self) -> bool {
+        self.only_utf8
+    }
+
+    /// Return the approximate heap usage of this instruction sequence in
+    /// bytes.
+    pub fn approximate_size(&self) -> usize {
+        // The only instruction that uses heap space is Ranges (for
+        // Unicode codepoint programs) to store non-overlapping codepoint
+        // ranges. To keep this operation constant time, we ignore them.
+        (self.len() * mem::size_of::<Inst>())
+            + (self.matches.len() * mem::size_of::<InstPtr>())
+            + (self.captures.len() * mem::size_of::<Option<String>>())
+            + (self.capture_name_idx.len()
+                * (mem::size_of::<String>() + mem::size_of::<usize>()))
+            + (self.byte_classes.len() * mem::size_of::<u8>())
+            + self.prefixes.approximate_size()
+    }
+}
+
+impl Deref for Program {
+    type Target = [Inst];
+
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn deref(&self) -> &Self::Target {
+        &*self.insts
+    }
+}
+
+impl fmt::Debug for Program {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use self::Inst::*;
+
+        fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
+            if goto == cur + 1 {
+                fmtd
+            } else {
+                format!("{} (goto: {})", fmtd, goto)
+            }
+        }
+
+        fn visible_byte(b: u8) -> String {
+            use std::ascii::escape_default;
+            let escaped = escape_default(b).collect::<Vec<u8>>();
+            String::from_utf8_lossy(&escaped).into_owned()
+        }
+
+        for (pc, inst) in self.iter().enumerate() {
+            match *inst {
+                Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
+                Save(ref inst) => {
+                    let s = format!("{:04} Save({})", pc, inst.slot);
+                    write!(f, "{}", with_goto(pc, inst.goto, s))?;
+                }
+                Split(ref inst) => {
+                    write!(
+                        f,
+                        "{:04} Split({}, {})",
+                        pc, inst.goto1, inst.goto2
+                    )?;
+                }
+                EmptyLook(ref inst) => {
+                    let s = format!("{:?}", inst.look);
+                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
+                }
+                Char(ref inst) => {
+                    let s = format!("{:?}", inst.c);
+                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
+                }
+                Ranges(ref inst) => {
+                    let ranges = inst
+                        .ranges
+                        .iter()
+                        .map(|r| format!("{:?}-{:?}", r.0, r.1))
+                        .collect::<Vec<String>>()
+                        .join(", ");
+                    write!(
+                        f,
+                        "{:04} {}",
+                        pc,
+                        with_goto(pc, inst.goto, ranges)
+                    )?;
+                }
+                Bytes(ref inst) => {
+                    let s = format!(
+                        "Bytes({}, {})",
+                        visible_byte(inst.start),
+                        visible_byte(inst.end)
+                    );
+                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
+                }
+            }
+            if pc == self.start {
+                write!(f, " (start)")?;
+            }
+            write!(f, "\n")?;
+        }
+        Ok(())
+    }
+}
+
+impl<'a> IntoIterator for &'a Program {
+    type Item = &'a Inst;
+    type IntoIter = slice::Iter<'a, Inst>;
+    fn into_iter(self) -> Self::IntoIter {
+        self.iter()
+    }
+}
+
+/// Inst is an instruction code in a Regex program.
+///
+/// Regrettably, a regex program either contains Unicode codepoint
+/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
+/// A regex program can never contain both.
+///
+/// It would be worth investigating splitting this into two distinct types and
+/// then figuring out how to make the matching engines polymorphic over those
+/// types without sacrificing performance.
+///
+/// Other than the benefit of moving invariants into the type system, another
+/// benefit is the decreased size. If we remove the `Char` and `Ranges`
+/// instructions from the `Inst` enum, then its size shrinks from 40 bytes to
+/// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges`
+/// variant.) Given that byte based machines are typically much bigger than
+/// their Unicode analogues (because they can decode UTF-8 directly), this ends
+/// up being a pretty significant savings.
+#[derive(Clone, Debug)]
+pub enum Inst {
+    /// Match indicates that the program has reached a match state.
+    ///
+    /// The number in the match corresponds to the Nth logical regular
+    /// expression in this program. This index is always 0 for normal regex
+    /// programs. Values greater than 0 appear when compiling regex sets, and
+    /// each match instruction gets its own unique value. The value corresponds
+    /// to the Nth regex in the set.
+    Match(usize),
+    /// Save causes the program to save the current location of the input in
+    /// the slot indicated by InstSave.
+    Save(InstSave),
+    /// Split causes the program to diverge to one of two paths in the
+    /// program, preferring goto1 in InstSplit.
+    Split(InstSplit),
+    /// EmptyLook represents a zero-width assertion in a regex program. A
+    /// zero-width assertion does not consume any of the input text.
+    EmptyLook(InstEmptyLook),
+    /// Char requires the regex program to match the character in InstChar at
+    /// the current position in the input.
+    Char(InstChar),
+    /// Ranges requires the regex program to match the character at the current
+    /// position in the input with one of the ranges specified in InstRanges.
+    Ranges(InstRanges),
+    /// Bytes is like Ranges, except it expresses a single byte range. It is
+    /// used in conjunction with Split instructions to implement multi-byte
+    /// character classes.
+    Bytes(InstBytes),
+}
+
+impl Inst {
+    /// Returns true if and only if this is a match instruction.
+    pub fn is_match(&self) -> bool {
+        match *self {
+            Inst::Match(_) => true,
+            _ => false,
+        }
+    }
+}
+
+/// Representation of the Save instruction.
+#[derive(Clone, Debug)]
+pub struct InstSave {
+    /// The next location to execute in the program.
+    pub goto: InstPtr,
+    /// The capture slot (there are two slots for every capture in a regex,
+    /// including the zeroth capture for the entire match).
+    pub slot: usize,
+}
+
+/// Representation of the Split instruction.
+#[derive(Clone, Debug)]
+pub struct InstSplit {
+    /// The first instruction to try. A match resulting from following goto1
+    /// has precedence over a match resulting from following goto2.
+    pub goto1: InstPtr,
+    /// The second instruction to try. A match resulting from following goto1
+    /// has precedence over a match resulting from following goto2.
+    pub goto2: InstPtr,
+}
+
+/// Representation of the `EmptyLook` instruction.
+#[derive(Clone, Debug)]
+pub struct InstEmptyLook {
+    /// The next location to execute in the program if this instruction
+    /// succeeds.
+    pub goto: InstPtr,
+    /// The type of zero-width assertion to check.
+    pub look: EmptyLook,
+}
+
+/// The set of zero-width match instructions.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum EmptyLook {
+    /// Start of line or input.
+    StartLine,
+    /// End of line or input.
+    EndLine,
+    /// Start of input.
+    StartText,
+    /// End of input.
+    EndText,
+    /// Word character on one side and non-word character on other.
+    WordBoundary,
+    /// Word character on both sides or non-word character on both sides.
+    NotWordBoundary,
+    /// ASCII word boundary.
+    WordBoundaryAscii,
+    /// Not ASCII word boundary.
+    NotWordBoundaryAscii,
+}
+
+/// Representation of the Char instruction.
+#[derive(Clone, Debug)]
+pub struct InstChar {
+    /// The next location to execute in the program if this instruction
+    /// succeeds.
+    pub goto: InstPtr,
+    /// The character to test.
+    pub c: char,
+}
+
+/// Representation of the Ranges instruction.
+#[derive(Clone, Debug)]
+pub struct InstRanges {
+    /// The next location to execute in the program if this instruction
+    /// succeeds.
+    pub goto: InstPtr,
+    /// The set of Unicode scalar value ranges to test.
+    pub ranges: Vec<(char, char)>,
+}
+
+impl InstRanges {
+    /// Tests whether the given input character matches this instruction.
+    pub fn matches(&self, c: Char) -> bool {
+        // This speeds up the `match_class_unicode` benchmark by checking
+        // some common cases quickly without binary search. e.g., Matching
+        // a Unicode class on predominantly ASCII text.
+        for r in self.ranges.iter().take(4) {
+            if c < r.0 {
+                return false;
+            }
+            if c <= r.1 {
+                return true;
+            }
+        }
+        self.ranges
+            .binary_search_by(|r| {
+                if r.1 < c {
+                    Ordering::Less
+                } else if r.0 > c {
+                    Ordering::Greater
+                } else {
+                    Ordering::Equal
+                }
+            })
+            .is_ok()
+    }
+
+    /// Return the number of distinct characters represented by all of the
+    /// ranges.
+    pub fn num_chars(&self) -> usize {
+        self.ranges
+            .iter()
+            .map(|&(s, e)| 1 + (e as u32) - (s as u32))
+            .sum::<u32>() as usize
+    }
+}
+
+/// Representation of the Bytes instruction.
+#[derive(Clone, Debug)]
+pub struct InstBytes {
+    /// The next location to execute in the program if this instruction
+    /// succeeds.
+    pub goto: InstPtr,
+    /// The start (inclusive) of this byte range.
+    pub start: u8,
+    /// The end (inclusive) of this byte range.
+    pub end: u8,
+}
+
+impl InstBytes {
+    /// Returns true if and only if the given byte is in this range.
+    pub fn matches(&self, byte: u8) -> bool {
+        self.start <= byte && byte <= self.end
+    }
+}
commit	e42c505f54ac2e7b2ca7d0197304cac1b4f605e9	[log] [tgz]
author	Chih-Hung Hsieh <chh@google.com>	Thu Apr 16 10:44:21 2020 -0700
committer	Chih-Hung Hsieh <chh@google.com>	Thu Apr 16 10:45:53 2020 -0700
tree	7ca22ebdbdb7d0c9dc5c66d31f564aeaf51cece4
parent	815d0a2344b5321fd47f0cb7dba96e4cb4e84615 [diff] [blame]