Initial import of regex-automata-0.1.9.

Bug: 155309706
Change-Id: I20031167cbe49d12754936285a0781eb7a3b8bfd
diff --git a/src/determinize.rs b/src/determinize.rs
new file mode 100644
index 0000000..f300316
--- /dev/null
+++ b/src/determinize.rs
@@ -0,0 +1,285 @@
+use std::collections::HashMap;
+use std::mem;
+use std::rc::Rc;
+
+use dense;
+use error::Result;
+use nfa::{self, NFA};
+use sparse_set::SparseSet;
+use state_id::{dead_id, StateID};
+
+type DFARepr<S> = dense::Repr<Vec<S>, S>;
+
+/// A determinizer converts an NFA to a DFA.
+///
+/// This determinizer follows the typical powerset construction, where each
+/// DFA state is comprised of one or more NFA states. In the worst case, there
+/// is one DFA state for every possible combination of NFA states. In practice,
+/// this only happens in certain conditions, typically when there are bounded
+/// repetitions.
+///
+/// The type variable `S` refers to the chosen state identifier representation
+/// used for the DFA.
+///
+/// The lifetime variable `'a` refers to the lifetime of the NFA being
+/// converted to a DFA.
+#[derive(Debug)]
+pub(crate) struct Determinizer<'a, S: StateID> {
+    /// The NFA we're converting into a DFA.
+    nfa: &'a NFA,
+    /// The DFA we're building.
+    dfa: DFARepr<S>,
+    /// Each DFA state being built is defined as an *ordered* set of NFA
+    /// states, along with a flag indicating whether the state is a match
+    /// state or not.
+    ///
+    /// This is never empty. The first state is always a dummy state such that
+    /// a state id == 0 corresponds to a dead state.
+    builder_states: Vec<Rc<State>>,
+    /// A cache of DFA states that already exist and can be easily looked up
+    /// via ordered sets of NFA states.
+    cache: HashMap<Rc<State>, S>,
+    /// Scratch space for a stack of NFA states to visit, for depth first
+    /// visiting without recursion.
+    stack: Vec<nfa::StateID>,
+    /// Scratch space for storing an ordered sequence of NFA states, for
+    /// amortizing allocation.
+    scratch_nfa_states: Vec<nfa::StateID>,
+    /// Whether to build a DFA that finds the longest possible match.
+    longest_match: bool,
+}
+
+/// An intermediate representation for a DFA state during determinization.
+#[derive(Debug, Eq, Hash, PartialEq)]
+struct State {
+    /// Whether this state is a match state or not.
+    is_match: bool,
+    /// An ordered sequence of NFA states that make up this DFA state.
+    nfa_states: Vec<nfa::StateID>,
+}
+
+impl<'a, S: StateID> Determinizer<'a, S> {
+    /// Create a new determinizer for converting the given NFA to a DFA.
+    pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
+        let dead = Rc::new(State::dead());
+        let mut cache = HashMap::default();
+        cache.insert(dead.clone(), dead_id());
+
+        Determinizer {
+            nfa,
+            dfa: DFARepr::empty().anchored(nfa.is_anchored()),
+            builder_states: vec![dead],
+            cache,
+            stack: vec![],
+            scratch_nfa_states: vec![],
+            longest_match: false,
+        }
+    }
+
+    /// Instruct the determinizer to use equivalence classes as the transition
+    /// alphabet instead of all possible byte values.
+    pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
+        let byte_classes = self.nfa.byte_classes().clone();
+        self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
+            .anchored(self.nfa.is_anchored());
+        self
+    }
+
+    /// Instruct the determinizer to build a DFA that recognizes the longest
+    /// possible match instead of the leftmost first match. This is useful when
+    /// constructing reverse DFAs for finding the start of a match.
+    pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
+        self.longest_match = yes;
+        self
+    }
+
+    /// Build the DFA. If there was a problem constructing the DFA (e.g., if
+    /// the chosen state identifier representation is too small), then an error
+    /// is returned.
+    pub fn build(mut self) -> Result<DFARepr<S>> {
+        let representative_bytes: Vec<u8> =
+            self.dfa.byte_classes().representatives().collect();
+        let mut sparse = self.new_sparse_set();
+        let mut uncompiled = vec![self.add_start(&mut sparse)?];
+        while let Some(dfa_id) = uncompiled.pop() {
+            for &b in &representative_bytes {
+                let (next_dfa_id, is_new) =
+                    self.cached_state(dfa_id, b, &mut sparse)?;
+                self.dfa.add_transition(dfa_id, b, next_dfa_id);
+                if is_new {
+                    uncompiled.push(next_dfa_id);
+                }
+            }
+        }
+
+        // At this point, we shuffle the matching states in the final DFA to
+        // the beginning. This permits a DFA's match loop to detect a match
+        // condition by merely inspecting the current state's identifier, and
+        // avoids the need for any additional auxiliary storage.
+        let is_match: Vec<bool> =
+            self.builder_states.iter().map(|s| s.is_match).collect();
+        self.dfa.shuffle_match_states(&is_match);
+        Ok(self.dfa)
+    }
+
+    /// Return the identifier for the next DFA state given an existing DFA
+    /// state and an input byte. If the next DFA state already exists, then
+    /// return its identifier from the cache. Otherwise, build the state, cache
+    /// it and return its identifier.
+    ///
+    /// The given sparse set is used for scratch space. It must have a capacity
+    /// equivalent to the total number of NFA states, but its contents are
+    /// otherwise unspecified.
+    ///
+    /// This routine returns a boolean indicating whether a new state was
+    /// built. If a new state is built, then the caller needs to add it to its
+    /// frontier of uncompiled DFA states to compute transitions for.
+    fn cached_state(
+        &mut self,
+        dfa_id: S,
+        b: u8,
+        sparse: &mut SparseSet,
+    ) -> Result<(S, bool)> {
+        sparse.clear();
+        // Compute the set of all reachable NFA states, including epsilons.
+        self.next(dfa_id, b, sparse);
+        // Build a candidate state and check if it has already been built.
+        let state = self.new_state(sparse);
+        if let Some(&cached_id) = self.cache.get(&state) {
+            // Since we have a cached state, put the constructed state's
+            // memory back into our scratch space, so that it can be reused.
+            mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
+            return Ok((cached_id, false));
+        }
+        // Nothing was in the cache, so add this state to the cache.
+        self.add_state(state).map(|s| (s, true))
+    }
+
+    /// Compute the set of all eachable NFA states, including the full epsilon
+    /// closure, from a DFA state for a single byte of input.
+    fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
+        next_nfa_states.clear();
+        for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
+            let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
+            match *self.nfa.state(nfa_id) {
+                nfa::State::Union { .. }
+                | nfa::State::Fail
+                | nfa::State::Match => {}
+                nfa::State::Range { range: ref r } => {
+                    if r.start <= b && b <= r.end {
+                        self.epsilon_closure(r.next, next_nfa_states);
+                    }
+                }
+                nfa::State::Sparse { ref ranges } => {
+                    for r in ranges.iter() {
+                        if r.start > b {
+                            break;
+                        } else if r.start <= b && b <= r.end {
+                            self.epsilon_closure(r.next, next_nfa_states);
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /// Compute the epsilon closure for the given NFA state.
+    fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
+        if !self.nfa.state(start).is_epsilon() {
+            set.insert(start);
+            return;
+        }
+
+        self.stack.push(start);
+        while let Some(mut id) = self.stack.pop() {
+            loop {
+                if set.contains(id) {
+                    break;
+                }
+                set.insert(id);
+                match *self.nfa.state(id) {
+                    nfa::State::Range { .. }
+                    | nfa::State::Sparse { .. }
+                    | nfa::State::Fail
+                    | nfa::State::Match => break,
+                    nfa::State::Union { ref alternates } => {
+                        id = match alternates.get(0) {
+                            None => break,
+                            Some(&id) => id,
+                        };
+                        self.stack.extend(alternates[1..].iter().rev());
+                    }
+                }
+            }
+        }
+    }
+
+    /// Compute the initial DFA state and return its identifier.
+    ///
+    /// The sparse set given is used for scratch space, and must have capacity
+    /// equal to the total number of NFA states. Its contents are unspecified.
+    fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
+        sparse.clear();
+        self.epsilon_closure(self.nfa.start(), sparse);
+        let state = self.new_state(&sparse);
+        let id = self.add_state(state)?;
+        self.dfa.set_start_state(id);
+        Ok(id)
+    }
+
+    /// Add the given state to the DFA and make it available in the cache.
+    ///
+    /// The state initially has no transitions. That is, it transitions to the
+    /// dead state for all possible inputs.
+    fn add_state(&mut self, state: State) -> Result<S> {
+        let id = self.dfa.add_empty_state()?;
+        let rstate = Rc::new(state);
+        self.builder_states.push(rstate.clone());
+        self.cache.insert(rstate, id);
+        Ok(id)
+    }
+
+    /// Convert the given set of ordered NFA states to a DFA state.
+    fn new_state(&mut self, set: &SparseSet) -> State {
+        let mut state = State {
+            is_match: false,
+            nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
+        };
+        state.nfa_states.clear();
+
+        for &id in set {
+            match *self.nfa.state(id) {
+                nfa::State::Range { .. } => {
+                    state.nfa_states.push(id);
+                }
+                nfa::State::Sparse { .. } => {
+                    state.nfa_states.push(id);
+                }
+                nfa::State::Fail => {
+                    break;
+                }
+                nfa::State::Match => {
+                    state.is_match = true;
+                    if !self.longest_match {
+                        break;
+                    }
+                }
+                nfa::State::Union { .. } => {}
+            }
+        }
+        state
+    }
+
+    /// Create a new sparse set with enough capacity to hold all NFA states.
+    fn new_sparse_set(&self) -> SparseSet {
+        SparseSet::new(self.nfa.len())
+    }
+}
+
+impl State {
+    /// Create a new empty dead state.
+    fn dead() -> State {
+        State { nfa_states: vec![], is_match: false }
+    }
+}