blob: f3003161a96078e74d8022afa723b99b811a1d21 [file] [log] [blame]
Jakub Kotur3bceaeb2020-12-21 17:28:16 +01001use std::collections::HashMap;
2use std::mem;
3use std::rc::Rc;
4
5use dense;
6use error::Result;
7use nfa::{self, NFA};
8use sparse_set::SparseSet;
9use state_id::{dead_id, StateID};
10
11type DFARepr<S> = dense::Repr<Vec<S>, S>;
12
13/// A determinizer converts an NFA to a DFA.
14///
15/// This determinizer follows the typical powerset construction, where each
16/// DFA state is comprised of one or more NFA states. In the worst case, there
17/// is one DFA state for every possible combination of NFA states. In practice,
18/// this only happens in certain conditions, typically when there are bounded
19/// repetitions.
20///
21/// The type variable `S` refers to the chosen state identifier representation
22/// used for the DFA.
23///
24/// The lifetime variable `'a` refers to the lifetime of the NFA being
25/// converted to a DFA.
26#[derive(Debug)]
27pub(crate) struct Determinizer<'a, S: StateID> {
28 /// The NFA we're converting into a DFA.
29 nfa: &'a NFA,
30 /// The DFA we're building.
31 dfa: DFARepr<S>,
32 /// Each DFA state being built is defined as an *ordered* set of NFA
33 /// states, along with a flag indicating whether the state is a match
34 /// state or not.
35 ///
36 /// This is never empty. The first state is always a dummy state such that
37 /// a state id == 0 corresponds to a dead state.
38 builder_states: Vec<Rc<State>>,
39 /// A cache of DFA states that already exist and can be easily looked up
40 /// via ordered sets of NFA states.
41 cache: HashMap<Rc<State>, S>,
42 /// Scratch space for a stack of NFA states to visit, for depth first
43 /// visiting without recursion.
44 stack: Vec<nfa::StateID>,
45 /// Scratch space for storing an ordered sequence of NFA states, for
46 /// amortizing allocation.
47 scratch_nfa_states: Vec<nfa::StateID>,
48 /// Whether to build a DFA that finds the longest possible match.
49 longest_match: bool,
50}
51
52/// An intermediate representation for a DFA state during determinization.
53#[derive(Debug, Eq, Hash, PartialEq)]
54struct State {
55 /// Whether this state is a match state or not.
56 is_match: bool,
57 /// An ordered sequence of NFA states that make up this DFA state.
58 nfa_states: Vec<nfa::StateID>,
59}
60
61impl<'a, S: StateID> Determinizer<'a, S> {
62 /// Create a new determinizer for converting the given NFA to a DFA.
63 pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
64 let dead = Rc::new(State::dead());
65 let mut cache = HashMap::default();
66 cache.insert(dead.clone(), dead_id());
67
68 Determinizer {
69 nfa,
70 dfa: DFARepr::empty().anchored(nfa.is_anchored()),
71 builder_states: vec![dead],
72 cache,
73 stack: vec![],
74 scratch_nfa_states: vec![],
75 longest_match: false,
76 }
77 }
78
79 /// Instruct the determinizer to use equivalence classes as the transition
80 /// alphabet instead of all possible byte values.
81 pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
82 let byte_classes = self.nfa.byte_classes().clone();
83 self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
84 .anchored(self.nfa.is_anchored());
85 self
86 }
87
88 /// Instruct the determinizer to build a DFA that recognizes the longest
89 /// possible match instead of the leftmost first match. This is useful when
90 /// constructing reverse DFAs for finding the start of a match.
91 pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
92 self.longest_match = yes;
93 self
94 }
95
96 /// Build the DFA. If there was a problem constructing the DFA (e.g., if
97 /// the chosen state identifier representation is too small), then an error
98 /// is returned.
99 pub fn build(mut self) -> Result<DFARepr<S>> {
100 let representative_bytes: Vec<u8> =
101 self.dfa.byte_classes().representatives().collect();
102 let mut sparse = self.new_sparse_set();
103 let mut uncompiled = vec![self.add_start(&mut sparse)?];
104 while let Some(dfa_id) = uncompiled.pop() {
105 for &b in &representative_bytes {
106 let (next_dfa_id, is_new) =
107 self.cached_state(dfa_id, b, &mut sparse)?;
108 self.dfa.add_transition(dfa_id, b, next_dfa_id);
109 if is_new {
110 uncompiled.push(next_dfa_id);
111 }
112 }
113 }
114
115 // At this point, we shuffle the matching states in the final DFA to
116 // the beginning. This permits a DFA's match loop to detect a match
117 // condition by merely inspecting the current state's identifier, and
118 // avoids the need for any additional auxiliary storage.
119 let is_match: Vec<bool> =
120 self.builder_states.iter().map(|s| s.is_match).collect();
121 self.dfa.shuffle_match_states(&is_match);
122 Ok(self.dfa)
123 }
124
125 /// Return the identifier for the next DFA state given an existing DFA
126 /// state and an input byte. If the next DFA state already exists, then
127 /// return its identifier from the cache. Otherwise, build the state, cache
128 /// it and return its identifier.
129 ///
130 /// The given sparse set is used for scratch space. It must have a capacity
131 /// equivalent to the total number of NFA states, but its contents are
132 /// otherwise unspecified.
133 ///
134 /// This routine returns a boolean indicating whether a new state was
135 /// built. If a new state is built, then the caller needs to add it to its
136 /// frontier of uncompiled DFA states to compute transitions for.
137 fn cached_state(
138 &mut self,
139 dfa_id: S,
140 b: u8,
141 sparse: &mut SparseSet,
142 ) -> Result<(S, bool)> {
143 sparse.clear();
144 // Compute the set of all reachable NFA states, including epsilons.
145 self.next(dfa_id, b, sparse);
146 // Build a candidate state and check if it has already been built.
147 let state = self.new_state(sparse);
148 if let Some(&cached_id) = self.cache.get(&state) {
149 // Since we have a cached state, put the constructed state's
150 // memory back into our scratch space, so that it can be reused.
151 mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
152 return Ok((cached_id, false));
153 }
154 // Nothing was in the cache, so add this state to the cache.
155 self.add_state(state).map(|s| (s, true))
156 }
157
158 /// Compute the set of all eachable NFA states, including the full epsilon
159 /// closure, from a DFA state for a single byte of input.
160 fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
161 next_nfa_states.clear();
162 for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
163 let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
164 match *self.nfa.state(nfa_id) {
165 nfa::State::Union { .. }
166 | nfa::State::Fail
167 | nfa::State::Match => {}
168 nfa::State::Range { range: ref r } => {
169 if r.start <= b && b <= r.end {
170 self.epsilon_closure(r.next, next_nfa_states);
171 }
172 }
173 nfa::State::Sparse { ref ranges } => {
174 for r in ranges.iter() {
175 if r.start > b {
176 break;
177 } else if r.start <= b && b <= r.end {
178 self.epsilon_closure(r.next, next_nfa_states);
179 break;
180 }
181 }
182 }
183 }
184 }
185 }
186
187 /// Compute the epsilon closure for the given NFA state.
188 fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
189 if !self.nfa.state(start).is_epsilon() {
190 set.insert(start);
191 return;
192 }
193
194 self.stack.push(start);
195 while let Some(mut id) = self.stack.pop() {
196 loop {
197 if set.contains(id) {
198 break;
199 }
200 set.insert(id);
201 match *self.nfa.state(id) {
202 nfa::State::Range { .. }
203 | nfa::State::Sparse { .. }
204 | nfa::State::Fail
205 | nfa::State::Match => break,
206 nfa::State::Union { ref alternates } => {
207 id = match alternates.get(0) {
208 None => break,
209 Some(&id) => id,
210 };
211 self.stack.extend(alternates[1..].iter().rev());
212 }
213 }
214 }
215 }
216 }
217
218 /// Compute the initial DFA state and return its identifier.
219 ///
220 /// The sparse set given is used for scratch space, and must have capacity
221 /// equal to the total number of NFA states. Its contents are unspecified.
222 fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
223 sparse.clear();
224 self.epsilon_closure(self.nfa.start(), sparse);
225 let state = self.new_state(&sparse);
226 let id = self.add_state(state)?;
227 self.dfa.set_start_state(id);
228 Ok(id)
229 }
230
231 /// Add the given state to the DFA and make it available in the cache.
232 ///
233 /// The state initially has no transitions. That is, it transitions to the
234 /// dead state for all possible inputs.
235 fn add_state(&mut self, state: State) -> Result<S> {
236 let id = self.dfa.add_empty_state()?;
237 let rstate = Rc::new(state);
238 self.builder_states.push(rstate.clone());
239 self.cache.insert(rstate, id);
240 Ok(id)
241 }
242
243 /// Convert the given set of ordered NFA states to a DFA state.
244 fn new_state(&mut self, set: &SparseSet) -> State {
245 let mut state = State {
246 is_match: false,
247 nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
248 };
249 state.nfa_states.clear();
250
251 for &id in set {
252 match *self.nfa.state(id) {
253 nfa::State::Range { .. } => {
254 state.nfa_states.push(id);
255 }
256 nfa::State::Sparse { .. } => {
257 state.nfa_states.push(id);
258 }
259 nfa::State::Fail => {
260 break;
261 }
262 nfa::State::Match => {
263 state.is_match = true;
264 if !self.longest_match {
265 break;
266 }
267 }
268 nfa::State::Union { .. } => {}
269 }
270 }
271 state
272 }
273
274 /// Create a new sparse set with enough capacity to hold all NFA states.
275 fn new_sparse_set(&self) -> SparseSet {
276 SparseSet::new(self.nfa.len())
277 }
278}
279
280impl State {
281 /// Create a new empty dead state.
282 fn dead() -> State {
283 State { nfa_states: vec![], is_match: false }
284 }
285}