Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame^] | 1 | // This module implements the Pike VM. That is, it guarantees linear time |
| 2 | // search of a regex on any text with memory use proportional to the size of |
| 3 | // the regex. |
| 4 | // |
| 5 | // It is equal in power to the backtracking engine in this crate, except the |
| 6 | // backtracking engine is typically faster on small regexes/texts at the |
| 7 | // expense of a bigger memory footprint. |
| 8 | // |
| 9 | // It can do more than the DFA can (specifically, record capture locations |
| 10 | // and execute Unicode word boundary assertions), but at a slower speed. |
| 11 | // Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding |
| 12 | // epsilon transitions. That is, the Pike VM engine can be in multiple states |
| 13 | // at once where as the DFA is only ever in one state at a time. |
| 14 | // |
| 15 | // Therefore, the Pike VM is generally treated as the fallback when the other |
| 16 | // matching engines either aren't feasible to run or are insufficient. |
| 17 | |
| 18 | use std::mem; |
| 19 | |
| 20 | use exec::ProgramCache; |
| 21 | use input::{Input, InputAt}; |
| 22 | use prog::{InstPtr, Program}; |
| 23 | use re_trait::Slot; |
| 24 | use sparse::SparseSet; |
| 25 | |
| 26 | /// An NFA simulation matching engine. |
| 27 | #[derive(Debug)] |
| 28 | pub struct Fsm<'r, I> { |
| 29 | /// The sequence of opcodes (among other things) that is actually executed. |
| 30 | /// |
| 31 | /// The program may be byte oriented or Unicode codepoint oriented. |
| 32 | prog: &'r Program, |
| 33 | /// An explicit stack used for following epsilon transitions. (This is |
| 34 | /// borrowed from the cache.) |
| 35 | stack: &'r mut Vec<FollowEpsilon>, |
| 36 | /// The input to search. |
| 37 | input: I, |
| 38 | } |
| 39 | |
| 40 | /// A cached allocation that can be reused on each execution. |
| 41 | #[derive(Clone, Debug)] |
| 42 | pub struct Cache { |
| 43 | /// A pair of ordered sets for tracking NFA states. |
| 44 | clist: Threads, |
| 45 | nlist: Threads, |
| 46 | /// An explicit stack used for following epsilon transitions. |
| 47 | stack: Vec<FollowEpsilon>, |
| 48 | } |
| 49 | |
| 50 | /// An ordered set of NFA states and their captures. |
| 51 | #[derive(Clone, Debug)] |
| 52 | struct Threads { |
| 53 | /// An ordered set of opcodes (each opcode is an NFA state). |
| 54 | set: SparseSet, |
| 55 | /// Captures for every NFA state. |
| 56 | /// |
| 57 | /// It is stored in row-major order, where the columns are the capture |
| 58 | /// slots and the rows are the states. |
| 59 | caps: Vec<Slot>, |
| 60 | /// The number of capture slots stored per thread. (Every capture has |
| 61 | /// two slots.) |
| 62 | slots_per_thread: usize, |
| 63 | } |
| 64 | |
| 65 | /// A representation of an explicit stack frame when following epsilon |
| 66 | /// transitions. This is used to avoid recursion. |
| 67 | #[derive(Clone, Debug)] |
| 68 | enum FollowEpsilon { |
| 69 | /// Follow transitions at the given instruction pointer. |
| 70 | IP(InstPtr), |
| 71 | /// Restore the capture slot with the given position in the input. |
| 72 | Capture { slot: usize, pos: Slot }, |
| 73 | } |
| 74 | |
| 75 | impl Cache { |
| 76 | /// Create a new allocation used by the NFA machine to record execution |
| 77 | /// and captures. |
| 78 | pub fn new(_prog: &Program) -> Self { |
| 79 | Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] } |
| 80 | } |
| 81 | } |
| 82 | |
| 83 | impl<'r, I: Input> Fsm<'r, I> { |
| 84 | /// Execute the NFA matching engine. |
| 85 | /// |
| 86 | /// If there's a match, `exec` returns `true` and populates the given |
| 87 | /// captures accordingly. |
| 88 | pub fn exec( |
| 89 | prog: &'r Program, |
| 90 | cache: &ProgramCache, |
| 91 | matches: &mut [bool], |
| 92 | slots: &mut [Slot], |
| 93 | quit_after_match: bool, |
| 94 | input: I, |
| 95 | start: usize, |
| 96 | end: usize, |
| 97 | ) -> bool { |
| 98 | let mut cache = cache.borrow_mut(); |
| 99 | let cache = &mut cache.pikevm; |
| 100 | cache.clist.resize(prog.len(), prog.captures.len()); |
| 101 | cache.nlist.resize(prog.len(), prog.captures.len()); |
| 102 | let at = input.at(start); |
| 103 | Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_( |
| 104 | &mut cache.clist, |
| 105 | &mut cache.nlist, |
| 106 | matches, |
| 107 | slots, |
| 108 | quit_after_match, |
| 109 | at, |
| 110 | end, |
| 111 | ) |
| 112 | } |
| 113 | |
| 114 | fn exec_( |
| 115 | &mut self, |
| 116 | mut clist: &mut Threads, |
| 117 | mut nlist: &mut Threads, |
| 118 | matches: &mut [bool], |
| 119 | slots: &mut [Slot], |
| 120 | quit_after_match: bool, |
| 121 | mut at: InputAt, |
| 122 | end: usize, |
| 123 | ) -> bool { |
| 124 | let mut matched = false; |
| 125 | let mut all_matched = false; |
| 126 | clist.set.clear(); |
| 127 | nlist.set.clear(); |
| 128 | 'LOOP: loop { |
| 129 | if clist.set.is_empty() { |
| 130 | // Three ways to bail out when our current set of threads is |
| 131 | // empty. |
| 132 | // |
| 133 | // 1. We have a match---so we're done exploring any possible |
| 134 | // alternatives. Time to quit. (We can't do this if we're |
| 135 | // looking for matches for multiple regexes, unless we know |
| 136 | // they all matched.) |
| 137 | // |
| 138 | // 2. If the expression starts with a '^' we can terminate as |
| 139 | // soon as the last thread dies. |
| 140 | if (matched && matches.len() <= 1) |
| 141 | || all_matched |
| 142 | || (!at.is_start() && self.prog.is_anchored_start) |
| 143 | { |
| 144 | break; |
| 145 | } |
| 146 | |
| 147 | // 3. If there's a literal prefix for the program, try to |
| 148 | // jump ahead quickly. If it can't be found, then we can |
| 149 | // bail out early. |
| 150 | if !self.prog.prefixes.is_empty() { |
| 151 | at = match self.input.prefix_at(&self.prog.prefixes, at) { |
| 152 | None => break, |
| 153 | Some(at) => at, |
| 154 | }; |
| 155 | } |
| 156 | } |
| 157 | |
| 158 | // This simulates a preceding '.*?' for every regex by adding |
| 159 | // a state starting at the current position in the input for the |
| 160 | // beginning of the program only if we don't already have a match. |
| 161 | if clist.set.is_empty() |
| 162 | || (!self.prog.is_anchored_start && !all_matched) |
| 163 | { |
| 164 | self.add(&mut clist, slots, 0, at); |
| 165 | } |
| 166 | // The previous call to "add" actually inspects the position just |
| 167 | // before the current character. For stepping through the machine, |
| 168 | // we can to look at the current character, so we advance the |
| 169 | // input. |
| 170 | let at_next = self.input.at(at.next_pos()); |
| 171 | for i in 0..clist.set.len() { |
| 172 | let ip = clist.set[i]; |
| 173 | if self.step( |
| 174 | &mut nlist, |
| 175 | matches, |
| 176 | slots, |
| 177 | clist.caps(ip), |
| 178 | ip, |
| 179 | at, |
| 180 | at_next, |
| 181 | ) { |
| 182 | matched = true; |
| 183 | all_matched = all_matched || matches.iter().all(|&b| b); |
| 184 | if quit_after_match { |
| 185 | // If we only care if a match occurs (not its |
| 186 | // position), then we can quit right now. |
| 187 | break 'LOOP; |
| 188 | } |
| 189 | if self.prog.matches.len() == 1 { |
| 190 | // We don't need to check the rest of the threads |
| 191 | // in this set because we've matched something |
| 192 | // ("leftmost-first"). However, we still need to check |
| 193 | // threads in the next set to support things like |
| 194 | // greedy matching. |
| 195 | // |
| 196 | // This is only true on normal regexes. For regex sets, |
| 197 | // we need to mush on to observe other matches. |
| 198 | break; |
| 199 | } |
| 200 | } |
| 201 | } |
| 202 | if at.pos() >= end { |
| 203 | break; |
| 204 | } |
| 205 | at = at_next; |
| 206 | mem::swap(clist, nlist); |
| 207 | nlist.set.clear(); |
| 208 | } |
| 209 | matched |
| 210 | } |
| 211 | |
| 212 | /// Step through the input, one token (byte or codepoint) at a time. |
| 213 | /// |
| 214 | /// nlist is the set of states that will be processed on the next token |
| 215 | /// in the input. |
| 216 | /// |
| 217 | /// caps is the set of captures passed by the caller of the NFA. They are |
| 218 | /// written to only when a match state is visited. |
| 219 | /// |
| 220 | /// thread_caps is the set of captures set for the current NFA state, ip. |
| 221 | /// |
| 222 | /// at and at_next are the current and next positions in the input. at or |
| 223 | /// at_next may be EOF. |
| 224 | fn step( |
| 225 | &mut self, |
| 226 | nlist: &mut Threads, |
| 227 | matches: &mut [bool], |
| 228 | slots: &mut [Slot], |
| 229 | thread_caps: &mut [Option<usize>], |
| 230 | ip: usize, |
| 231 | at: InputAt, |
| 232 | at_next: InputAt, |
| 233 | ) -> bool { |
| 234 | use prog::Inst::*; |
| 235 | match self.prog[ip] { |
| 236 | Match(match_slot) => { |
| 237 | if match_slot < matches.len() { |
| 238 | matches[match_slot] = true; |
| 239 | } |
| 240 | for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { |
| 241 | *slot = *val; |
| 242 | } |
| 243 | true |
| 244 | } |
| 245 | Char(ref inst) => { |
| 246 | if inst.c == at.char() { |
| 247 | self.add(nlist, thread_caps, inst.goto, at_next); |
| 248 | } |
| 249 | false |
| 250 | } |
| 251 | Ranges(ref inst) => { |
| 252 | if inst.matches(at.char()) { |
| 253 | self.add(nlist, thread_caps, inst.goto, at_next); |
| 254 | } |
| 255 | false |
| 256 | } |
| 257 | Bytes(ref inst) => { |
| 258 | if let Some(b) = at.byte() { |
| 259 | if inst.matches(b) { |
| 260 | self.add(nlist, thread_caps, inst.goto, at_next); |
| 261 | } |
| 262 | } |
| 263 | false |
| 264 | } |
| 265 | EmptyLook(_) | Save(_) | Split(_) => false, |
| 266 | } |
| 267 | } |
| 268 | |
| 269 | /// Follows epsilon transitions and adds them for processing to nlist, |
| 270 | /// starting at and including ip. |
| 271 | fn add( |
| 272 | &mut self, |
| 273 | nlist: &mut Threads, |
| 274 | thread_caps: &mut [Option<usize>], |
| 275 | ip: usize, |
| 276 | at: InputAt, |
| 277 | ) { |
| 278 | self.stack.push(FollowEpsilon::IP(ip)); |
| 279 | while let Some(frame) = self.stack.pop() { |
| 280 | match frame { |
| 281 | FollowEpsilon::IP(ip) => { |
| 282 | self.add_step(nlist, thread_caps, ip, at); |
| 283 | } |
| 284 | FollowEpsilon::Capture { slot, pos } => { |
| 285 | thread_caps[slot] = pos; |
| 286 | } |
| 287 | } |
| 288 | } |
| 289 | } |
| 290 | |
| 291 | /// A helper function for add that avoids excessive pushing to the stack. |
| 292 | fn add_step( |
| 293 | &mut self, |
| 294 | nlist: &mut Threads, |
| 295 | thread_caps: &mut [Option<usize>], |
| 296 | mut ip: usize, |
| 297 | at: InputAt, |
| 298 | ) { |
| 299 | // Instead of pushing and popping to the stack, we mutate ip as we |
| 300 | // traverse the set of states. We only push to the stack when we |
| 301 | // absolutely need recursion (restoring captures or following a |
| 302 | // branch). |
| 303 | use prog::Inst::*; |
| 304 | loop { |
| 305 | // Don't visit states we've already added. |
| 306 | if nlist.set.contains(ip) { |
| 307 | return; |
| 308 | } |
| 309 | nlist.set.insert(ip); |
| 310 | match self.prog[ip] { |
| 311 | EmptyLook(ref inst) => { |
| 312 | if self.input.is_empty_match(at, inst) { |
| 313 | ip = inst.goto; |
| 314 | } |
| 315 | } |
| 316 | Save(ref inst) => { |
| 317 | if inst.slot < thread_caps.len() { |
| 318 | self.stack.push(FollowEpsilon::Capture { |
| 319 | slot: inst.slot, |
| 320 | pos: thread_caps[inst.slot], |
| 321 | }); |
| 322 | thread_caps[inst.slot] = Some(at.pos()); |
| 323 | } |
| 324 | ip = inst.goto; |
| 325 | } |
| 326 | Split(ref inst) => { |
| 327 | self.stack.push(FollowEpsilon::IP(inst.goto2)); |
| 328 | ip = inst.goto1; |
| 329 | } |
| 330 | Match(_) | Char(_) | Ranges(_) | Bytes(_) => { |
| 331 | let t = &mut nlist.caps(ip); |
| 332 | for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { |
| 333 | *slot = *val; |
| 334 | } |
| 335 | return; |
| 336 | } |
| 337 | } |
| 338 | } |
| 339 | } |
| 340 | } |
| 341 | |
| 342 | impl Threads { |
| 343 | fn new() -> Self { |
| 344 | Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 } |
| 345 | } |
| 346 | |
| 347 | fn resize(&mut self, num_insts: usize, ncaps: usize) { |
| 348 | if num_insts == self.set.capacity() { |
| 349 | return; |
| 350 | } |
| 351 | self.slots_per_thread = ncaps * 2; |
| 352 | self.set = SparseSet::new(num_insts); |
| 353 | self.caps = vec![None; self.slots_per_thread * num_insts]; |
| 354 | } |
| 355 | |
| 356 | fn caps(&mut self, pc: usize) -> &mut [Option<usize>] { |
| 357 | let i = pc * self.slots_per_thread; |
| 358 | &mut self.caps[i..i + self.slots_per_thread] |
| 359 | } |
| 360 | } |