// This is the backtracking matching engine. It has the same exact capability
// as the full NFA simulation, except it is artificially restricted to small
// regexes on small inputs because of its memory requirements.
//
// In particular, this is a *bounded* backtracking engine. It retains worst
// case linear time by keeping track of the states that it has visited (using a
// bitmap). Namely, once a state is visited, it is never visited again. Since a
// state is keyed by `(instruction index, input index)`, we have that its time
// complexity is `O(mn)` (i.e., linear in the size of the search text).
//
// The backtracking engine can beat out the NFA simulation on small
// regexes/inputs because it doesn't have to keep track of multiple copies of
// the capture groups. In benchmarks, the backtracking engine is roughly twice
// as fast as the full NFA simulation. Note though that its performance doesn't
// scale, even if you're willing to live with the memory requirements. Namely,
// the bitset has to be zeroed on each execution, which becomes quite expensive
// on large bitsets.

use crate::exec::ProgramCache;
use crate::input::{Input, InputAt};
use crate::prog::{InstPtr, Program};
use crate::re_trait::Slot;

type Bits = u32;

const BIT_SIZE: usize = 32;
const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB

/// Returns true iff the given regex and input should be executed by this
/// engine with reasonable memory usage.
pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
    // Total memory usage in bytes is determined by:
    //
    //   ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
    //
    // The actual limit picked is pretty much a heuristic.
    // See: https://github.com/rust-lang/regex/issues/215
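    //
    // For example (illustrative numbers, not taken from the source): a
    // program with 100 instructions searching a 1,000 byte input needs
    // ((100 * 1001 + 31) / 32) * 4 = 12,516 bytes of visited bits, which is
    // well under the 256 KB cap.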
    let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4;
    size <= MAX_SIZE_BYTES
}

/// A backtracking matching engine.
#[derive(Debug)]
pub struct Bounded<'a, 'm, 'r, 's, I> {
    prog: &'r Program,
    input: I,
    matches: &'m mut [bool],
    slots: &'s mut [Slot],
    m: &'a mut Cache,
}

/// Shared cached state between multiple invocations of a backtracking engine
/// in the same thread.
#[derive(Clone, Debug)]
pub struct Cache {
    jobs: Vec<Job>,
    visited: Vec<Bits>,
}

impl Cache {
    /// Create new empty cache for the backtracking engine.
    pub fn new(_prog: &Program) -> Self {
        Cache { jobs: vec![], visited: vec![] }
    }
}

/// A job is an explicit unit of stack space in the backtracking engine.
///
/// The "normal" representation is a single state transition, which corresponds
/// to an NFA state and a character in the input. However, the backtracking
/// engine must keep track of old capture group values. We use the explicit
/// stack to do it.
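///
/// For example (an illustrative scenario, not taken from the source): when a
/// `Save` instruction for slot 1 runs at input position 3 while slot 1
/// currently holds `Some(0)`, the engine pushes
/// `Job::SaveRestore { slot: 1, old_pos: Some(0) }` before overwriting the
/// slot with `Some(3)`. If the branch that follows fails, popping that job
/// restores the old value.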
#[derive(Clone, Copy, Debug)]
enum Job {
    Inst { ip: InstPtr, at: InputAt },
    SaveRestore { slot: usize, old_pos: Option<usize> },
}

impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
    /// Execute the backtracking matching engine.
    ///
    /// If there's a match, `exec` returns `true` and populates the given
    /// captures accordingly.
    pub fn exec(
        prog: &'r Program,
        cache: &ProgramCache,
        matches: &'m mut [bool],
        slots: &'s mut [Slot],
        input: I,
        start: usize,
        end: usize,
    ) -> bool {
        let mut cache = cache.borrow_mut();
        let cache = &mut cache.backtrack;
        let start = input.at(start);
        let mut b = Bounded {
            prog: prog,
            input: input,
            matches: matches,
            slots: slots,
            m: cache,
        };
        b.exec_(start, end)
    }

    /// Clears the cache such that the backtracking engine can be executed
    /// on some input of fixed length.
    fn clear(&mut self) {
        // Reset the job memory so that we start fresh.
        self.m.jobs.clear();

        // Now we need to clear the bit state set.
        // We do this by figuring out how much space we need to keep track
        // of the states we've visited.
        // Then we reset all existing allocated space to 0.
        // Finally, we request more space if we need it.
        //
        // This is all a little circuitous, but doing this using unchecked
        // operations doesn't seem to have a measurable impact on performance.
        // (Probably because backtracking is limited to such small
        // inputs/regexes in the first place.)
        let visited_len =
            (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
                / BIT_SIZE;
        self.m.visited.truncate(visited_len);
        for v in &mut self.m.visited {
            *v = 0;
        }
        if visited_len > self.m.visited.len() {
            let len = self.m.visited.len();
            self.m.visited.reserve_exact(visited_len - len);
            for _ in 0..(visited_len - len) {
                self.m.visited.push(0);
            }
        }
    }

    /// Start backtracking at the given position in the input, but also look
    /// for literal prefixes.
    fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
        self.clear();
        // If this is an anchored regex at the beginning of the input, then
        // we're either already done or we only need to try backtracking once.
        if self.prog.is_anchored_start {
            return if !at.is_start() { false } else { self.backtrack(at) };
        }
        let mut matched = false;
        loop {
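            // If the program has literal prefixes, skip ahead to the next
            // position where a prefix could begin a match; if no such
            // position remains, there is nothing left to try.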
            if !self.prog.prefixes.is_empty() {
                at = match self.input.prefix_at(&self.prog.prefixes, at) {
                    None => break,
                    Some(at) => at,
                };
            }
            matched = self.backtrack(at) || matched;
            if matched && self.prog.matches.len() == 1 {
                return true;
            }
            if at.pos() >= end {
                break;
            }
            at = self.input.at(at.next_pos());
        }
        matched
    }

    /// The main backtracking loop starting at the given input position.
    fn backtrack(&mut self, start: InputAt) -> bool {
        // N.B. We use an explicit stack to avoid recursion.
        // To avoid excessive pushing and popping, most transitions are handled
        // in the `step` helper function, which only pushes to the stack when
        // there's a capture or a branch.
        let mut matched = false;
        self.m.jobs.push(Job::Inst { ip: 0, at: start });
        while let Some(job) = self.m.jobs.pop() {
            match job {
                Job::Inst { ip, at } => {
                    if self.step(ip, at) {
                        // Only quit if we're matching one regex.
                        // If we're matching a regex set, then mush on and
                        // try to find other matches (if we want them).
                        if self.prog.matches.len() == 1 {
                            return true;
                        }
                        matched = true;
                    }
                }
                Job::SaveRestore { slot, old_pos } => {
                    if slot < self.slots.len() {
                        self.slots[slot] = old_pos;
                    }
                }
            }
        }
        matched
    }

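    /// Runs the instruction at `ip` against the input position `at`,
    /// following single-successor transitions in place and pushing any
    /// alternatives (the second branch of a `Split`, or a capture-slot
    /// restore job) onto the job stack. Returns true as soon as a `Match`
    /// instruction is reached.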
    fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
        use crate::prog::Inst::*;
        loop {
            // This loop is an optimization to avoid constantly pushing/popping
            // from the stack. Namely, if we're pushing a job only to run it
            // next, avoid the push and just mutate `ip` (and possibly `at`)
            // in place.
            if self.has_visited(ip, at) {
                return false;
            }
            match self.prog[ip] {
                Match(slot) => {
                    if slot < self.matches.len() {
                        self.matches[slot] = true;
                    }
                    return true;
                }
                Save(ref inst) => {
                    if let Some(&old_pos) = self.slots.get(inst.slot) {
                        // If this path doesn't work out, then we save the old
                        // capture index (if one exists) in an alternate
                        // job. If the next path fails, then the alternate
                        // job is popped and the old capture index is restored.
                        self.m.jobs.push(Job::SaveRestore {
                            slot: inst.slot,
                            old_pos: old_pos,
                        });
                        self.slots[inst.slot] = Some(at.pos());
                    }
                    ip = inst.goto;
                }
                Split(ref inst) => {
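                    // Queue the second branch for later and follow the first
                    // branch immediately by mutating `ip` in place.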
                    self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
                    ip = inst.goto1;
                }
                EmptyLook(ref inst) => {
                    if self.input.is_empty_match(at, inst) {
                        ip = inst.goto;
                    } else {
                        return false;
                    }
                }
                Char(ref inst) => {
                    if inst.c == at.char() {
                        ip = inst.goto;
                        at = self.input.at(at.next_pos());
                    } else {
                        return false;
                    }
                }
                Ranges(ref inst) => {
                    if inst.matches(at.char()) {
                        ip = inst.goto;
                        at = self.input.at(at.next_pos());
                    } else {
                        return false;
                    }
                }
                Bytes(ref inst) => {
                    if let Some(b) = at.byte() {
                        if inst.matches(b) {
                            ip = inst.goto;
                            at = self.input.at(at.next_pos());
                            continue;
                        }
                    }
                    return false;
                }
            }
        }
    }

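    /// Returns true if the state `(ip, at)` has been visited before, and
    /// marks it as visited otherwise. Each state maps to a single bit in the
    /// `visited` bitset at index `ip * (input.len() + 1) + at.pos()`; the
    /// `+ 1` accounts for the position just past the end of the input.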
    fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
        let k = ip * (self.input.len() + 1) + at.pos();
        let k1 = k / BIT_SIZE;
        let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
        if self.m.visited[k1] & k2 == 0 {
            self.m.visited[k1] |= k2;
            false
        } else {
            true
        }
    }
}

fn usize_to_u32(n: usize) -> u32 {
    if (n as u64) > (::std::u32::MAX as u64) {
        panic!("BUG: {} is too big to fit into u32", n)
    }
    n as u32
}