Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 1 | use std::collections::HashMap; |
| 2 | use std::iter; |
| 3 | use std::result; |
| 4 | use std::sync::Arc; |
| 5 | |
| 6 | use syntax::hir::{self, Hir}; |
| 7 | use syntax::is_word_byte; |
| 8 | use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; |
| 9 | |
| 10 | use prog::{ |
| 11 | EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges, |
| 12 | InstSave, InstSplit, Program, |
| 13 | }; |
| 14 | |
| 15 | use Error; |
| 16 | |
/// Outcome of a compilation step that always yields a `Patch` on success.
type Result = result::Result<Patch, Error>;
/// Outcome of a compilation step that may emit no instructions at all; in
/// that case it succeeds with `None` instead of a `Patch`.
type ResultOrEmpty = result::Result<Option<Patch>, Error>;
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 19 | |
/// Metadata about a run of freshly emitted instructions: where the run
/// starts and which of its jumps the caller still has to backpatch.
#[derive(Debug)]
struct Patch {
    /// The not-yet-filled jump target(s) left behind by the run.
    hole: Hole,
    /// Index of the first instruction of the run (its entry point).
    entry: InstPtr,
}
| 25 | |
/// A compiler translates a regular expression AST to a sequence of
/// instructions. The sequence of instructions represents an NFA.
pub struct Compiler {
    /// Instructions emitted so far. They remain `MaybeInst`s until
    /// compilation finishes because some of them still need backpatching.
    insts: Vec<MaybeInst>,
    /// The program under construction; it also carries the option flags set
    /// through the builder methods and is what `compile` finally returns.
    compiled: Program,
    /// Maps the name of each named capture group to its capture index.
    capture_name_idx: HashMap<String, usize>,
    /// Number of expressions handed to `compile`.
    num_exprs: usize,
    /// Approximate upper bound, in bytes, on the size of the program.
    size_limit: usize,
    // NOTE(review): the uses of `suffix_cache` and `utf8_seqs` are not
    // visible next to this definition; presumably they support compiling
    // Unicode classes into UTF-8 byte sequences -- confirm.
    suffix_cache: SuffixCache,
    utf8_seqs: Option<Utf8Sequences>,
    /// Byte ranges recorded while compiling; turned into the program's byte
    /// classes when compilation finishes.
    byte_classes: ByteClassSet,
}
| 38 | |
| 39 | impl Compiler { |
| 40 | /// Create a new regular expression compiler. |
| 41 | /// |
| 42 | /// Various options can be set before calling `compile` on an expression. |
| 43 | pub fn new() -> Self { |
| 44 | Compiler { |
| 45 | insts: vec![], |
| 46 | compiled: Program::new(), |
| 47 | capture_name_idx: HashMap::new(), |
| 48 | num_exprs: 0, |
| 49 | size_limit: 10 * (1 << 20), |
| 50 | suffix_cache: SuffixCache::new(1000), |
| 51 | utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), |
| 52 | byte_classes: ByteClassSet::new(), |
| 53 | } |
| 54 | } |
| 55 | |
| 56 | /// The size of the resulting program is limited by size_limit. If |
| 57 | /// the program approximately exceeds the given size (in bytes), then |
| 58 | /// compilation will stop and return an error. |
| 59 | pub fn size_limit(mut self, size_limit: usize) -> Self { |
| 60 | self.size_limit = size_limit; |
| 61 | self |
| 62 | } |
| 63 | |
| 64 | /// If bytes is true, then the program is compiled as a byte based |
| 65 | /// automaton, which incorporates UTF-8 decoding into the machine. If it's |
| 66 | /// false, then the automaton is Unicode scalar value based, e.g., an |
| 67 | /// engine utilizing such an automaton is responsible for UTF-8 decoding. |
| 68 | /// |
| 69 | /// The specific invariant is that when returning a byte based machine, |
| 70 | /// the neither the `Char` nor `Ranges` instructions are produced. |
| 71 | /// Conversely, when producing a Unicode scalar value machine, the `Bytes` |
| 72 | /// instruction is never produced. |
| 73 | /// |
| 74 | /// Note that `dfa(true)` implies `bytes(true)`. |
| 75 | pub fn bytes(mut self, yes: bool) -> Self { |
| 76 | self.compiled.is_bytes = yes; |
| 77 | self |
| 78 | } |
| 79 | |
| 80 | /// When disabled, the program compiled may match arbitrary bytes. |
| 81 | /// |
| 82 | /// When enabled (the default), all compiled programs exclusively match |
| 83 | /// valid UTF-8 bytes. |
| 84 | pub fn only_utf8(mut self, yes: bool) -> Self { |
| 85 | self.compiled.only_utf8 = yes; |
| 86 | self |
| 87 | } |
| 88 | |
| 89 | /// When set, the machine returned is suitable for use in the DFA matching |
| 90 | /// engine. |
| 91 | /// |
| 92 | /// In particular, this ensures that if the regex is not anchored in the |
| 93 | /// beginning, then a preceding `.*?` is included in the program. (The NFA |
| 94 | /// based engines handle the preceding `.*?` explicitly, which is difficult |
| 95 | /// or impossible in the DFA engine.) |
| 96 | pub fn dfa(mut self, yes: bool) -> Self { |
| 97 | self.compiled.is_dfa = yes; |
| 98 | self |
| 99 | } |
| 100 | |
| 101 | /// When set, the machine returned is suitable for matching text in |
| 102 | /// reverse. In particular, all concatenations are flipped. |
| 103 | pub fn reverse(mut self, yes: bool) -> Self { |
| 104 | self.compiled.is_reverse = yes; |
| 105 | self |
| 106 | } |
| 107 | |
| 108 | /// Compile a regular expression given its AST. |
| 109 | /// |
| 110 | /// The compiler is guaranteed to succeed unless the program exceeds the |
| 111 | /// specified size limit. If the size limit is exceeded, then compilation |
| 112 | /// stops and returns an error. |
| 113 | pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> { |
| 114 | debug_assert!(!exprs.is_empty()); |
| 115 | self.num_exprs = exprs.len(); |
| 116 | if exprs.len() == 1 { |
| 117 | self.compile_one(&exprs[0]) |
| 118 | } else { |
| 119 | self.compile_many(exprs) |
| 120 | } |
| 121 | } |
| 122 | |
| 123 | fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> { |
| 124 | // If we're compiling a forward DFA and we aren't anchored, then |
| 125 | // add a `.*?` before the first capture group. |
| 126 | // Other matching engines handle this by baking the logic into the |
| 127 | // matching engine itself. |
| 128 | let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; |
| 129 | self.compiled.is_anchored_start = expr.is_anchored_start(); |
| 130 | self.compiled.is_anchored_end = expr.is_anchored_end(); |
| 131 | if self.compiled.needs_dotstar() { |
| 132 | dotstar_patch = self.c_dotstar()?; |
| 133 | self.compiled.start = dotstar_patch.entry; |
| 134 | } |
| 135 | self.compiled.captures = vec![None]; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 136 | let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst()); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 137 | if self.compiled.needs_dotstar() { |
| 138 | self.fill(dotstar_patch.hole, patch.entry); |
| 139 | } else { |
| 140 | self.compiled.start = patch.entry; |
| 141 | } |
| 142 | self.fill_to_next(patch.hole); |
| 143 | self.compiled.matches = vec![self.insts.len()]; |
| 144 | self.push_compiled(Inst::Match(0)); |
| 145 | self.compile_finish() |
| 146 | } |
| 147 | |
| 148 | fn compile_many( |
| 149 | mut self, |
| 150 | exprs: &[Hir], |
| 151 | ) -> result::Result<Program, Error> { |
| 152 | debug_assert!(exprs.len() > 1); |
| 153 | |
| 154 | self.compiled.is_anchored_start = |
| 155 | exprs.iter().all(|e| e.is_anchored_start()); |
| 156 | self.compiled.is_anchored_end = |
| 157 | exprs.iter().all(|e| e.is_anchored_end()); |
| 158 | let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; |
| 159 | if self.compiled.needs_dotstar() { |
| 160 | dotstar_patch = self.c_dotstar()?; |
| 161 | self.compiled.start = dotstar_patch.entry; |
| 162 | } else { |
| 163 | self.compiled.start = 0; // first instruction is always split |
| 164 | } |
| 165 | self.fill_to_next(dotstar_patch.hole); |
| 166 | |
| 167 | let mut prev_hole = Hole::None; |
| 168 | for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { |
| 169 | self.fill_to_next(prev_hole); |
| 170 | let split = self.push_split_hole(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 171 | let Patch { hole, entry } = |
| 172 | self.c_capture(0, expr)?.unwrap_or(self.next_inst()); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 173 | self.fill_to_next(hole); |
| 174 | self.compiled.matches.push(self.insts.len()); |
| 175 | self.push_compiled(Inst::Match(i)); |
| 176 | prev_hole = self.fill_split(split, Some(entry), None); |
| 177 | } |
| 178 | let i = exprs.len() - 1; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 179 | let Patch { hole, entry } = |
| 180 | self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst()); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 181 | self.fill(prev_hole, entry); |
| 182 | self.fill_to_next(hole); |
| 183 | self.compiled.matches.push(self.insts.len()); |
| 184 | self.push_compiled(Inst::Match(i)); |
| 185 | self.compile_finish() |
| 186 | } |
| 187 | |
| 188 | fn compile_finish(mut self) -> result::Result<Program, Error> { |
| 189 | self.compiled.insts = |
| 190 | self.insts.into_iter().map(|inst| inst.unwrap()).collect(); |
| 191 | self.compiled.byte_classes = self.byte_classes.byte_classes(); |
| 192 | self.compiled.capture_name_idx = Arc::new(self.capture_name_idx); |
| 193 | Ok(self.compiled) |
| 194 | } |
| 195 | |
| 196 | /// Compile expr into self.insts, returning a patch on success, |
| 197 | /// or an error if we run out of memory. |
| 198 | /// |
| 199 | /// All of the c_* methods of the compiler share the contract outlined |
| 200 | /// here. |
| 201 | /// |
| 202 | /// The main thing that a c_* method does is mutate `self.insts` |
| 203 | /// to add a list of mostly compiled instructions required to execute |
| 204 | /// the given expression. `self.insts` contains MaybeInsts rather than |
| 205 | /// Insts because there is some backpatching required. |
| 206 | /// |
| 207 | /// The `Patch` value returned by each c_* method provides metadata |
| 208 | /// about the compiled instructions emitted to `self.insts`. The |
| 209 | /// `entry` member of the patch refers to the first instruction |
| 210 | /// (the entry point), while the `hole` member contains zero or |
| 211 | /// more offsets to partial instructions that need to be backpatched. |
| 212 | /// The c_* routine can't know where its list of instructions are going to |
| 213 | /// jump to after execution, so it is up to the caller to patch |
| 214 | /// these jumps to point to the right place. So compiling some |
| 215 | /// expression, e, we would end up with a situation that looked like: |
| 216 | /// |
| 217 | /// ```text |
| 218 | /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...] |
| 219 | /// ^ ^ ^ |
| 220 | /// | \ / |
| 221 | /// entry \ / |
| 222 | /// hole |
| 223 | /// ``` |
| 224 | /// |
Chih-Hung Hsieh | 849e445 | 2020-10-26 13:16:47 -0700 | [diff] [blame] | 225 | /// To compile two expressions, e1 and e2, concatenated together we |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 226 | /// would do: |
| 227 | /// |
| 228 | /// ```ignore |
| 229 | /// let patch1 = self.c(e1); |
| 230 | /// let patch2 = self.c(e2); |
| 231 | /// ``` |
| 232 | /// |
| 233 | /// while leaves us with a situation that looks like |
| 234 | /// |
| 235 | /// ```text |
| 236 | /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ] |
| 237 | /// ^ ^ ^ ^ |
| 238 | /// | | | | |
| 239 | /// entry1 hole1 entry2 hole2 |
| 240 | /// ``` |
| 241 | /// |
| 242 | /// Then to merge the two patches together into one we would backpatch |
| 243 | /// hole1 with entry2 and return a new patch that enters at entry1 |
| 244 | /// and has hole2 for a hole. In fact, if you look at the c_concat |
| 245 | /// method you will see that it does exactly this, though it handles |
| 246 | /// a list of expressions rather than just the two that we use for |
| 247 | /// an example. |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 248 | /// |
| 249 | /// Ok(None) is returned when an expression is compiled to no |
| 250 | /// instruction, and so no patch.entry value makes sense. |
| 251 | fn c(&mut self, expr: &Hir) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 252 | use prog; |
| 253 | use syntax::hir::HirKind::*; |
| 254 | |
| 255 | self.check_size()?; |
| 256 | match *expr.kind() { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 257 | Empty => Ok(None), |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 258 | Literal(hir::Literal::Unicode(c)) => self.c_char(c), |
| 259 | Literal(hir::Literal::Byte(b)) => { |
| 260 | assert!(self.compiled.uses_bytes()); |
| 261 | self.c_byte(b) |
| 262 | } |
| 263 | Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), |
| 264 | Class(hir::Class::Bytes(ref cls)) => { |
| 265 | if self.compiled.uses_bytes() { |
| 266 | self.c_class_bytes(cls.ranges()) |
| 267 | } else { |
| 268 | assert!(cls.is_all_ascii()); |
| 269 | let mut char_ranges = vec![]; |
| 270 | for r in cls.iter() { |
| 271 | let (s, e) = (r.start() as char, r.end() as char); |
| 272 | char_ranges.push(hir::ClassUnicodeRange::new(s, e)); |
| 273 | } |
| 274 | self.c_class(&char_ranges) |
| 275 | } |
| 276 | } |
| 277 | Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { |
| 278 | self.byte_classes.set_range(b'\n', b'\n'); |
| 279 | self.c_empty_look(prog::EmptyLook::EndLine) |
| 280 | } |
| 281 | Anchor(hir::Anchor::StartLine) => { |
| 282 | self.byte_classes.set_range(b'\n', b'\n'); |
| 283 | self.c_empty_look(prog::EmptyLook::StartLine) |
| 284 | } |
| 285 | Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { |
| 286 | self.byte_classes.set_range(b'\n', b'\n'); |
| 287 | self.c_empty_look(prog::EmptyLook::StartLine) |
| 288 | } |
| 289 | Anchor(hir::Anchor::EndLine) => { |
| 290 | self.byte_classes.set_range(b'\n', b'\n'); |
| 291 | self.c_empty_look(prog::EmptyLook::EndLine) |
| 292 | } |
| 293 | Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { |
| 294 | self.c_empty_look(prog::EmptyLook::EndText) |
| 295 | } |
| 296 | Anchor(hir::Anchor::StartText) => { |
| 297 | self.c_empty_look(prog::EmptyLook::StartText) |
| 298 | } |
| 299 | Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { |
| 300 | self.c_empty_look(prog::EmptyLook::StartText) |
| 301 | } |
| 302 | Anchor(hir::Anchor::EndText) => { |
| 303 | self.c_empty_look(prog::EmptyLook::EndText) |
| 304 | } |
| 305 | WordBoundary(hir::WordBoundary::Unicode) => { |
| 306 | if !cfg!(feature = "unicode-perl") { |
| 307 | return Err(Error::Syntax( |
| 308 | "Unicode word boundaries are unavailable when \ |
| 309 | the unicode-perl feature is disabled" |
| 310 | .to_string(), |
| 311 | )); |
| 312 | } |
| 313 | self.compiled.has_unicode_word_boundary = true; |
| 314 | self.byte_classes.set_word_boundary(); |
| 315 | self.c_empty_look(prog::EmptyLook::WordBoundary) |
| 316 | } |
| 317 | WordBoundary(hir::WordBoundary::UnicodeNegate) => { |
| 318 | if !cfg!(feature = "unicode-perl") { |
| 319 | return Err(Error::Syntax( |
| 320 | "Unicode word boundaries are unavailable when \ |
| 321 | the unicode-perl feature is disabled" |
| 322 | .to_string(), |
| 323 | )); |
| 324 | } |
| 325 | self.compiled.has_unicode_word_boundary = true; |
| 326 | self.byte_classes.set_word_boundary(); |
| 327 | self.c_empty_look(prog::EmptyLook::NotWordBoundary) |
| 328 | } |
| 329 | WordBoundary(hir::WordBoundary::Ascii) => { |
| 330 | self.byte_classes.set_word_boundary(); |
| 331 | self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) |
| 332 | } |
| 333 | WordBoundary(hir::WordBoundary::AsciiNegate) => { |
| 334 | self.byte_classes.set_word_boundary(); |
| 335 | self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) |
| 336 | } |
| 337 | Group(ref g) => match g.kind { |
| 338 | hir::GroupKind::NonCapturing => self.c(&g.hir), |
| 339 | hir::GroupKind::CaptureIndex(index) => { |
| 340 | if index as usize >= self.compiled.captures.len() { |
| 341 | self.compiled.captures.push(None); |
| 342 | } |
| 343 | self.c_capture(2 * index as usize, &g.hir) |
| 344 | } |
| 345 | hir::GroupKind::CaptureName { index, ref name } => { |
| 346 | if index as usize >= self.compiled.captures.len() { |
| 347 | let n = name.to_string(); |
| 348 | self.compiled.captures.push(Some(n.clone())); |
| 349 | self.capture_name_idx.insert(n, index as usize); |
| 350 | } |
| 351 | self.c_capture(2 * index as usize, &g.hir) |
| 352 | } |
| 353 | }, |
| 354 | Concat(ref es) => { |
| 355 | if self.compiled.is_reverse { |
| 356 | self.c_concat(es.iter().rev()) |
| 357 | } else { |
| 358 | self.c_concat(es) |
| 359 | } |
| 360 | } |
| 361 | Alternation(ref es) => self.c_alternate(&**es), |
| 362 | Repetition(ref rep) => self.c_repeat(rep), |
| 363 | } |
| 364 | } |
| 365 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 366 | fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 367 | if self.num_exprs > 1 || self.compiled.is_dfa { |
| 368 | // Don't ever compile Save instructions for regex sets because |
| 369 | // they are never used. They are also never used in DFA programs |
| 370 | // because DFAs can't handle captures. |
| 371 | self.c(expr) |
| 372 | } else { |
| 373 | let entry = self.insts.len(); |
| 374 | let hole = self.push_hole(InstHole::Save { slot: first_slot }); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 375 | let patch = self.c(expr)?.unwrap_or(self.next_inst()); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 376 | self.fill(hole, patch.entry); |
| 377 | self.fill_to_next(patch.hole); |
| 378 | let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 379 | Ok(Some(Patch { hole: hole, entry: entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 380 | } |
| 381 | } |
| 382 | |
| 383 | fn c_dotstar(&mut self) -> Result { |
| 384 | Ok(if !self.compiled.only_utf8() { |
| 385 | self.c(&Hir::repetition(hir::Repetition { |
| 386 | kind: hir::RepetitionKind::ZeroOrMore, |
| 387 | greedy: false, |
| 388 | hir: Box::new(Hir::any(true)), |
| 389 | }))? |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 390 | .unwrap() |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 391 | } else { |
| 392 | self.c(&Hir::repetition(hir::Repetition { |
| 393 | kind: hir::RepetitionKind::ZeroOrMore, |
| 394 | greedy: false, |
| 395 | hir: Box::new(Hir::any(false)), |
| 396 | }))? |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 397 | .unwrap() |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 398 | }) |
| 399 | } |
| 400 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 401 | fn c_char(&mut self, c: char) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 402 | if self.compiled.uses_bytes() { |
| 403 | if c.is_ascii() { |
| 404 | let b = c as u8; |
| 405 | let hole = |
| 406 | self.push_hole(InstHole::Bytes { start: b, end: b }); |
| 407 | self.byte_classes.set_range(b, b); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 408 | Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 409 | } else { |
| 410 | self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) |
| 411 | } |
| 412 | } else { |
| 413 | let hole = self.push_hole(InstHole::Char { c: c }); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 414 | Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 415 | } |
| 416 | } |
| 417 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 418 | fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 419 | assert!(!ranges.is_empty()); |
| 420 | if self.compiled.uses_bytes() { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 421 | Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?)) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 422 | } else { |
| 423 | let ranges: Vec<(char, char)> = |
| 424 | ranges.iter().map(|r| (r.start(), r.end())).collect(); |
| 425 | let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 { |
| 426 | self.push_hole(InstHole::Char { c: ranges[0].0 }) |
| 427 | } else { |
| 428 | self.push_hole(InstHole::Ranges { ranges: ranges }) |
| 429 | }; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 430 | Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 431 | } |
| 432 | } |
| 433 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 434 | fn c_byte(&mut self, b: u8) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 435 | self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) |
| 436 | } |
| 437 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 438 | fn c_class_bytes( |
| 439 | &mut self, |
| 440 | ranges: &[hir::ClassBytesRange], |
| 441 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 442 | debug_assert!(!ranges.is_empty()); |
| 443 | |
| 444 | let first_split_entry = self.insts.len(); |
| 445 | let mut holes = vec![]; |
| 446 | let mut prev_hole = Hole::None; |
| 447 | for r in &ranges[0..ranges.len() - 1] { |
| 448 | self.fill_to_next(prev_hole); |
| 449 | let split = self.push_split_hole(); |
| 450 | let next = self.insts.len(); |
| 451 | self.byte_classes.set_range(r.start(), r.end()); |
| 452 | holes.push(self.push_hole(InstHole::Bytes { |
| 453 | start: r.start(), |
| 454 | end: r.end(), |
| 455 | })); |
| 456 | prev_hole = self.fill_split(split, Some(next), None); |
| 457 | } |
| 458 | let next = self.insts.len(); |
| 459 | let r = &ranges[ranges.len() - 1]; |
| 460 | self.byte_classes.set_range(r.start(), r.end()); |
| 461 | holes.push( |
| 462 | self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }), |
| 463 | ); |
| 464 | self.fill(prev_hole, next); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 465 | Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 466 | } |
| 467 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 468 | fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 469 | let hole = self.push_hole(InstHole::EmptyLook { look: look }); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 470 | Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 471 | } |
| 472 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 473 | fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 474 | where |
| 475 | I: IntoIterator<Item = &'a Hir>, |
| 476 | { |
| 477 | let mut exprs = exprs.into_iter(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 478 | let Patch { mut hole, entry } = loop { |
| 479 | match exprs.next() { |
| 480 | None => return Ok(None), |
| 481 | Some(e) => { |
| 482 | if let Some(p) = self.c(e)? { |
| 483 | break p; |
| 484 | } |
| 485 | } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 486 | } |
| 487 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 488 | for e in exprs { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 489 | if let Some(p) = self.c(e)? { |
| 490 | self.fill(hole, p.entry); |
| 491 | hole = p.hole; |
| 492 | } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 493 | } |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 494 | Ok(Some(Patch { hole: hole, entry: entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 495 | } |
| 496 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 497 | fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 498 | debug_assert!( |
| 499 | exprs.len() >= 2, |
| 500 | "alternates must have at least 2 exprs" |
| 501 | ); |
| 502 | |
| 503 | // Initial entry point is always the first split. |
| 504 | let first_split_entry = self.insts.len(); |
| 505 | |
| 506 | // Save up all of the holes from each alternate. They will all get |
| 507 | // patched to point to the same location. |
| 508 | let mut holes = vec![]; |
| 509 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 510 | // true indicates that the hole is a split where we want to fill |
| 511 | // the second branch. |
| 512 | let mut prev_hole = (Hole::None, false); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 513 | for e in &exprs[0..exprs.len() - 1] { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 514 | if prev_hole.1 { |
| 515 | let next = self.insts.len(); |
| 516 | self.fill_split(prev_hole.0, None, Some(next)); |
| 517 | } else { |
| 518 | self.fill_to_next(prev_hole.0); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 519 | } |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 520 | let split = self.push_split_hole(); |
| 521 | if let Some(Patch { hole, entry }) = self.c(e)? { |
| 522 | holes.push(hole); |
| 523 | prev_hole = (self.fill_split(split, Some(entry), None), false); |
| 524 | } else { |
| 525 | let (split1, split2) = split.dup_one(); |
| 526 | holes.push(split1); |
| 527 | prev_hole = (split2, true); |
| 528 | } |
| 529 | } |
| 530 | if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 531 | holes.push(hole); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 532 | if prev_hole.1 { |
| 533 | self.fill_split(prev_hole.0, None, Some(entry)); |
| 534 | } else { |
| 535 | self.fill(prev_hole.0, entry); |
| 536 | } |
| 537 | } else { |
| 538 | // We ignore prev_hole.1. When it's true, it means we have two |
| 539 | // empty branches both pushing prev_hole.0 into holes, so both |
| 540 | // branches will go to the same place anyway. |
| 541 | holes.push(prev_hole.0); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 542 | } |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 543 | Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 544 | } |
| 545 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 546 | fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 547 | use syntax::hir::RepetitionKind::*; |
| 548 | match rep.kind { |
| 549 | ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), |
| 550 | ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), |
| 551 | OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), |
| 552 | Range(hir::RepetitionRange::Exactly(min_max)) => { |
| 553 | self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) |
| 554 | } |
| 555 | Range(hir::RepetitionRange::AtLeast(min)) => { |
| 556 | self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) |
| 557 | } |
| 558 | Range(hir::RepetitionRange::Bounded(min, max)) => { |
| 559 | self.c_repeat_range(&rep.hir, rep.greedy, min, max) |
| 560 | } |
| 561 | } |
| 562 | } |
| 563 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 564 | fn c_repeat_zero_or_one( |
| 565 | &mut self, |
| 566 | expr: &Hir, |
| 567 | greedy: bool, |
| 568 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 569 | let split_entry = self.insts.len(); |
| 570 | let split = self.push_split_hole(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 571 | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { |
| 572 | Some(p) => p, |
| 573 | None => return self.pop_split_hole(), |
| 574 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 575 | let split_hole = if greedy { |
| 576 | self.fill_split(split, Some(entry_rep), None) |
| 577 | } else { |
| 578 | self.fill_split(split, None, Some(entry_rep)) |
| 579 | }; |
| 580 | let holes = vec![hole_rep, split_hole]; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 581 | Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 582 | } |
| 583 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 584 | fn c_repeat_zero_or_more( |
| 585 | &mut self, |
| 586 | expr: &Hir, |
| 587 | greedy: bool, |
| 588 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 589 | let split_entry = self.insts.len(); |
| 590 | let split = self.push_split_hole(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 591 | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { |
| 592 | Some(p) => p, |
| 593 | None => return self.pop_split_hole(), |
| 594 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 595 | |
| 596 | self.fill(hole_rep, split_entry); |
| 597 | let split_hole = if greedy { |
| 598 | self.fill_split(split, Some(entry_rep), None) |
| 599 | } else { |
| 600 | self.fill_split(split, None, Some(entry_rep)) |
| 601 | }; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 602 | Ok(Some(Patch { hole: split_hole, entry: split_entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 603 | } |
| 604 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 605 | fn c_repeat_one_or_more( |
| 606 | &mut self, |
| 607 | expr: &Hir, |
| 608 | greedy: bool, |
| 609 | ) -> ResultOrEmpty { |
| 610 | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { |
| 611 | Some(p) => p, |
| 612 | None => return Ok(None), |
| 613 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 614 | self.fill_to_next(hole_rep); |
| 615 | let split = self.push_split_hole(); |
| 616 | |
| 617 | let split_hole = if greedy { |
| 618 | self.fill_split(split, Some(entry_rep), None) |
| 619 | } else { |
| 620 | self.fill_split(split, None, Some(entry_rep)) |
| 621 | }; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 622 | Ok(Some(Patch { hole: split_hole, entry: entry_rep })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 623 | } |
| 624 | |
| 625 | fn c_repeat_range_min_or_more( |
| 626 | &mut self, |
| 627 | expr: &Hir, |
| 628 | greedy: bool, |
| 629 | min: u32, |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 630 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 631 | let min = u32_to_usize(min); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 632 | // Using next_inst() is ok, because we can't return it (concat would |
| 633 | // have to return Some(_) while c_repeat_range_min_or_more returns |
| 634 | // None). |
| 635 | let patch_concat = self |
| 636 | .c_concat(iter::repeat(expr).take(min))? |
| 637 | .unwrap_or(self.next_inst()); |
| 638 | if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { |
| 639 | self.fill(patch_concat.hole, patch_rep.entry); |
| 640 | Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) |
| 641 | } else { |
| 642 | Ok(None) |
| 643 | } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 644 | } |
| 645 | |
| 646 | fn c_repeat_range( |
| 647 | &mut self, |
| 648 | expr: &Hir, |
| 649 | greedy: bool, |
| 650 | min: u32, |
| 651 | max: u32, |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 652 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 653 | let (min, max) = (u32_to_usize(min), u32_to_usize(max)); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 654 | debug_assert!(min <= max); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 655 | let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 656 | if min == max { |
| 657 | return Ok(patch_concat); |
| 658 | } |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 659 | // Same reasoning as in c_repeat_range_min_or_more (we know that min < |
| 660 | // max at this point). |
| 661 | let patch_concat = patch_concat.unwrap_or(self.next_inst()); |
| 662 | let initial_entry = patch_concat.entry; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 663 | // It is much simpler to compile, e.g., `a{2,5}` as: |
| 664 | // |
| 665 | // aaa?a?a? |
| 666 | // |
| 667 | // But you end up with a sequence of instructions like this: |
| 668 | // |
| 669 | // 0: 'a' |
| 670 | // 1: 'a', |
| 671 | // 2: split(3, 4) |
| 672 | // 3: 'a' |
| 673 | // 4: split(5, 6) |
| 674 | // 5: 'a' |
| 675 | // 6: split(7, 8) |
| 676 | // 7: 'a' |
| 677 | // 8: MATCH |
| 678 | // |
| 679 | // This is *incredibly* inefficient because the splits end |
| 680 | // up forming a chain, which has to be resolved everything a |
| 681 | // transition is followed. |
| 682 | let mut holes = vec![]; |
| 683 | let mut prev_hole = patch_concat.hole; |
| 684 | for _ in min..max { |
| 685 | self.fill_to_next(prev_hole); |
| 686 | let split = self.push_split_hole(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 687 | let Patch { hole, entry } = match self.c(expr)? { |
| 688 | Some(p) => p, |
| 689 | None => return self.pop_split_hole(), |
| 690 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 691 | prev_hole = hole; |
| 692 | if greedy { |
| 693 | holes.push(self.fill_split(split, Some(entry), None)); |
| 694 | } else { |
| 695 | holes.push(self.fill_split(split, None, Some(entry))); |
| 696 | } |
| 697 | } |
| 698 | holes.push(prev_hole); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 699 | Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry })) |
| 700 | } |
| 701 | |
| 702 | /// Can be used as a default value for the c_* functions when the call to |
| 703 | /// c_function is followed by inserting at least one instruction that is |
| 704 | /// always executed after the ones written by the c* function. |
| 705 | fn next_inst(&self) -> Patch { |
| 706 | Patch { hole: Hole::None, entry: self.insts.len() } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 707 | } |
| 708 | |
| 709 | fn fill(&mut self, hole: Hole, goto: InstPtr) { |
| 710 | match hole { |
| 711 | Hole::None => {} |
| 712 | Hole::One(pc) => { |
| 713 | self.insts[pc].fill(goto); |
| 714 | } |
| 715 | Hole::Many(holes) => { |
| 716 | for hole in holes { |
| 717 | self.fill(hole, goto); |
| 718 | } |
| 719 | } |
| 720 | } |
| 721 | } |
| 722 | |
| 723 | fn fill_to_next(&mut self, hole: Hole) { |
| 724 | let next = self.insts.len(); |
| 725 | self.fill(hole, next); |
| 726 | } |
| 727 | |
| 728 | fn fill_split( |
| 729 | &mut self, |
| 730 | hole: Hole, |
| 731 | goto1: Option<InstPtr>, |
| 732 | goto2: Option<InstPtr>, |
| 733 | ) -> Hole { |
| 734 | match hole { |
| 735 | Hole::None => Hole::None, |
| 736 | Hole::One(pc) => match (goto1, goto2) { |
| 737 | (Some(goto1), Some(goto2)) => { |
| 738 | self.insts[pc].fill_split(goto1, goto2); |
| 739 | Hole::None |
| 740 | } |
| 741 | (Some(goto1), None) => { |
| 742 | self.insts[pc].half_fill_split_goto1(goto1); |
| 743 | Hole::One(pc) |
| 744 | } |
| 745 | (None, Some(goto2)) => { |
| 746 | self.insts[pc].half_fill_split_goto2(goto2); |
| 747 | Hole::One(pc) |
| 748 | } |
| 749 | (None, None) => unreachable!( |
| 750 | "at least one of the split \ |
| 751 | holes must be filled" |
| 752 | ), |
| 753 | }, |
| 754 | Hole::Many(holes) => { |
| 755 | let mut new_holes = vec![]; |
| 756 | for hole in holes { |
| 757 | new_holes.push(self.fill_split(hole, goto1, goto2)); |
| 758 | } |
| 759 | if new_holes.is_empty() { |
| 760 | Hole::None |
| 761 | } else if new_holes.len() == 1 { |
| 762 | new_holes.pop().unwrap() |
| 763 | } else { |
| 764 | Hole::Many(new_holes) |
| 765 | } |
| 766 | } |
| 767 | } |
| 768 | } |
| 769 | |
| 770 | fn push_compiled(&mut self, inst: Inst) { |
| 771 | self.insts.push(MaybeInst::Compiled(inst)); |
| 772 | } |
| 773 | |
| 774 | fn push_hole(&mut self, inst: InstHole) -> Hole { |
| 775 | let hole = self.insts.len(); |
| 776 | self.insts.push(MaybeInst::Uncompiled(inst)); |
| 777 | Hole::One(hole) |
| 778 | } |
| 779 | |
| 780 | fn push_split_hole(&mut self) -> Hole { |
| 781 | let hole = self.insts.len(); |
| 782 | self.insts.push(MaybeInst::Split); |
| 783 | Hole::One(hole) |
| 784 | } |
| 785 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 786 | fn pop_split_hole(&mut self) -> ResultOrEmpty { |
| 787 | self.insts.pop(); |
| 788 | Ok(None) |
| 789 | } |
| 790 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 791 | fn check_size(&self) -> result::Result<(), Error> { |
| 792 | use std::mem::size_of; |
| 793 | |
| 794 | if self.insts.len() * size_of::<Inst>() > self.size_limit { |
| 795 | Err(Error::CompiledTooBig(self.size_limit)) |
| 796 | } else { |
| 797 | Ok(()) |
| 798 | } |
| 799 | } |
| 800 | } |
| 801 | |
| 802 | #[derive(Debug)] |
| 803 | enum Hole { |
| 804 | None, |
| 805 | One(InstPtr), |
| 806 | Many(Vec<Hole>), |
| 807 | } |
| 808 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 809 | impl Hole { |
| 810 | fn dup_one(self) -> (Self, Self) { |
| 811 | match self { |
| 812 | Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), |
| 813 | Hole::None | Hole::Many(_) => { |
| 814 | unreachable!("must be called on single hole") |
| 815 | } |
| 816 | } |
| 817 | } |
| 818 | } |
| 819 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 820 | #[derive(Clone, Debug)] |
| 821 | enum MaybeInst { |
| 822 | Compiled(Inst), |
| 823 | Uncompiled(InstHole), |
| 824 | Split, |
| 825 | Split1(InstPtr), |
| 826 | Split2(InstPtr), |
| 827 | } |
| 828 | |
| 829 | impl MaybeInst { |
| 830 | fn fill(&mut self, goto: InstPtr) { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 831 | let maybeinst = match *self { |
| 832 | MaybeInst::Split => MaybeInst::Split1(goto), |
| 833 | MaybeInst::Uncompiled(ref inst) => { |
| 834 | MaybeInst::Compiled(inst.fill(goto)) |
| 835 | } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 836 | MaybeInst::Split1(goto1) => { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 837 | MaybeInst::Compiled(Inst::Split(InstSplit { |
| 838 | goto1: goto1, |
| 839 | goto2: goto, |
| 840 | })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 841 | } |
| 842 | MaybeInst::Split2(goto2) => { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 843 | MaybeInst::Compiled(Inst::Split(InstSplit { |
| 844 | goto1: goto, |
| 845 | goto2: goto2, |
| 846 | })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 847 | } |
| 848 | _ => unreachable!( |
| 849 | "not all instructions were compiled! \ |
| 850 | found uncompiled instruction: {:?}", |
| 851 | self |
| 852 | ), |
| 853 | }; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 854 | *self = maybeinst; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 855 | } |
| 856 | |
| 857 | fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) { |
| 858 | let filled = match *self { |
| 859 | MaybeInst::Split => { |
| 860 | Inst::Split(InstSplit { goto1: goto1, goto2: goto2 }) |
| 861 | } |
| 862 | _ => unreachable!( |
| 863 | "must be called on Split instruction, \ |
| 864 | instead it was called on: {:?}", |
| 865 | self |
| 866 | ), |
| 867 | }; |
| 868 | *self = MaybeInst::Compiled(filled); |
| 869 | } |
| 870 | |
| 871 | fn half_fill_split_goto1(&mut self, goto1: InstPtr) { |
| 872 | let half_filled = match *self { |
| 873 | MaybeInst::Split => goto1, |
| 874 | _ => unreachable!( |
| 875 | "must be called on Split instruction, \ |
| 876 | instead it was called on: {:?}", |
| 877 | self |
| 878 | ), |
| 879 | }; |
| 880 | *self = MaybeInst::Split1(half_filled); |
| 881 | } |
| 882 | |
| 883 | fn half_fill_split_goto2(&mut self, goto2: InstPtr) { |
| 884 | let half_filled = match *self { |
| 885 | MaybeInst::Split => goto2, |
| 886 | _ => unreachable!( |
| 887 | "must be called on Split instruction, \ |
| 888 | instead it was called on: {:?}", |
| 889 | self |
| 890 | ), |
| 891 | }; |
| 892 | *self = MaybeInst::Split2(half_filled); |
| 893 | } |
| 894 | |
| 895 | fn unwrap(self) -> Inst { |
| 896 | match self { |
| 897 | MaybeInst::Compiled(inst) => inst, |
| 898 | _ => unreachable!( |
| 899 | "must be called on a compiled instruction, \ |
| 900 | instead it was called on: {:?}", |
| 901 | self |
| 902 | ), |
| 903 | } |
| 904 | } |
| 905 | } |
| 906 | |
| 907 | #[derive(Clone, Debug)] |
| 908 | enum InstHole { |
| 909 | Save { slot: usize }, |
| 910 | EmptyLook { look: EmptyLook }, |
| 911 | Char { c: char }, |
| 912 | Ranges { ranges: Vec<(char, char)> }, |
| 913 | Bytes { start: u8, end: u8 }, |
| 914 | } |
| 915 | |
| 916 | impl InstHole { |
| 917 | fn fill(&self, goto: InstPtr) -> Inst { |
| 918 | match *self { |
| 919 | InstHole::Save { slot } => { |
| 920 | Inst::Save(InstSave { goto: goto, slot: slot }) |
| 921 | } |
| 922 | InstHole::EmptyLook { look } => { |
| 923 | Inst::EmptyLook(InstEmptyLook { goto: goto, look: look }) |
| 924 | } |
| 925 | InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }), |
| 926 | InstHole::Ranges { ref ranges } => { |
| 927 | Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() }) |
| 928 | } |
| 929 | InstHole::Bytes { start, end } => { |
| 930 | Inst::Bytes(InstBytes { goto: goto, start: start, end: end }) |
| 931 | } |
| 932 | } |
| 933 | } |
| 934 | } |
| 935 | |
| 936 | struct CompileClass<'a, 'b> { |
| 937 | c: &'a mut Compiler, |
| 938 | ranges: &'b [hir::ClassUnicodeRange], |
| 939 | } |
| 940 | |
| 941 | impl<'a, 'b> CompileClass<'a, 'b> { |
| 942 | fn compile(mut self) -> Result { |
| 943 | let mut holes = vec![]; |
| 944 | let mut initial_entry = None; |
| 945 | let mut last_split = Hole::None; |
| 946 | let mut utf8_seqs = self.c.utf8_seqs.take().unwrap(); |
| 947 | self.c.suffix_cache.clear(); |
| 948 | |
| 949 | for (i, range) in self.ranges.iter().enumerate() { |
| 950 | let is_last_range = i + 1 == self.ranges.len(); |
| 951 | utf8_seqs.reset(range.start(), range.end()); |
| 952 | let mut it = (&mut utf8_seqs).peekable(); |
| 953 | loop { |
| 954 | let utf8_seq = match it.next() { |
| 955 | None => break, |
| 956 | Some(utf8_seq) => utf8_seq, |
| 957 | }; |
| 958 | if is_last_range && it.peek().is_none() { |
| 959 | let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; |
| 960 | holes.push(hole); |
| 961 | self.c.fill(last_split, entry); |
| 962 | last_split = Hole::None; |
| 963 | if initial_entry.is_none() { |
| 964 | initial_entry = Some(entry); |
| 965 | } |
| 966 | } else { |
| 967 | if initial_entry.is_none() { |
| 968 | initial_entry = Some(self.c.insts.len()); |
| 969 | } |
| 970 | self.c.fill_to_next(last_split); |
| 971 | last_split = self.c.push_split_hole(); |
| 972 | let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; |
| 973 | holes.push(hole); |
| 974 | last_split = |
| 975 | self.c.fill_split(last_split, Some(entry), None); |
| 976 | } |
| 977 | } |
| 978 | } |
| 979 | self.c.utf8_seqs = Some(utf8_seqs); |
| 980 | Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() }) |
| 981 | } |
| 982 | |
| 983 | fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result { |
| 984 | if self.c.compiled.is_reverse { |
| 985 | self.c_utf8_seq_(seq) |
| 986 | } else { |
| 987 | self.c_utf8_seq_(seq.into_iter().rev()) |
| 988 | } |
| 989 | } |
| 990 | |
| 991 | fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result |
| 992 | where |
| 993 | I: IntoIterator<Item = &'r Utf8Range>, |
| 994 | { |
| 995 | // The initial instruction for each UTF-8 sequence should be the same. |
| 996 | let mut from_inst = ::std::usize::MAX; |
| 997 | let mut last_hole = Hole::None; |
| 998 | for byte_range in seq { |
| 999 | let key = SuffixCacheKey { |
| 1000 | from_inst: from_inst, |
| 1001 | start: byte_range.start, |
| 1002 | end: byte_range.end, |
| 1003 | }; |
| 1004 | { |
| 1005 | let pc = self.c.insts.len(); |
| 1006 | if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) { |
| 1007 | from_inst = cached_pc; |
| 1008 | continue; |
| 1009 | } |
| 1010 | } |
| 1011 | self.c.byte_classes.set_range(byte_range.start, byte_range.end); |
| 1012 | if from_inst == ::std::usize::MAX { |
| 1013 | last_hole = self.c.push_hole(InstHole::Bytes { |
| 1014 | start: byte_range.start, |
| 1015 | end: byte_range.end, |
| 1016 | }); |
| 1017 | } else { |
| 1018 | self.c.push_compiled(Inst::Bytes(InstBytes { |
| 1019 | goto: from_inst, |
| 1020 | start: byte_range.start, |
| 1021 | end: byte_range.end, |
| 1022 | })); |
| 1023 | } |
| 1024 | from_inst = self.c.insts.len().checked_sub(1).unwrap(); |
| 1025 | debug_assert!(from_inst < ::std::usize::MAX); |
| 1026 | } |
| 1027 | debug_assert!(from_inst < ::std::usize::MAX); |
| 1028 | Ok(Patch { hole: last_hole, entry: from_inst }) |
| 1029 | } |
| 1030 | } |
| 1031 | |
| 1032 | /// `SuffixCache` is a simple bounded hash map for caching suffix entries in |
| 1033 | /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}. |
| 1034 | /// The set of byte ranges looks like this: |
| 1035 | /// |
| 1036 | /// [0-7F] |
| 1037 | /// [C2-DF][80-BF] |
| 1038 | /// [E0][A0-BF][80-BF] |
| 1039 | /// [E1-EC][80-BF][80-BF] |
| 1040 | /// [ED][80-9F][80-BF] |
| 1041 | /// [EE-EF][80-BF][80-BF] |
| 1042 | /// |
| 1043 | /// Each line above translates to one alternate in the compiled regex program. |
| 1044 | /// However, all but one of the alternates end in the same suffix, which is |
| 1045 | /// a waste of an instruction. The suffix cache facilitates reusing them across |
| 1046 | /// alternates. |
| 1047 | /// |
| 1048 | /// Note that a HashMap could be trivially used for this, but we don't need its |
| 1049 | /// overhead. Some small bounded space (LRU style) is more than enough. |
| 1050 | /// |
| 1051 | /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html), |
| 1052 | /// except it uses hashes as original indices and then compares full keys for |
| 1053 | /// validation against `dense` array. |
| 1054 | struct SuffixCache { |
| 1055 | sparse: Box<[usize]>, |
| 1056 | dense: Vec<SuffixCacheEntry>, |
| 1057 | } |
| 1058 | |
| 1059 | #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] |
| 1060 | struct SuffixCacheEntry { |
| 1061 | key: SuffixCacheKey, |
| 1062 | pc: InstPtr, |
| 1063 | } |
| 1064 | |
| 1065 | #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] |
| 1066 | struct SuffixCacheKey { |
| 1067 | from_inst: InstPtr, |
| 1068 | start: u8, |
| 1069 | end: u8, |
| 1070 | } |
| 1071 | |
| 1072 | impl SuffixCache { |
| 1073 | fn new(size: usize) -> Self { |
| 1074 | SuffixCache { |
| 1075 | sparse: vec![0usize; size].into(), |
| 1076 | dense: Vec::with_capacity(size), |
| 1077 | } |
| 1078 | } |
| 1079 | |
| 1080 | fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> { |
| 1081 | let hash = self.hash(&key); |
| 1082 | let pos = &mut self.sparse[hash]; |
| 1083 | if let Some(entry) = self.dense.get(*pos) { |
| 1084 | if entry.key == key { |
| 1085 | return Some(entry.pc); |
| 1086 | } |
| 1087 | } |
| 1088 | *pos = self.dense.len(); |
| 1089 | self.dense.push(SuffixCacheEntry { key: key, pc: pc }); |
| 1090 | None |
| 1091 | } |
| 1092 | |
| 1093 | fn clear(&mut self) { |
| 1094 | self.dense.clear(); |
| 1095 | } |
| 1096 | |
| 1097 | fn hash(&self, suffix: &SuffixCacheKey) -> usize { |
| 1098 | // Basic FNV-1a hash as described: |
| 1099 | // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function |
| 1100 | const FNV_PRIME: u64 = 1099511628211; |
| 1101 | let mut h = 14695981039346656037; |
| 1102 | h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME); |
| 1103 | h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME); |
| 1104 | h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME); |
| 1105 | (h as usize) % self.sparse.len() |
| 1106 | } |
| 1107 | } |
| 1108 | |
| 1109 | struct ByteClassSet([bool; 256]); |
| 1110 | |
| 1111 | impl ByteClassSet { |
| 1112 | fn new() -> Self { |
| 1113 | ByteClassSet([false; 256]) |
| 1114 | } |
| 1115 | |
| 1116 | fn set_range(&mut self, start: u8, end: u8) { |
| 1117 | debug_assert!(start <= end); |
| 1118 | if start > 0 { |
| 1119 | self.0[start as usize - 1] = true; |
| 1120 | } |
| 1121 | self.0[end as usize] = true; |
| 1122 | } |
| 1123 | |
| 1124 | fn set_word_boundary(&mut self) { |
| 1125 | // We need to mark all ranges of bytes whose pairs result in |
| 1126 | // evaluating \b differently. |
| 1127 | let iswb = is_word_byte; |
| 1128 | let mut b1: u16 = 0; |
| 1129 | let mut b2: u16; |
| 1130 | while b1 <= 255 { |
| 1131 | b2 = b1 + 1; |
| 1132 | while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { |
| 1133 | b2 += 1; |
| 1134 | } |
| 1135 | self.set_range(b1 as u8, (b2 - 1) as u8); |
| 1136 | b1 = b2; |
| 1137 | } |
| 1138 | } |
| 1139 | |
| 1140 | fn byte_classes(&self) -> Vec<u8> { |
| 1141 | // N.B. If you're debugging the DFA, it's useful to simply return |
| 1142 | // `(0..256).collect()`, which effectively removes the byte classes |
| 1143 | // and makes the transitions easier to read. |
| 1144 | // (0usize..256).map(|x| x as u8).collect() |
| 1145 | let mut byte_classes = vec![0; 256]; |
| 1146 | let mut class = 0u8; |
| 1147 | let mut i = 0; |
| 1148 | loop { |
| 1149 | byte_classes[i] = class as u8; |
| 1150 | if i >= 255 { |
| 1151 | break; |
| 1152 | } |
| 1153 | if self.0[i] { |
| 1154 | class = class.checked_add(1).unwrap(); |
| 1155 | } |
| 1156 | i += 1; |
| 1157 | } |
| 1158 | byte_classes |
| 1159 | } |
| 1160 | } |
| 1161 | |
/// Converts a `u32` to `usize`.
///
/// # Panics
///
/// Panics if the value does not fit in a `usize`, which can only happen on
/// targets where `usize` is narrower than 32 bits.
fn u32_to_usize(n: u32) -> usize {
    // In case usize is less than 32 bits, we need to guard against overflow.
    // On most platforms this compiles to nothing.
    // TODO Use `std::convert::TryFrom` once it's stable.
    let fits = u64::from(n) <= (::std::usize::MAX as u64);
    assert!(fits, "BUG: {} is too big to be pointer sized", n);
    n as usize
}
| 1171 | |
#[cfg(test)]
mod tests {
    use super::ByteClassSet;

    /// Asserts that `classes[byte] == class` for every listed pair.
    fn assert_classes(classes: &[u8], expected: &[(usize, u8)]) {
        for &(byte, class) in expected {
            assert_eq!(classes[byte], class);
        }
    }

    #[test]
    fn byte_classes() {
        // A single range splits the byte space into three classes.
        let mut set = ByteClassSet::new();
        set.set_range(b'a', b'z');
        assert_classes(
            &set.byte_classes(),
            &[
                (0, 0),
                (1, 0),
                (2, 0),
                (b'a' as usize - 1, 0),
                (b'a' as usize, 1),
                (b'm' as usize, 1),
                (b'z' as usize, 1),
                (b'z' as usize + 1, 2),
                (254, 2),
                (255, 2),
            ],
        );

        // Two ranges, the first starting at zero, yield four classes.
        let mut set = ByteClassSet::new();
        set.set_range(0, 2);
        set.set_range(4, 6);
        assert_classes(
            &set.byte_classes(),
            &[
                (0, 0),
                (1, 0),
                (2, 0),
                (3, 1),
                (4, 2),
                (5, 2),
                (6, 2),
                (7, 3),
                (255, 3),
            ],
        );
    }

    #[test]
    fn full_byte_classes() {
        // Marking every byte as its own range must not overflow the class
        // counter and must still produce a full table.
        let mut set = ByteClassSet::new();
        for byte in 0..=255u8 {
            set.set_range(byte, byte);
        }
        assert_eq!(set.byte_classes().len(), 256);
    }
}