Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 1 | use std::collections::HashMap; |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 2 | use std::fmt; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 3 | use std::iter; |
| 4 | use std::result; |
| 5 | use std::sync::Arc; |
| 6 | |
| 7 | use syntax::hir::{self, Hir}; |
| 8 | use syntax::is_word_byte; |
| 9 | use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; |
| 10 | |
| 11 | use prog::{ |
| 12 | EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges, |
| 13 | InstSave, InstSplit, Program, |
| 14 | }; |
| 15 | |
| 16 | use Error; |
| 17 | |
| 18 | type Result = result::Result<Patch, Error>; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 19 | type ResultOrEmpty = result::Result<Option<Patch>, Error>; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 20 | |
| 21 | #[derive(Debug)] |
| 22 | struct Patch { |
| 23 | hole: Hole, |
| 24 | entry: InstPtr, |
| 25 | } |
| 26 | |
| 27 | /// A compiler translates a regular expression AST to a sequence of |
| 28 | /// instructions. The sequence of instructions represents an NFA. |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 29 | // `Compiler` is only public via the `internal` module, so avoid deriving |
| 30 | // `Debug`. |
| 31 | #[allow(missing_debug_implementations)] |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 32 | pub struct Compiler { |
| 33 | insts: Vec<MaybeInst>, |
| 34 | compiled: Program, |
| 35 | capture_name_idx: HashMap<String, usize>, |
| 36 | num_exprs: usize, |
| 37 | size_limit: usize, |
| 38 | suffix_cache: SuffixCache, |
| 39 | utf8_seqs: Option<Utf8Sequences>, |
| 40 | byte_classes: ByteClassSet, |
| 41 | } |
| 42 | |
| 43 | impl Compiler { |
| 44 | /// Create a new regular expression compiler. |
| 45 | /// |
| 46 | /// Various options can be set before calling `compile` on an expression. |
| 47 | pub fn new() -> Self { |
| 48 | Compiler { |
| 49 | insts: vec![], |
| 50 | compiled: Program::new(), |
| 51 | capture_name_idx: HashMap::new(), |
| 52 | num_exprs: 0, |
| 53 | size_limit: 10 * (1 << 20), |
| 54 | suffix_cache: SuffixCache::new(1000), |
| 55 | utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), |
| 56 | byte_classes: ByteClassSet::new(), |
| 57 | } |
| 58 | } |
| 59 | |
| 60 | /// The size of the resulting program is limited by size_limit. If |
| 61 | /// the program approximately exceeds the given size (in bytes), then |
| 62 | /// compilation will stop and return an error. |
| 63 | pub fn size_limit(mut self, size_limit: usize) -> Self { |
| 64 | self.size_limit = size_limit; |
| 65 | self |
| 66 | } |
| 67 | |
| 68 | /// If bytes is true, then the program is compiled as a byte based |
| 69 | /// automaton, which incorporates UTF-8 decoding into the machine. If it's |
| 70 | /// false, then the automaton is Unicode scalar value based, e.g., an |
| 71 | /// engine utilizing such an automaton is responsible for UTF-8 decoding. |
| 72 | /// |
| 73 | /// The specific invariant is that when returning a byte based machine, |
| 74 | /// the neither the `Char` nor `Ranges` instructions are produced. |
| 75 | /// Conversely, when producing a Unicode scalar value machine, the `Bytes` |
| 76 | /// instruction is never produced. |
| 77 | /// |
| 78 | /// Note that `dfa(true)` implies `bytes(true)`. |
| 79 | pub fn bytes(mut self, yes: bool) -> Self { |
| 80 | self.compiled.is_bytes = yes; |
| 81 | self |
| 82 | } |
| 83 | |
| 84 | /// When disabled, the program compiled may match arbitrary bytes. |
| 85 | /// |
| 86 | /// When enabled (the default), all compiled programs exclusively match |
| 87 | /// valid UTF-8 bytes. |
| 88 | pub fn only_utf8(mut self, yes: bool) -> Self { |
| 89 | self.compiled.only_utf8 = yes; |
| 90 | self |
| 91 | } |
| 92 | |
| 93 | /// When set, the machine returned is suitable for use in the DFA matching |
| 94 | /// engine. |
| 95 | /// |
| 96 | /// In particular, this ensures that if the regex is not anchored in the |
| 97 | /// beginning, then a preceding `.*?` is included in the program. (The NFA |
| 98 | /// based engines handle the preceding `.*?` explicitly, which is difficult |
| 99 | /// or impossible in the DFA engine.) |
| 100 | pub fn dfa(mut self, yes: bool) -> Self { |
| 101 | self.compiled.is_dfa = yes; |
| 102 | self |
| 103 | } |
| 104 | |
| 105 | /// When set, the machine returned is suitable for matching text in |
| 106 | /// reverse. In particular, all concatenations are flipped. |
| 107 | pub fn reverse(mut self, yes: bool) -> Self { |
| 108 | self.compiled.is_reverse = yes; |
| 109 | self |
| 110 | } |
| 111 | |
| 112 | /// Compile a regular expression given its AST. |
| 113 | /// |
| 114 | /// The compiler is guaranteed to succeed unless the program exceeds the |
| 115 | /// specified size limit. If the size limit is exceeded, then compilation |
| 116 | /// stops and returns an error. |
| 117 | pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> { |
| 118 | debug_assert!(!exprs.is_empty()); |
| 119 | self.num_exprs = exprs.len(); |
| 120 | if exprs.len() == 1 { |
| 121 | self.compile_one(&exprs[0]) |
| 122 | } else { |
| 123 | self.compile_many(exprs) |
| 124 | } |
| 125 | } |
| 126 | |
| 127 | fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> { |
| 128 | // If we're compiling a forward DFA and we aren't anchored, then |
| 129 | // add a `.*?` before the first capture group. |
| 130 | // Other matching engines handle this by baking the logic into the |
| 131 | // matching engine itself. |
| 132 | let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; |
| 133 | self.compiled.is_anchored_start = expr.is_anchored_start(); |
| 134 | self.compiled.is_anchored_end = expr.is_anchored_end(); |
| 135 | if self.compiled.needs_dotstar() { |
| 136 | dotstar_patch = self.c_dotstar()?; |
| 137 | self.compiled.start = dotstar_patch.entry; |
| 138 | } |
| 139 | self.compiled.captures = vec![None]; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 140 | let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst()); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 141 | if self.compiled.needs_dotstar() { |
| 142 | self.fill(dotstar_patch.hole, patch.entry); |
| 143 | } else { |
| 144 | self.compiled.start = patch.entry; |
| 145 | } |
| 146 | self.fill_to_next(patch.hole); |
| 147 | self.compiled.matches = vec![self.insts.len()]; |
| 148 | self.push_compiled(Inst::Match(0)); |
| 149 | self.compile_finish() |
| 150 | } |
| 151 | |
| 152 | fn compile_many( |
| 153 | mut self, |
| 154 | exprs: &[Hir], |
| 155 | ) -> result::Result<Program, Error> { |
| 156 | debug_assert!(exprs.len() > 1); |
| 157 | |
| 158 | self.compiled.is_anchored_start = |
| 159 | exprs.iter().all(|e| e.is_anchored_start()); |
| 160 | self.compiled.is_anchored_end = |
| 161 | exprs.iter().all(|e| e.is_anchored_end()); |
| 162 | let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; |
| 163 | if self.compiled.needs_dotstar() { |
| 164 | dotstar_patch = self.c_dotstar()?; |
| 165 | self.compiled.start = dotstar_patch.entry; |
| 166 | } else { |
| 167 | self.compiled.start = 0; // first instruction is always split |
| 168 | } |
| 169 | self.fill_to_next(dotstar_patch.hole); |
| 170 | |
| 171 | let mut prev_hole = Hole::None; |
| 172 | for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { |
| 173 | self.fill_to_next(prev_hole); |
| 174 | let split = self.push_split_hole(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 175 | let Patch { hole, entry } = |
| 176 | self.c_capture(0, expr)?.unwrap_or(self.next_inst()); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 177 | self.fill_to_next(hole); |
| 178 | self.compiled.matches.push(self.insts.len()); |
| 179 | self.push_compiled(Inst::Match(i)); |
| 180 | prev_hole = self.fill_split(split, Some(entry), None); |
| 181 | } |
| 182 | let i = exprs.len() - 1; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 183 | let Patch { hole, entry } = |
| 184 | self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst()); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 185 | self.fill(prev_hole, entry); |
| 186 | self.fill_to_next(hole); |
| 187 | self.compiled.matches.push(self.insts.len()); |
| 188 | self.push_compiled(Inst::Match(i)); |
| 189 | self.compile_finish() |
| 190 | } |
| 191 | |
| 192 | fn compile_finish(mut self) -> result::Result<Program, Error> { |
| 193 | self.compiled.insts = |
| 194 | self.insts.into_iter().map(|inst| inst.unwrap()).collect(); |
| 195 | self.compiled.byte_classes = self.byte_classes.byte_classes(); |
| 196 | self.compiled.capture_name_idx = Arc::new(self.capture_name_idx); |
| 197 | Ok(self.compiled) |
| 198 | } |
| 199 | |
| 200 | /// Compile expr into self.insts, returning a patch on success, |
| 201 | /// or an error if we run out of memory. |
| 202 | /// |
| 203 | /// All of the c_* methods of the compiler share the contract outlined |
| 204 | /// here. |
| 205 | /// |
| 206 | /// The main thing that a c_* method does is mutate `self.insts` |
| 207 | /// to add a list of mostly compiled instructions required to execute |
| 208 | /// the given expression. `self.insts` contains MaybeInsts rather than |
| 209 | /// Insts because there is some backpatching required. |
| 210 | /// |
| 211 | /// The `Patch` value returned by each c_* method provides metadata |
| 212 | /// about the compiled instructions emitted to `self.insts`. The |
| 213 | /// `entry` member of the patch refers to the first instruction |
| 214 | /// (the entry point), while the `hole` member contains zero or |
| 215 | /// more offsets to partial instructions that need to be backpatched. |
| 216 | /// The c_* routine can't know where its list of instructions are going to |
| 217 | /// jump to after execution, so it is up to the caller to patch |
| 218 | /// these jumps to point to the right place. So compiling some |
| 219 | /// expression, e, we would end up with a situation that looked like: |
| 220 | /// |
| 221 | /// ```text |
| 222 | /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...] |
| 223 | /// ^ ^ ^ |
| 224 | /// | \ / |
| 225 | /// entry \ / |
| 226 | /// hole |
| 227 | /// ``` |
| 228 | /// |
Chih-Hung Hsieh | 849e445 | 2020-10-26 13:16:47 -0700 | [diff] [blame] | 229 | /// To compile two expressions, e1 and e2, concatenated together we |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 230 | /// would do: |
| 231 | /// |
| 232 | /// ```ignore |
| 233 | /// let patch1 = self.c(e1); |
| 234 | /// let patch2 = self.c(e2); |
| 235 | /// ``` |
| 236 | /// |
| 237 | /// while leaves us with a situation that looks like |
| 238 | /// |
| 239 | /// ```text |
| 240 | /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ] |
| 241 | /// ^ ^ ^ ^ |
| 242 | /// | | | | |
| 243 | /// entry1 hole1 entry2 hole2 |
| 244 | /// ``` |
| 245 | /// |
| 246 | /// Then to merge the two patches together into one we would backpatch |
| 247 | /// hole1 with entry2 and return a new patch that enters at entry1 |
| 248 | /// and has hole2 for a hole. In fact, if you look at the c_concat |
| 249 | /// method you will see that it does exactly this, though it handles |
| 250 | /// a list of expressions rather than just the two that we use for |
| 251 | /// an example. |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 252 | /// |
| 253 | /// Ok(None) is returned when an expression is compiled to no |
| 254 | /// instruction, and so no patch.entry value makes sense. |
| 255 | fn c(&mut self, expr: &Hir) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 256 | use prog; |
| 257 | use syntax::hir::HirKind::*; |
| 258 | |
| 259 | self.check_size()?; |
| 260 | match *expr.kind() { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 261 | Empty => Ok(None), |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 262 | Literal(hir::Literal::Unicode(c)) => self.c_char(c), |
| 263 | Literal(hir::Literal::Byte(b)) => { |
| 264 | assert!(self.compiled.uses_bytes()); |
| 265 | self.c_byte(b) |
| 266 | } |
| 267 | Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), |
| 268 | Class(hir::Class::Bytes(ref cls)) => { |
| 269 | if self.compiled.uses_bytes() { |
| 270 | self.c_class_bytes(cls.ranges()) |
| 271 | } else { |
| 272 | assert!(cls.is_all_ascii()); |
| 273 | let mut char_ranges = vec![]; |
| 274 | for r in cls.iter() { |
| 275 | let (s, e) = (r.start() as char, r.end() as char); |
| 276 | char_ranges.push(hir::ClassUnicodeRange::new(s, e)); |
| 277 | } |
| 278 | self.c_class(&char_ranges) |
| 279 | } |
| 280 | } |
| 281 | Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { |
| 282 | self.byte_classes.set_range(b'\n', b'\n'); |
| 283 | self.c_empty_look(prog::EmptyLook::EndLine) |
| 284 | } |
| 285 | Anchor(hir::Anchor::StartLine) => { |
| 286 | self.byte_classes.set_range(b'\n', b'\n'); |
| 287 | self.c_empty_look(prog::EmptyLook::StartLine) |
| 288 | } |
| 289 | Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { |
| 290 | self.byte_classes.set_range(b'\n', b'\n'); |
| 291 | self.c_empty_look(prog::EmptyLook::StartLine) |
| 292 | } |
| 293 | Anchor(hir::Anchor::EndLine) => { |
| 294 | self.byte_classes.set_range(b'\n', b'\n'); |
| 295 | self.c_empty_look(prog::EmptyLook::EndLine) |
| 296 | } |
| 297 | Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { |
| 298 | self.c_empty_look(prog::EmptyLook::EndText) |
| 299 | } |
| 300 | Anchor(hir::Anchor::StartText) => { |
| 301 | self.c_empty_look(prog::EmptyLook::StartText) |
| 302 | } |
| 303 | Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { |
| 304 | self.c_empty_look(prog::EmptyLook::StartText) |
| 305 | } |
| 306 | Anchor(hir::Anchor::EndText) => { |
| 307 | self.c_empty_look(prog::EmptyLook::EndText) |
| 308 | } |
| 309 | WordBoundary(hir::WordBoundary::Unicode) => { |
| 310 | if !cfg!(feature = "unicode-perl") { |
| 311 | return Err(Error::Syntax( |
| 312 | "Unicode word boundaries are unavailable when \ |
| 313 | the unicode-perl feature is disabled" |
| 314 | .to_string(), |
| 315 | )); |
| 316 | } |
| 317 | self.compiled.has_unicode_word_boundary = true; |
| 318 | self.byte_classes.set_word_boundary(); |
| 319 | self.c_empty_look(prog::EmptyLook::WordBoundary) |
| 320 | } |
| 321 | WordBoundary(hir::WordBoundary::UnicodeNegate) => { |
| 322 | if !cfg!(feature = "unicode-perl") { |
| 323 | return Err(Error::Syntax( |
| 324 | "Unicode word boundaries are unavailable when \ |
| 325 | the unicode-perl feature is disabled" |
| 326 | .to_string(), |
| 327 | )); |
| 328 | } |
| 329 | self.compiled.has_unicode_word_boundary = true; |
| 330 | self.byte_classes.set_word_boundary(); |
| 331 | self.c_empty_look(prog::EmptyLook::NotWordBoundary) |
| 332 | } |
| 333 | WordBoundary(hir::WordBoundary::Ascii) => { |
| 334 | self.byte_classes.set_word_boundary(); |
| 335 | self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) |
| 336 | } |
| 337 | WordBoundary(hir::WordBoundary::AsciiNegate) => { |
| 338 | self.byte_classes.set_word_boundary(); |
| 339 | self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) |
| 340 | } |
| 341 | Group(ref g) => match g.kind { |
| 342 | hir::GroupKind::NonCapturing => self.c(&g.hir), |
| 343 | hir::GroupKind::CaptureIndex(index) => { |
| 344 | if index as usize >= self.compiled.captures.len() { |
| 345 | self.compiled.captures.push(None); |
| 346 | } |
| 347 | self.c_capture(2 * index as usize, &g.hir) |
| 348 | } |
| 349 | hir::GroupKind::CaptureName { index, ref name } => { |
| 350 | if index as usize >= self.compiled.captures.len() { |
| 351 | let n = name.to_string(); |
| 352 | self.compiled.captures.push(Some(n.clone())); |
| 353 | self.capture_name_idx.insert(n, index as usize); |
| 354 | } |
| 355 | self.c_capture(2 * index as usize, &g.hir) |
| 356 | } |
| 357 | }, |
| 358 | Concat(ref es) => { |
| 359 | if self.compiled.is_reverse { |
| 360 | self.c_concat(es.iter().rev()) |
| 361 | } else { |
| 362 | self.c_concat(es) |
| 363 | } |
| 364 | } |
| 365 | Alternation(ref es) => self.c_alternate(&**es), |
| 366 | Repetition(ref rep) => self.c_repeat(rep), |
| 367 | } |
| 368 | } |
| 369 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 370 | fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 371 | if self.num_exprs > 1 || self.compiled.is_dfa { |
| 372 | // Don't ever compile Save instructions for regex sets because |
| 373 | // they are never used. They are also never used in DFA programs |
| 374 | // because DFAs can't handle captures. |
| 375 | self.c(expr) |
| 376 | } else { |
| 377 | let entry = self.insts.len(); |
| 378 | let hole = self.push_hole(InstHole::Save { slot: first_slot }); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 379 | let patch = self.c(expr)?.unwrap_or(self.next_inst()); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 380 | self.fill(hole, patch.entry); |
| 381 | self.fill_to_next(patch.hole); |
| 382 | let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 383 | Ok(Some(Patch { hole: hole, entry: entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 384 | } |
| 385 | } |
| 386 | |
| 387 | fn c_dotstar(&mut self) -> Result { |
| 388 | Ok(if !self.compiled.only_utf8() { |
| 389 | self.c(&Hir::repetition(hir::Repetition { |
| 390 | kind: hir::RepetitionKind::ZeroOrMore, |
| 391 | greedy: false, |
| 392 | hir: Box::new(Hir::any(true)), |
| 393 | }))? |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 394 | .unwrap() |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 395 | } else { |
| 396 | self.c(&Hir::repetition(hir::Repetition { |
| 397 | kind: hir::RepetitionKind::ZeroOrMore, |
| 398 | greedy: false, |
| 399 | hir: Box::new(Hir::any(false)), |
| 400 | }))? |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 401 | .unwrap() |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 402 | }) |
| 403 | } |
| 404 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 405 | fn c_char(&mut self, c: char) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 406 | if self.compiled.uses_bytes() { |
| 407 | if c.is_ascii() { |
| 408 | let b = c as u8; |
| 409 | let hole = |
| 410 | self.push_hole(InstHole::Bytes { start: b, end: b }); |
| 411 | self.byte_classes.set_range(b, b); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 412 | Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 413 | } else { |
| 414 | self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) |
| 415 | } |
| 416 | } else { |
| 417 | let hole = self.push_hole(InstHole::Char { c: c }); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 418 | Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 419 | } |
| 420 | } |
| 421 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 422 | fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 423 | assert!(!ranges.is_empty()); |
| 424 | if self.compiled.uses_bytes() { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 425 | Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?)) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 426 | } else { |
| 427 | let ranges: Vec<(char, char)> = |
| 428 | ranges.iter().map(|r| (r.start(), r.end())).collect(); |
| 429 | let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 { |
| 430 | self.push_hole(InstHole::Char { c: ranges[0].0 }) |
| 431 | } else { |
| 432 | self.push_hole(InstHole::Ranges { ranges: ranges }) |
| 433 | }; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 434 | Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 435 | } |
| 436 | } |
| 437 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 438 | fn c_byte(&mut self, b: u8) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 439 | self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) |
| 440 | } |
| 441 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 442 | fn c_class_bytes( |
| 443 | &mut self, |
| 444 | ranges: &[hir::ClassBytesRange], |
| 445 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 446 | debug_assert!(!ranges.is_empty()); |
| 447 | |
| 448 | let first_split_entry = self.insts.len(); |
| 449 | let mut holes = vec![]; |
| 450 | let mut prev_hole = Hole::None; |
| 451 | for r in &ranges[0..ranges.len() - 1] { |
| 452 | self.fill_to_next(prev_hole); |
| 453 | let split = self.push_split_hole(); |
| 454 | let next = self.insts.len(); |
| 455 | self.byte_classes.set_range(r.start(), r.end()); |
| 456 | holes.push(self.push_hole(InstHole::Bytes { |
| 457 | start: r.start(), |
| 458 | end: r.end(), |
| 459 | })); |
| 460 | prev_hole = self.fill_split(split, Some(next), None); |
| 461 | } |
| 462 | let next = self.insts.len(); |
| 463 | let r = &ranges[ranges.len() - 1]; |
| 464 | self.byte_classes.set_range(r.start(), r.end()); |
| 465 | holes.push( |
| 466 | self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }), |
| 467 | ); |
| 468 | self.fill(prev_hole, next); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 469 | Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 470 | } |
| 471 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 472 | fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 473 | let hole = self.push_hole(InstHole::EmptyLook { look: look }); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 474 | Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 475 | } |
| 476 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 477 | fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 478 | where |
| 479 | I: IntoIterator<Item = &'a Hir>, |
| 480 | { |
| 481 | let mut exprs = exprs.into_iter(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 482 | let Patch { mut hole, entry } = loop { |
| 483 | match exprs.next() { |
| 484 | None => return Ok(None), |
| 485 | Some(e) => { |
| 486 | if let Some(p) = self.c(e)? { |
| 487 | break p; |
| 488 | } |
| 489 | } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 490 | } |
| 491 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 492 | for e in exprs { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 493 | if let Some(p) = self.c(e)? { |
| 494 | self.fill(hole, p.entry); |
| 495 | hole = p.hole; |
| 496 | } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 497 | } |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 498 | Ok(Some(Patch { hole: hole, entry: entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 499 | } |
| 500 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 501 | fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 502 | debug_assert!( |
| 503 | exprs.len() >= 2, |
| 504 | "alternates must have at least 2 exprs" |
| 505 | ); |
| 506 | |
| 507 | // Initial entry point is always the first split. |
| 508 | let first_split_entry = self.insts.len(); |
| 509 | |
| 510 | // Save up all of the holes from each alternate. They will all get |
| 511 | // patched to point to the same location. |
| 512 | let mut holes = vec![]; |
| 513 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 514 | // true indicates that the hole is a split where we want to fill |
| 515 | // the second branch. |
| 516 | let mut prev_hole = (Hole::None, false); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 517 | for e in &exprs[0..exprs.len() - 1] { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 518 | if prev_hole.1 { |
| 519 | let next = self.insts.len(); |
| 520 | self.fill_split(prev_hole.0, None, Some(next)); |
| 521 | } else { |
| 522 | self.fill_to_next(prev_hole.0); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 523 | } |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 524 | let split = self.push_split_hole(); |
| 525 | if let Some(Patch { hole, entry }) = self.c(e)? { |
| 526 | holes.push(hole); |
| 527 | prev_hole = (self.fill_split(split, Some(entry), None), false); |
| 528 | } else { |
| 529 | let (split1, split2) = split.dup_one(); |
| 530 | holes.push(split1); |
| 531 | prev_hole = (split2, true); |
| 532 | } |
| 533 | } |
| 534 | if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 535 | holes.push(hole); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 536 | if prev_hole.1 { |
| 537 | self.fill_split(prev_hole.0, None, Some(entry)); |
| 538 | } else { |
| 539 | self.fill(prev_hole.0, entry); |
| 540 | } |
| 541 | } else { |
| 542 | // We ignore prev_hole.1. When it's true, it means we have two |
| 543 | // empty branches both pushing prev_hole.0 into holes, so both |
| 544 | // branches will go to the same place anyway. |
| 545 | holes.push(prev_hole.0); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 546 | } |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 547 | Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 548 | } |
| 549 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 550 | fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 551 | use syntax::hir::RepetitionKind::*; |
| 552 | match rep.kind { |
| 553 | ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), |
| 554 | ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), |
| 555 | OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), |
| 556 | Range(hir::RepetitionRange::Exactly(min_max)) => { |
| 557 | self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) |
| 558 | } |
| 559 | Range(hir::RepetitionRange::AtLeast(min)) => { |
| 560 | self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) |
| 561 | } |
| 562 | Range(hir::RepetitionRange::Bounded(min, max)) => { |
| 563 | self.c_repeat_range(&rep.hir, rep.greedy, min, max) |
| 564 | } |
| 565 | } |
| 566 | } |
| 567 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 568 | fn c_repeat_zero_or_one( |
| 569 | &mut self, |
| 570 | expr: &Hir, |
| 571 | greedy: bool, |
| 572 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 573 | let split_entry = self.insts.len(); |
| 574 | let split = self.push_split_hole(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 575 | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { |
| 576 | Some(p) => p, |
| 577 | None => return self.pop_split_hole(), |
| 578 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 579 | let split_hole = if greedy { |
| 580 | self.fill_split(split, Some(entry_rep), None) |
| 581 | } else { |
| 582 | self.fill_split(split, None, Some(entry_rep)) |
| 583 | }; |
| 584 | let holes = vec![hole_rep, split_hole]; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 585 | Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 586 | } |
| 587 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 588 | fn c_repeat_zero_or_more( |
| 589 | &mut self, |
| 590 | expr: &Hir, |
| 591 | greedy: bool, |
| 592 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 593 | let split_entry = self.insts.len(); |
| 594 | let split = self.push_split_hole(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 595 | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { |
| 596 | Some(p) => p, |
| 597 | None => return self.pop_split_hole(), |
| 598 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 599 | |
| 600 | self.fill(hole_rep, split_entry); |
| 601 | let split_hole = if greedy { |
| 602 | self.fill_split(split, Some(entry_rep), None) |
| 603 | } else { |
| 604 | self.fill_split(split, None, Some(entry_rep)) |
| 605 | }; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 606 | Ok(Some(Patch { hole: split_hole, entry: split_entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 607 | } |
| 608 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 609 | fn c_repeat_one_or_more( |
| 610 | &mut self, |
| 611 | expr: &Hir, |
| 612 | greedy: bool, |
| 613 | ) -> ResultOrEmpty { |
| 614 | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { |
| 615 | Some(p) => p, |
| 616 | None => return Ok(None), |
| 617 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 618 | self.fill_to_next(hole_rep); |
| 619 | let split = self.push_split_hole(); |
| 620 | |
| 621 | let split_hole = if greedy { |
| 622 | self.fill_split(split, Some(entry_rep), None) |
| 623 | } else { |
| 624 | self.fill_split(split, None, Some(entry_rep)) |
| 625 | }; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 626 | Ok(Some(Patch { hole: split_hole, entry: entry_rep })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 627 | } |
| 628 | |
| 629 | fn c_repeat_range_min_or_more( |
| 630 | &mut self, |
| 631 | expr: &Hir, |
| 632 | greedy: bool, |
| 633 | min: u32, |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 634 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 635 | let min = u32_to_usize(min); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 636 | // Using next_inst() is ok, because we can't return it (concat would |
| 637 | // have to return Some(_) while c_repeat_range_min_or_more returns |
| 638 | // None). |
| 639 | let patch_concat = self |
| 640 | .c_concat(iter::repeat(expr).take(min))? |
| 641 | .unwrap_or(self.next_inst()); |
| 642 | if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { |
| 643 | self.fill(patch_concat.hole, patch_rep.entry); |
| 644 | Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) |
| 645 | } else { |
| 646 | Ok(None) |
| 647 | } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 648 | } |
| 649 | |
| 650 | fn c_repeat_range( |
| 651 | &mut self, |
| 652 | expr: &Hir, |
| 653 | greedy: bool, |
| 654 | min: u32, |
| 655 | max: u32, |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 656 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 657 | let (min, max) = (u32_to_usize(min), u32_to_usize(max)); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 658 | debug_assert!(min <= max); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 659 | let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 660 | if min == max { |
| 661 | return Ok(patch_concat); |
| 662 | } |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 663 | // Same reasoning as in c_repeat_range_min_or_more (we know that min < |
| 664 | // max at this point). |
| 665 | let patch_concat = patch_concat.unwrap_or(self.next_inst()); |
| 666 | let initial_entry = patch_concat.entry; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 667 | // It is much simpler to compile, e.g., `a{2,5}` as: |
| 668 | // |
| 669 | // aaa?a?a? |
| 670 | // |
| 671 | // But you end up with a sequence of instructions like this: |
| 672 | // |
| 673 | // 0: 'a' |
| 674 | // 1: 'a', |
| 675 | // 2: split(3, 4) |
| 676 | // 3: 'a' |
| 677 | // 4: split(5, 6) |
| 678 | // 5: 'a' |
| 679 | // 6: split(7, 8) |
| 680 | // 7: 'a' |
| 681 | // 8: MATCH |
| 682 | // |
| 683 | // This is *incredibly* inefficient because the splits end |
| 684 | // up forming a chain, which has to be resolved everything a |
| 685 | // transition is followed. |
| 686 | let mut holes = vec![]; |
| 687 | let mut prev_hole = patch_concat.hole; |
| 688 | for _ in min..max { |
| 689 | self.fill_to_next(prev_hole); |
| 690 | let split = self.push_split_hole(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 691 | let Patch { hole, entry } = match self.c(expr)? { |
| 692 | Some(p) => p, |
| 693 | None => return self.pop_split_hole(), |
| 694 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 695 | prev_hole = hole; |
| 696 | if greedy { |
| 697 | holes.push(self.fill_split(split, Some(entry), None)); |
| 698 | } else { |
| 699 | holes.push(self.fill_split(split, None, Some(entry))); |
| 700 | } |
| 701 | } |
| 702 | holes.push(prev_hole); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 703 | Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry })) |
| 704 | } |
| 705 | |
| 706 | /// Can be used as a default value for the c_* functions when the call to |
| 707 | /// c_function is followed by inserting at least one instruction that is |
| 708 | /// always executed after the ones written by the c* function. |
| 709 | fn next_inst(&self) -> Patch { |
| 710 | Patch { hole: Hole::None, entry: self.insts.len() } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 711 | } |
| 712 | |
| 713 | fn fill(&mut self, hole: Hole, goto: InstPtr) { |
| 714 | match hole { |
| 715 | Hole::None => {} |
| 716 | Hole::One(pc) => { |
| 717 | self.insts[pc].fill(goto); |
| 718 | } |
| 719 | Hole::Many(holes) => { |
| 720 | for hole in holes { |
| 721 | self.fill(hole, goto); |
| 722 | } |
| 723 | } |
| 724 | } |
| 725 | } |
| 726 | |
| 727 | fn fill_to_next(&mut self, hole: Hole) { |
| 728 | let next = self.insts.len(); |
| 729 | self.fill(hole, next); |
| 730 | } |
| 731 | |
| 732 | fn fill_split( |
| 733 | &mut self, |
| 734 | hole: Hole, |
| 735 | goto1: Option<InstPtr>, |
| 736 | goto2: Option<InstPtr>, |
| 737 | ) -> Hole { |
| 738 | match hole { |
| 739 | Hole::None => Hole::None, |
| 740 | Hole::One(pc) => match (goto1, goto2) { |
| 741 | (Some(goto1), Some(goto2)) => { |
| 742 | self.insts[pc].fill_split(goto1, goto2); |
| 743 | Hole::None |
| 744 | } |
| 745 | (Some(goto1), None) => { |
| 746 | self.insts[pc].half_fill_split_goto1(goto1); |
| 747 | Hole::One(pc) |
| 748 | } |
| 749 | (None, Some(goto2)) => { |
| 750 | self.insts[pc].half_fill_split_goto2(goto2); |
| 751 | Hole::One(pc) |
| 752 | } |
| 753 | (None, None) => unreachable!( |
| 754 | "at least one of the split \ |
| 755 | holes must be filled" |
| 756 | ), |
| 757 | }, |
| 758 | Hole::Many(holes) => { |
| 759 | let mut new_holes = vec![]; |
| 760 | for hole in holes { |
| 761 | new_holes.push(self.fill_split(hole, goto1, goto2)); |
| 762 | } |
| 763 | if new_holes.is_empty() { |
| 764 | Hole::None |
| 765 | } else if new_holes.len() == 1 { |
| 766 | new_holes.pop().unwrap() |
| 767 | } else { |
| 768 | Hole::Many(new_holes) |
| 769 | } |
| 770 | } |
| 771 | } |
| 772 | } |
| 773 | |
| 774 | fn push_compiled(&mut self, inst: Inst) { |
| 775 | self.insts.push(MaybeInst::Compiled(inst)); |
| 776 | } |
| 777 | |
| 778 | fn push_hole(&mut self, inst: InstHole) -> Hole { |
| 779 | let hole = self.insts.len(); |
| 780 | self.insts.push(MaybeInst::Uncompiled(inst)); |
| 781 | Hole::One(hole) |
| 782 | } |
| 783 | |
| 784 | fn push_split_hole(&mut self) -> Hole { |
| 785 | let hole = self.insts.len(); |
| 786 | self.insts.push(MaybeInst::Split); |
| 787 | Hole::One(hole) |
| 788 | } |
| 789 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 790 | fn pop_split_hole(&mut self) -> ResultOrEmpty { |
| 791 | self.insts.pop(); |
| 792 | Ok(None) |
| 793 | } |
| 794 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 795 | fn check_size(&self) -> result::Result<(), Error> { |
| 796 | use std::mem::size_of; |
| 797 | |
| 798 | if self.insts.len() * size_of::<Inst>() > self.size_limit { |
| 799 | Err(Error::CompiledTooBig(self.size_limit)) |
| 800 | } else { |
| 801 | Ok(()) |
| 802 | } |
| 803 | } |
| 804 | } |
| 805 | |
| 806 | #[derive(Debug)] |
| 807 | enum Hole { |
| 808 | None, |
| 809 | One(InstPtr), |
| 810 | Many(Vec<Hole>), |
| 811 | } |
| 812 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 813 | impl Hole { |
| 814 | fn dup_one(self) -> (Self, Self) { |
| 815 | match self { |
| 816 | Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), |
| 817 | Hole::None | Hole::Many(_) => { |
| 818 | unreachable!("must be called on single hole") |
| 819 | } |
| 820 | } |
| 821 | } |
| 822 | } |
| 823 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 824 | #[derive(Clone, Debug)] |
| 825 | enum MaybeInst { |
| 826 | Compiled(Inst), |
| 827 | Uncompiled(InstHole), |
| 828 | Split, |
| 829 | Split1(InstPtr), |
| 830 | Split2(InstPtr), |
| 831 | } |
| 832 | |
| 833 | impl MaybeInst { |
| 834 | fn fill(&mut self, goto: InstPtr) { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 835 | let maybeinst = match *self { |
| 836 | MaybeInst::Split => MaybeInst::Split1(goto), |
| 837 | MaybeInst::Uncompiled(ref inst) => { |
| 838 | MaybeInst::Compiled(inst.fill(goto)) |
| 839 | } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 840 | MaybeInst::Split1(goto1) => { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 841 | MaybeInst::Compiled(Inst::Split(InstSplit { |
| 842 | goto1: goto1, |
| 843 | goto2: goto, |
| 844 | })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 845 | } |
| 846 | MaybeInst::Split2(goto2) => { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 847 | MaybeInst::Compiled(Inst::Split(InstSplit { |
| 848 | goto1: goto, |
| 849 | goto2: goto2, |
| 850 | })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 851 | } |
| 852 | _ => unreachable!( |
| 853 | "not all instructions were compiled! \ |
| 854 | found uncompiled instruction: {:?}", |
| 855 | self |
| 856 | ), |
| 857 | }; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 858 | *self = maybeinst; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 859 | } |
| 860 | |
| 861 | fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) { |
| 862 | let filled = match *self { |
| 863 | MaybeInst::Split => { |
| 864 | Inst::Split(InstSplit { goto1: goto1, goto2: goto2 }) |
| 865 | } |
| 866 | _ => unreachable!( |
| 867 | "must be called on Split instruction, \ |
| 868 | instead it was called on: {:?}", |
| 869 | self |
| 870 | ), |
| 871 | }; |
| 872 | *self = MaybeInst::Compiled(filled); |
| 873 | } |
| 874 | |
| 875 | fn half_fill_split_goto1(&mut self, goto1: InstPtr) { |
| 876 | let half_filled = match *self { |
| 877 | MaybeInst::Split => goto1, |
| 878 | _ => unreachable!( |
| 879 | "must be called on Split instruction, \ |
| 880 | instead it was called on: {:?}", |
| 881 | self |
| 882 | ), |
| 883 | }; |
| 884 | *self = MaybeInst::Split1(half_filled); |
| 885 | } |
| 886 | |
| 887 | fn half_fill_split_goto2(&mut self, goto2: InstPtr) { |
| 888 | let half_filled = match *self { |
| 889 | MaybeInst::Split => goto2, |
| 890 | _ => unreachable!( |
| 891 | "must be called on Split instruction, \ |
| 892 | instead it was called on: {:?}", |
| 893 | self |
| 894 | ), |
| 895 | }; |
| 896 | *self = MaybeInst::Split2(half_filled); |
| 897 | } |
| 898 | |
| 899 | fn unwrap(self) -> Inst { |
| 900 | match self { |
| 901 | MaybeInst::Compiled(inst) => inst, |
| 902 | _ => unreachable!( |
| 903 | "must be called on a compiled instruction, \ |
| 904 | instead it was called on: {:?}", |
| 905 | self |
| 906 | ), |
| 907 | } |
| 908 | } |
| 909 | } |
| 910 | |
| 911 | #[derive(Clone, Debug)] |
| 912 | enum InstHole { |
| 913 | Save { slot: usize }, |
| 914 | EmptyLook { look: EmptyLook }, |
| 915 | Char { c: char }, |
| 916 | Ranges { ranges: Vec<(char, char)> }, |
| 917 | Bytes { start: u8, end: u8 }, |
| 918 | } |
| 919 | |
| 920 | impl InstHole { |
| 921 | fn fill(&self, goto: InstPtr) -> Inst { |
| 922 | match *self { |
| 923 | InstHole::Save { slot } => { |
| 924 | Inst::Save(InstSave { goto: goto, slot: slot }) |
| 925 | } |
| 926 | InstHole::EmptyLook { look } => { |
| 927 | Inst::EmptyLook(InstEmptyLook { goto: goto, look: look }) |
| 928 | } |
| 929 | InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }), |
| 930 | InstHole::Ranges { ref ranges } => { |
| 931 | Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() }) |
| 932 | } |
| 933 | InstHole::Bytes { start, end } => { |
| 934 | Inst::Bytes(InstBytes { goto: goto, start: start, end: end }) |
| 935 | } |
| 936 | } |
| 937 | } |
| 938 | } |
| 939 | |
| 940 | struct CompileClass<'a, 'b> { |
| 941 | c: &'a mut Compiler, |
| 942 | ranges: &'b [hir::ClassUnicodeRange], |
| 943 | } |
| 944 | |
| 945 | impl<'a, 'b> CompileClass<'a, 'b> { |
| 946 | fn compile(mut self) -> Result { |
| 947 | let mut holes = vec![]; |
| 948 | let mut initial_entry = None; |
| 949 | let mut last_split = Hole::None; |
| 950 | let mut utf8_seqs = self.c.utf8_seqs.take().unwrap(); |
| 951 | self.c.suffix_cache.clear(); |
| 952 | |
| 953 | for (i, range) in self.ranges.iter().enumerate() { |
| 954 | let is_last_range = i + 1 == self.ranges.len(); |
| 955 | utf8_seqs.reset(range.start(), range.end()); |
| 956 | let mut it = (&mut utf8_seqs).peekable(); |
| 957 | loop { |
| 958 | let utf8_seq = match it.next() { |
| 959 | None => break, |
| 960 | Some(utf8_seq) => utf8_seq, |
| 961 | }; |
| 962 | if is_last_range && it.peek().is_none() { |
| 963 | let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; |
| 964 | holes.push(hole); |
| 965 | self.c.fill(last_split, entry); |
| 966 | last_split = Hole::None; |
| 967 | if initial_entry.is_none() { |
| 968 | initial_entry = Some(entry); |
| 969 | } |
| 970 | } else { |
| 971 | if initial_entry.is_none() { |
| 972 | initial_entry = Some(self.c.insts.len()); |
| 973 | } |
| 974 | self.c.fill_to_next(last_split); |
| 975 | last_split = self.c.push_split_hole(); |
| 976 | let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; |
| 977 | holes.push(hole); |
| 978 | last_split = |
| 979 | self.c.fill_split(last_split, Some(entry), None); |
| 980 | } |
| 981 | } |
| 982 | } |
| 983 | self.c.utf8_seqs = Some(utf8_seqs); |
| 984 | Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() }) |
| 985 | } |
| 986 | |
| 987 | fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result { |
| 988 | if self.c.compiled.is_reverse { |
| 989 | self.c_utf8_seq_(seq) |
| 990 | } else { |
| 991 | self.c_utf8_seq_(seq.into_iter().rev()) |
| 992 | } |
| 993 | } |
| 994 | |
| 995 | fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result |
| 996 | where |
| 997 | I: IntoIterator<Item = &'r Utf8Range>, |
| 998 | { |
| 999 | // The initial instruction for each UTF-8 sequence should be the same. |
| 1000 | let mut from_inst = ::std::usize::MAX; |
| 1001 | let mut last_hole = Hole::None; |
| 1002 | for byte_range in seq { |
| 1003 | let key = SuffixCacheKey { |
| 1004 | from_inst: from_inst, |
| 1005 | start: byte_range.start, |
| 1006 | end: byte_range.end, |
| 1007 | }; |
| 1008 | { |
| 1009 | let pc = self.c.insts.len(); |
| 1010 | if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) { |
| 1011 | from_inst = cached_pc; |
| 1012 | continue; |
| 1013 | } |
| 1014 | } |
| 1015 | self.c.byte_classes.set_range(byte_range.start, byte_range.end); |
| 1016 | if from_inst == ::std::usize::MAX { |
| 1017 | last_hole = self.c.push_hole(InstHole::Bytes { |
| 1018 | start: byte_range.start, |
| 1019 | end: byte_range.end, |
| 1020 | }); |
| 1021 | } else { |
| 1022 | self.c.push_compiled(Inst::Bytes(InstBytes { |
| 1023 | goto: from_inst, |
| 1024 | start: byte_range.start, |
| 1025 | end: byte_range.end, |
| 1026 | })); |
| 1027 | } |
| 1028 | from_inst = self.c.insts.len().checked_sub(1).unwrap(); |
| 1029 | debug_assert!(from_inst < ::std::usize::MAX); |
| 1030 | } |
| 1031 | debug_assert!(from_inst < ::std::usize::MAX); |
| 1032 | Ok(Patch { hole: last_hole, entry: from_inst }) |
| 1033 | } |
| 1034 | } |
| 1035 | |
| 1036 | /// `SuffixCache` is a simple bounded hash map for caching suffix entries in |
| 1037 | /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}. |
| 1038 | /// The set of byte ranges looks like this: |
| 1039 | /// |
| 1040 | /// [0-7F] |
| 1041 | /// [C2-DF][80-BF] |
| 1042 | /// [E0][A0-BF][80-BF] |
| 1043 | /// [E1-EC][80-BF][80-BF] |
| 1044 | /// [ED][80-9F][80-BF] |
| 1045 | /// [EE-EF][80-BF][80-BF] |
| 1046 | /// |
| 1047 | /// Each line above translates to one alternate in the compiled regex program. |
| 1048 | /// However, all but one of the alternates end in the same suffix, which is |
| 1049 | /// a waste of an instruction. The suffix cache facilitates reusing them across |
| 1050 | /// alternates. |
| 1051 | /// |
| 1052 | /// Note that a HashMap could be trivially used for this, but we don't need its |
| 1053 | /// overhead. Some small bounded space (LRU style) is more than enough. |
| 1054 | /// |
| 1055 | /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html), |
| 1056 | /// except it uses hashes as original indices and then compares full keys for |
| 1057 | /// validation against `dense` array. |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 1058 | #[derive(Debug)] |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 1059 | struct SuffixCache { |
| 1060 | sparse: Box<[usize]>, |
| 1061 | dense: Vec<SuffixCacheEntry>, |
| 1062 | } |
| 1063 | |
| 1064 | #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] |
| 1065 | struct SuffixCacheEntry { |
| 1066 | key: SuffixCacheKey, |
| 1067 | pc: InstPtr, |
| 1068 | } |
| 1069 | |
| 1070 | #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] |
| 1071 | struct SuffixCacheKey { |
| 1072 | from_inst: InstPtr, |
| 1073 | start: u8, |
| 1074 | end: u8, |
| 1075 | } |
| 1076 | |
| 1077 | impl SuffixCache { |
| 1078 | fn new(size: usize) -> Self { |
| 1079 | SuffixCache { |
| 1080 | sparse: vec![0usize; size].into(), |
| 1081 | dense: Vec::with_capacity(size), |
| 1082 | } |
| 1083 | } |
| 1084 | |
| 1085 | fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> { |
| 1086 | let hash = self.hash(&key); |
| 1087 | let pos = &mut self.sparse[hash]; |
| 1088 | if let Some(entry) = self.dense.get(*pos) { |
| 1089 | if entry.key == key { |
| 1090 | return Some(entry.pc); |
| 1091 | } |
| 1092 | } |
| 1093 | *pos = self.dense.len(); |
| 1094 | self.dense.push(SuffixCacheEntry { key: key, pc: pc }); |
| 1095 | None |
| 1096 | } |
| 1097 | |
| 1098 | fn clear(&mut self) { |
| 1099 | self.dense.clear(); |
| 1100 | } |
| 1101 | |
| 1102 | fn hash(&self, suffix: &SuffixCacheKey) -> usize { |
| 1103 | // Basic FNV-1a hash as described: |
| 1104 | // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function |
| 1105 | const FNV_PRIME: u64 = 1099511628211; |
| 1106 | let mut h = 14695981039346656037; |
| 1107 | h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME); |
| 1108 | h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME); |
| 1109 | h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME); |
| 1110 | (h as usize) % self.sparse.len() |
| 1111 | } |
| 1112 | } |
| 1113 | |
| 1114 | struct ByteClassSet([bool; 256]); |
| 1115 | |
| 1116 | impl ByteClassSet { |
| 1117 | fn new() -> Self { |
| 1118 | ByteClassSet([false; 256]) |
| 1119 | } |
| 1120 | |
| 1121 | fn set_range(&mut self, start: u8, end: u8) { |
| 1122 | debug_assert!(start <= end); |
| 1123 | if start > 0 { |
| 1124 | self.0[start as usize - 1] = true; |
| 1125 | } |
| 1126 | self.0[end as usize] = true; |
| 1127 | } |
| 1128 | |
| 1129 | fn set_word_boundary(&mut self) { |
| 1130 | // We need to mark all ranges of bytes whose pairs result in |
| 1131 | // evaluating \b differently. |
| 1132 | let iswb = is_word_byte; |
| 1133 | let mut b1: u16 = 0; |
| 1134 | let mut b2: u16; |
| 1135 | while b1 <= 255 { |
| 1136 | b2 = b1 + 1; |
| 1137 | while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { |
| 1138 | b2 += 1; |
| 1139 | } |
| 1140 | self.set_range(b1 as u8, (b2 - 1) as u8); |
| 1141 | b1 = b2; |
| 1142 | } |
| 1143 | } |
| 1144 | |
| 1145 | fn byte_classes(&self) -> Vec<u8> { |
| 1146 | // N.B. If you're debugging the DFA, it's useful to simply return |
| 1147 | // `(0..256).collect()`, which effectively removes the byte classes |
| 1148 | // and makes the transitions easier to read. |
| 1149 | // (0usize..256).map(|x| x as u8).collect() |
| 1150 | let mut byte_classes = vec![0; 256]; |
| 1151 | let mut class = 0u8; |
| 1152 | let mut i = 0; |
| 1153 | loop { |
| 1154 | byte_classes[i] = class as u8; |
| 1155 | if i >= 255 { |
| 1156 | break; |
| 1157 | } |
| 1158 | if self.0[i] { |
| 1159 | class = class.checked_add(1).unwrap(); |
| 1160 | } |
| 1161 | i += 1; |
| 1162 | } |
| 1163 | byte_classes |
| 1164 | } |
| 1165 | } |
| 1166 | |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 1167 | impl fmt::Debug for ByteClassSet { |
| 1168 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 1169 | f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish() |
| 1170 | } |
| 1171 | } |
| 1172 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 1173 | fn u32_to_usize(n: u32) -> usize { |
| 1174 | // In case usize is less than 32 bits, we need to guard against overflow. |
| 1175 | // On most platforms this compiles to nothing. |
| 1176 | // TODO Use `std::convert::TryFrom` once it's stable. |
| 1177 | if (n as u64) > (::std::usize::MAX as u64) { |
| 1178 | panic!("BUG: {} is too big to be pointer sized", n) |
| 1179 | } |
| 1180 | n as usize |
| 1181 | } |
| 1182 | |
| 1183 | #[cfg(test)] |
| 1184 | mod tests { |
| 1185 | use super::ByteClassSet; |
| 1186 | |
| 1187 | #[test] |
| 1188 | fn byte_classes() { |
| 1189 | let mut set = ByteClassSet::new(); |
| 1190 | set.set_range(b'a', b'z'); |
| 1191 | let classes = set.byte_classes(); |
| 1192 | assert_eq!(classes[0], 0); |
| 1193 | assert_eq!(classes[1], 0); |
| 1194 | assert_eq!(classes[2], 0); |
| 1195 | assert_eq!(classes[b'a' as usize - 1], 0); |
| 1196 | assert_eq!(classes[b'a' as usize], 1); |
| 1197 | assert_eq!(classes[b'm' as usize], 1); |
| 1198 | assert_eq!(classes[b'z' as usize], 1); |
| 1199 | assert_eq!(classes[b'z' as usize + 1], 2); |
| 1200 | assert_eq!(classes[254], 2); |
| 1201 | assert_eq!(classes[255], 2); |
| 1202 | |
| 1203 | let mut set = ByteClassSet::new(); |
| 1204 | set.set_range(0, 2); |
| 1205 | set.set_range(4, 6); |
| 1206 | let classes = set.byte_classes(); |
| 1207 | assert_eq!(classes[0], 0); |
| 1208 | assert_eq!(classes[1], 0); |
| 1209 | assert_eq!(classes[2], 0); |
| 1210 | assert_eq!(classes[3], 1); |
| 1211 | assert_eq!(classes[4], 2); |
| 1212 | assert_eq!(classes[5], 2); |
| 1213 | assert_eq!(classes[6], 2); |
| 1214 | assert_eq!(classes[7], 3); |
| 1215 | assert_eq!(classes[255], 3); |
| 1216 | } |
| 1217 | |
| 1218 | #[test] |
| 1219 | fn full_byte_classes() { |
| 1220 | let mut set = ByteClassSet::new(); |
| 1221 | for i in 0..256u16 { |
| 1222 | set.set_range(i as u8, i as u8); |
| 1223 | } |
| 1224 | assert_eq!(set.byte_classes().len(), 256); |
| 1225 | } |
| 1226 | } |