Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 1 | use std::collections::HashMap; |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 2 | use std::fmt; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 3 | use std::iter; |
| 4 | use std::result; |
| 5 | use std::sync::Arc; |
| 6 | |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 7 | use regex_syntax::hir::{self, Hir}; |
| 8 | use regex_syntax::is_word_byte; |
| 9 | use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 10 | |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 11 | use crate::prog::{ |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 12 | EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges, |
| 13 | InstSave, InstSplit, Program, |
| 14 | }; |
| 15 | |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 16 | use crate::Error; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 17 | |
| 18 | type Result = result::Result<Patch, Error>; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 19 | type ResultOrEmpty = result::Result<Option<Patch>, Error>; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 20 | |
| 21 | #[derive(Debug)] |
| 22 | struct Patch { |
| 23 | hole: Hole, |
| 24 | entry: InstPtr, |
| 25 | } |
| 26 | |
| 27 | /// A compiler translates a regular expression AST to a sequence of |
| 28 | /// instructions. The sequence of instructions represents an NFA. |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 29 | // `Compiler` is only public via the `internal` module, so avoid deriving |
| 30 | // `Debug`. |
| 31 | #[allow(missing_debug_implementations)] |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 32 | pub struct Compiler { |
| 33 | insts: Vec<MaybeInst>, |
| 34 | compiled: Program, |
| 35 | capture_name_idx: HashMap<String, usize>, |
| 36 | num_exprs: usize, |
| 37 | size_limit: usize, |
| 38 | suffix_cache: SuffixCache, |
| 39 | utf8_seqs: Option<Utf8Sequences>, |
| 40 | byte_classes: ByteClassSet, |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 41 | extra_inst_bytes: usize, |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 42 | } |
| 43 | |
| 44 | impl Compiler { |
| 45 | /// Create a new regular expression compiler. |
| 46 | /// |
| 47 | /// Various options can be set before calling `compile` on an expression. |
| 48 | pub fn new() -> Self { |
| 49 | Compiler { |
| 50 | insts: vec![], |
| 51 | compiled: Program::new(), |
| 52 | capture_name_idx: HashMap::new(), |
| 53 | num_exprs: 0, |
| 54 | size_limit: 10 * (1 << 20), |
| 55 | suffix_cache: SuffixCache::new(1000), |
| 56 | utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), |
| 57 | byte_classes: ByteClassSet::new(), |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 58 | extra_inst_bytes: 0, |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 59 | } |
| 60 | } |
| 61 | |
| 62 | /// The size of the resulting program is limited by size_limit. If |
| 63 | /// the program approximately exceeds the given size (in bytes), then |
| 64 | /// compilation will stop and return an error. |
| 65 | pub fn size_limit(mut self, size_limit: usize) -> Self { |
| 66 | self.size_limit = size_limit; |
| 67 | self |
| 68 | } |
| 69 | |
| 70 | /// If bytes is true, then the program is compiled as a byte based |
| 71 | /// automaton, which incorporates UTF-8 decoding into the machine. If it's |
| 72 | /// false, then the automaton is Unicode scalar value based, e.g., an |
| 73 | /// engine utilizing such an automaton is responsible for UTF-8 decoding. |
| 74 | /// |
| 75 | /// The specific invariant is that when returning a byte based machine, |
| 76 | /// the neither the `Char` nor `Ranges` instructions are produced. |
| 77 | /// Conversely, when producing a Unicode scalar value machine, the `Bytes` |
| 78 | /// instruction is never produced. |
| 79 | /// |
| 80 | /// Note that `dfa(true)` implies `bytes(true)`. |
| 81 | pub fn bytes(mut self, yes: bool) -> Self { |
| 82 | self.compiled.is_bytes = yes; |
| 83 | self |
| 84 | } |
| 85 | |
| 86 | /// When disabled, the program compiled may match arbitrary bytes. |
| 87 | /// |
| 88 | /// When enabled (the default), all compiled programs exclusively match |
| 89 | /// valid UTF-8 bytes. |
| 90 | pub fn only_utf8(mut self, yes: bool) -> Self { |
| 91 | self.compiled.only_utf8 = yes; |
| 92 | self |
| 93 | } |
| 94 | |
| 95 | /// When set, the machine returned is suitable for use in the DFA matching |
| 96 | /// engine. |
| 97 | /// |
| 98 | /// In particular, this ensures that if the regex is not anchored in the |
| 99 | /// beginning, then a preceding `.*?` is included in the program. (The NFA |
| 100 | /// based engines handle the preceding `.*?` explicitly, which is difficult |
| 101 | /// or impossible in the DFA engine.) |
| 102 | pub fn dfa(mut self, yes: bool) -> Self { |
| 103 | self.compiled.is_dfa = yes; |
| 104 | self |
| 105 | } |
| 106 | |
| 107 | /// When set, the machine returned is suitable for matching text in |
| 108 | /// reverse. In particular, all concatenations are flipped. |
| 109 | pub fn reverse(mut self, yes: bool) -> Self { |
| 110 | self.compiled.is_reverse = yes; |
| 111 | self |
| 112 | } |
| 113 | |
| 114 | /// Compile a regular expression given its AST. |
| 115 | /// |
| 116 | /// The compiler is guaranteed to succeed unless the program exceeds the |
| 117 | /// specified size limit. If the size limit is exceeded, then compilation |
| 118 | /// stops and returns an error. |
| 119 | pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> { |
| 120 | debug_assert!(!exprs.is_empty()); |
| 121 | self.num_exprs = exprs.len(); |
| 122 | if exprs.len() == 1 { |
| 123 | self.compile_one(&exprs[0]) |
| 124 | } else { |
| 125 | self.compile_many(exprs) |
| 126 | } |
| 127 | } |
| 128 | |
| 129 | fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> { |
| 130 | // If we're compiling a forward DFA and we aren't anchored, then |
| 131 | // add a `.*?` before the first capture group. |
| 132 | // Other matching engines handle this by baking the logic into the |
| 133 | // matching engine itself. |
| 134 | let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; |
| 135 | self.compiled.is_anchored_start = expr.is_anchored_start(); |
| 136 | self.compiled.is_anchored_end = expr.is_anchored_end(); |
| 137 | if self.compiled.needs_dotstar() { |
| 138 | dotstar_patch = self.c_dotstar()?; |
| 139 | self.compiled.start = dotstar_patch.entry; |
| 140 | } |
| 141 | self.compiled.captures = vec![None]; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 142 | let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst()); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 143 | if self.compiled.needs_dotstar() { |
| 144 | self.fill(dotstar_patch.hole, patch.entry); |
| 145 | } else { |
| 146 | self.compiled.start = patch.entry; |
| 147 | } |
| 148 | self.fill_to_next(patch.hole); |
| 149 | self.compiled.matches = vec![self.insts.len()]; |
| 150 | self.push_compiled(Inst::Match(0)); |
| 151 | self.compile_finish() |
| 152 | } |
| 153 | |
| 154 | fn compile_many( |
| 155 | mut self, |
| 156 | exprs: &[Hir], |
| 157 | ) -> result::Result<Program, Error> { |
| 158 | debug_assert!(exprs.len() > 1); |
| 159 | |
| 160 | self.compiled.is_anchored_start = |
| 161 | exprs.iter().all(|e| e.is_anchored_start()); |
| 162 | self.compiled.is_anchored_end = |
| 163 | exprs.iter().all(|e| e.is_anchored_end()); |
| 164 | let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; |
| 165 | if self.compiled.needs_dotstar() { |
| 166 | dotstar_patch = self.c_dotstar()?; |
| 167 | self.compiled.start = dotstar_patch.entry; |
| 168 | } else { |
| 169 | self.compiled.start = 0; // first instruction is always split |
| 170 | } |
| 171 | self.fill_to_next(dotstar_patch.hole); |
| 172 | |
| 173 | let mut prev_hole = Hole::None; |
| 174 | for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { |
| 175 | self.fill_to_next(prev_hole); |
| 176 | let split = self.push_split_hole(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 177 | let Patch { hole, entry } = |
| 178 | self.c_capture(0, expr)?.unwrap_or(self.next_inst()); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 179 | self.fill_to_next(hole); |
| 180 | self.compiled.matches.push(self.insts.len()); |
| 181 | self.push_compiled(Inst::Match(i)); |
| 182 | prev_hole = self.fill_split(split, Some(entry), None); |
| 183 | } |
| 184 | let i = exprs.len() - 1; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 185 | let Patch { hole, entry } = |
| 186 | self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst()); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 187 | self.fill(prev_hole, entry); |
| 188 | self.fill_to_next(hole); |
| 189 | self.compiled.matches.push(self.insts.len()); |
| 190 | self.push_compiled(Inst::Match(i)); |
| 191 | self.compile_finish() |
| 192 | } |
| 193 | |
| 194 | fn compile_finish(mut self) -> result::Result<Program, Error> { |
| 195 | self.compiled.insts = |
| 196 | self.insts.into_iter().map(|inst| inst.unwrap()).collect(); |
| 197 | self.compiled.byte_classes = self.byte_classes.byte_classes(); |
| 198 | self.compiled.capture_name_idx = Arc::new(self.capture_name_idx); |
| 199 | Ok(self.compiled) |
| 200 | } |
| 201 | |
| 202 | /// Compile expr into self.insts, returning a patch on success, |
| 203 | /// or an error if we run out of memory. |
| 204 | /// |
| 205 | /// All of the c_* methods of the compiler share the contract outlined |
| 206 | /// here. |
| 207 | /// |
| 208 | /// The main thing that a c_* method does is mutate `self.insts` |
| 209 | /// to add a list of mostly compiled instructions required to execute |
| 210 | /// the given expression. `self.insts` contains MaybeInsts rather than |
| 211 | /// Insts because there is some backpatching required. |
| 212 | /// |
| 213 | /// The `Patch` value returned by each c_* method provides metadata |
| 214 | /// about the compiled instructions emitted to `self.insts`. The |
| 215 | /// `entry` member of the patch refers to the first instruction |
| 216 | /// (the entry point), while the `hole` member contains zero or |
| 217 | /// more offsets to partial instructions that need to be backpatched. |
| 218 | /// The c_* routine can't know where its list of instructions are going to |
| 219 | /// jump to after execution, so it is up to the caller to patch |
| 220 | /// these jumps to point to the right place. So compiling some |
| 221 | /// expression, e, we would end up with a situation that looked like: |
| 222 | /// |
| 223 | /// ```text |
| 224 | /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...] |
| 225 | /// ^ ^ ^ |
| 226 | /// | \ / |
| 227 | /// entry \ / |
| 228 | /// hole |
| 229 | /// ``` |
| 230 | /// |
Chih-Hung Hsieh | 849e445 | 2020-10-26 13:16:47 -0700 | [diff] [blame] | 231 | /// To compile two expressions, e1 and e2, concatenated together we |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 232 | /// would do: |
| 233 | /// |
| 234 | /// ```ignore |
| 235 | /// let patch1 = self.c(e1); |
| 236 | /// let patch2 = self.c(e2); |
| 237 | /// ``` |
| 238 | /// |
| 239 | /// while leaves us with a situation that looks like |
| 240 | /// |
| 241 | /// ```text |
| 242 | /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ] |
| 243 | /// ^ ^ ^ ^ |
| 244 | /// | | | | |
| 245 | /// entry1 hole1 entry2 hole2 |
| 246 | /// ``` |
| 247 | /// |
| 248 | /// Then to merge the two patches together into one we would backpatch |
| 249 | /// hole1 with entry2 and return a new patch that enters at entry1 |
| 250 | /// and has hole2 for a hole. In fact, if you look at the c_concat |
| 251 | /// method you will see that it does exactly this, though it handles |
| 252 | /// a list of expressions rather than just the two that we use for |
| 253 | /// an example. |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 254 | /// |
| 255 | /// Ok(None) is returned when an expression is compiled to no |
| 256 | /// instruction, and so no patch.entry value makes sense. |
| 257 | fn c(&mut self, expr: &Hir) -> ResultOrEmpty { |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 258 | use crate::prog; |
| 259 | use regex_syntax::hir::HirKind::*; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 260 | |
| 261 | self.check_size()?; |
| 262 | match *expr.kind() { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 263 | Empty => Ok(None), |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 264 | Literal(hir::Literal::Unicode(c)) => self.c_char(c), |
| 265 | Literal(hir::Literal::Byte(b)) => { |
| 266 | assert!(self.compiled.uses_bytes()); |
| 267 | self.c_byte(b) |
| 268 | } |
| 269 | Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), |
| 270 | Class(hir::Class::Bytes(ref cls)) => { |
| 271 | if self.compiled.uses_bytes() { |
| 272 | self.c_class_bytes(cls.ranges()) |
| 273 | } else { |
| 274 | assert!(cls.is_all_ascii()); |
| 275 | let mut char_ranges = vec![]; |
| 276 | for r in cls.iter() { |
| 277 | let (s, e) = (r.start() as char, r.end() as char); |
| 278 | char_ranges.push(hir::ClassUnicodeRange::new(s, e)); |
| 279 | } |
| 280 | self.c_class(&char_ranges) |
| 281 | } |
| 282 | } |
| 283 | Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { |
| 284 | self.byte_classes.set_range(b'\n', b'\n'); |
| 285 | self.c_empty_look(prog::EmptyLook::EndLine) |
| 286 | } |
| 287 | Anchor(hir::Anchor::StartLine) => { |
| 288 | self.byte_classes.set_range(b'\n', b'\n'); |
| 289 | self.c_empty_look(prog::EmptyLook::StartLine) |
| 290 | } |
| 291 | Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { |
| 292 | self.byte_classes.set_range(b'\n', b'\n'); |
| 293 | self.c_empty_look(prog::EmptyLook::StartLine) |
| 294 | } |
| 295 | Anchor(hir::Anchor::EndLine) => { |
| 296 | self.byte_classes.set_range(b'\n', b'\n'); |
| 297 | self.c_empty_look(prog::EmptyLook::EndLine) |
| 298 | } |
| 299 | Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { |
| 300 | self.c_empty_look(prog::EmptyLook::EndText) |
| 301 | } |
| 302 | Anchor(hir::Anchor::StartText) => { |
| 303 | self.c_empty_look(prog::EmptyLook::StartText) |
| 304 | } |
| 305 | Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { |
| 306 | self.c_empty_look(prog::EmptyLook::StartText) |
| 307 | } |
| 308 | Anchor(hir::Anchor::EndText) => { |
| 309 | self.c_empty_look(prog::EmptyLook::EndText) |
| 310 | } |
| 311 | WordBoundary(hir::WordBoundary::Unicode) => { |
| 312 | if !cfg!(feature = "unicode-perl") { |
| 313 | return Err(Error::Syntax( |
| 314 | "Unicode word boundaries are unavailable when \ |
| 315 | the unicode-perl feature is disabled" |
| 316 | .to_string(), |
| 317 | )); |
| 318 | } |
| 319 | self.compiled.has_unicode_word_boundary = true; |
| 320 | self.byte_classes.set_word_boundary(); |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 321 | // We also make sure that all ASCII bytes are in a different |
| 322 | // class from non-ASCII bytes. Otherwise, it's possible for |
| 323 | // ASCII bytes to get lumped into the same class as non-ASCII |
| 324 | // bytes. This in turn may cause the lazy DFA to falsely start |
| 325 | // when it sees an ASCII byte that maps to a byte class with |
| 326 | // non-ASCII bytes. This ensures that never happens. |
| 327 | self.byte_classes.set_range(0, 0x7F); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 328 | self.c_empty_look(prog::EmptyLook::WordBoundary) |
| 329 | } |
| 330 | WordBoundary(hir::WordBoundary::UnicodeNegate) => { |
| 331 | if !cfg!(feature = "unicode-perl") { |
| 332 | return Err(Error::Syntax( |
| 333 | "Unicode word boundaries are unavailable when \ |
| 334 | the unicode-perl feature is disabled" |
| 335 | .to_string(), |
| 336 | )); |
| 337 | } |
| 338 | self.compiled.has_unicode_word_boundary = true; |
| 339 | self.byte_classes.set_word_boundary(); |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 340 | // See comments above for why we set the ASCII range here. |
| 341 | self.byte_classes.set_range(0, 0x7F); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 342 | self.c_empty_look(prog::EmptyLook::NotWordBoundary) |
| 343 | } |
| 344 | WordBoundary(hir::WordBoundary::Ascii) => { |
| 345 | self.byte_classes.set_word_boundary(); |
| 346 | self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) |
| 347 | } |
| 348 | WordBoundary(hir::WordBoundary::AsciiNegate) => { |
| 349 | self.byte_classes.set_word_boundary(); |
| 350 | self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) |
| 351 | } |
| 352 | Group(ref g) => match g.kind { |
| 353 | hir::GroupKind::NonCapturing => self.c(&g.hir), |
| 354 | hir::GroupKind::CaptureIndex(index) => { |
| 355 | if index as usize >= self.compiled.captures.len() { |
| 356 | self.compiled.captures.push(None); |
| 357 | } |
| 358 | self.c_capture(2 * index as usize, &g.hir) |
| 359 | } |
| 360 | hir::GroupKind::CaptureName { index, ref name } => { |
| 361 | if index as usize >= self.compiled.captures.len() { |
| 362 | let n = name.to_string(); |
| 363 | self.compiled.captures.push(Some(n.clone())); |
| 364 | self.capture_name_idx.insert(n, index as usize); |
| 365 | } |
| 366 | self.c_capture(2 * index as usize, &g.hir) |
| 367 | } |
| 368 | }, |
| 369 | Concat(ref es) => { |
| 370 | if self.compiled.is_reverse { |
| 371 | self.c_concat(es.iter().rev()) |
| 372 | } else { |
| 373 | self.c_concat(es) |
| 374 | } |
| 375 | } |
| 376 | Alternation(ref es) => self.c_alternate(&**es), |
| 377 | Repetition(ref rep) => self.c_repeat(rep), |
| 378 | } |
| 379 | } |
| 380 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 381 | fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 382 | if self.num_exprs > 1 || self.compiled.is_dfa { |
| 383 | // Don't ever compile Save instructions for regex sets because |
| 384 | // they are never used. They are also never used in DFA programs |
| 385 | // because DFAs can't handle captures. |
| 386 | self.c(expr) |
| 387 | } else { |
| 388 | let entry = self.insts.len(); |
| 389 | let hole = self.push_hole(InstHole::Save { slot: first_slot }); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 390 | let patch = self.c(expr)?.unwrap_or(self.next_inst()); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 391 | self.fill(hole, patch.entry); |
| 392 | self.fill_to_next(patch.hole); |
| 393 | let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 394 | Ok(Some(Patch { hole: hole, entry: entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 395 | } |
| 396 | } |
| 397 | |
| 398 | fn c_dotstar(&mut self) -> Result { |
| 399 | Ok(if !self.compiled.only_utf8() { |
| 400 | self.c(&Hir::repetition(hir::Repetition { |
| 401 | kind: hir::RepetitionKind::ZeroOrMore, |
| 402 | greedy: false, |
| 403 | hir: Box::new(Hir::any(true)), |
| 404 | }))? |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 405 | .unwrap() |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 406 | } else { |
| 407 | self.c(&Hir::repetition(hir::Repetition { |
| 408 | kind: hir::RepetitionKind::ZeroOrMore, |
| 409 | greedy: false, |
| 410 | hir: Box::new(Hir::any(false)), |
| 411 | }))? |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 412 | .unwrap() |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 413 | }) |
| 414 | } |
| 415 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 416 | fn c_char(&mut self, c: char) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 417 | if self.compiled.uses_bytes() { |
| 418 | if c.is_ascii() { |
| 419 | let b = c as u8; |
| 420 | let hole = |
| 421 | self.push_hole(InstHole::Bytes { start: b, end: b }); |
| 422 | self.byte_classes.set_range(b, b); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 423 | Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 424 | } else { |
| 425 | self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) |
| 426 | } |
| 427 | } else { |
| 428 | let hole = self.push_hole(InstHole::Char { c: c }); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 429 | Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 430 | } |
| 431 | } |
| 432 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 433 | fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 434 | use std::mem::size_of; |
| 435 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 436 | assert!(!ranges.is_empty()); |
| 437 | if self.compiled.uses_bytes() { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 438 | Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?)) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 439 | } else { |
| 440 | let ranges: Vec<(char, char)> = |
| 441 | ranges.iter().map(|r| (r.start(), r.end())).collect(); |
| 442 | let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 { |
| 443 | self.push_hole(InstHole::Char { c: ranges[0].0 }) |
| 444 | } else { |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 445 | self.extra_inst_bytes += |
| 446 | ranges.len() * (size_of::<char>() * 2); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 447 | self.push_hole(InstHole::Ranges { ranges: ranges }) |
| 448 | }; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 449 | Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 450 | } |
| 451 | } |
| 452 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 453 | fn c_byte(&mut self, b: u8) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 454 | self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) |
| 455 | } |
| 456 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 457 | fn c_class_bytes( |
| 458 | &mut self, |
| 459 | ranges: &[hir::ClassBytesRange], |
| 460 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 461 | debug_assert!(!ranges.is_empty()); |
| 462 | |
| 463 | let first_split_entry = self.insts.len(); |
| 464 | let mut holes = vec![]; |
| 465 | let mut prev_hole = Hole::None; |
| 466 | for r in &ranges[0..ranges.len() - 1] { |
| 467 | self.fill_to_next(prev_hole); |
| 468 | let split = self.push_split_hole(); |
| 469 | let next = self.insts.len(); |
| 470 | self.byte_classes.set_range(r.start(), r.end()); |
| 471 | holes.push(self.push_hole(InstHole::Bytes { |
| 472 | start: r.start(), |
| 473 | end: r.end(), |
| 474 | })); |
| 475 | prev_hole = self.fill_split(split, Some(next), None); |
| 476 | } |
| 477 | let next = self.insts.len(); |
| 478 | let r = &ranges[ranges.len() - 1]; |
| 479 | self.byte_classes.set_range(r.start(), r.end()); |
| 480 | holes.push( |
| 481 | self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }), |
| 482 | ); |
| 483 | self.fill(prev_hole, next); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 484 | Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 485 | } |
| 486 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 487 | fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 488 | let hole = self.push_hole(InstHole::EmptyLook { look: look }); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 489 | Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 490 | } |
| 491 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 492 | fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 493 | where |
| 494 | I: IntoIterator<Item = &'a Hir>, |
| 495 | { |
| 496 | let mut exprs = exprs.into_iter(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 497 | let Patch { mut hole, entry } = loop { |
| 498 | match exprs.next() { |
| 499 | None => return Ok(None), |
| 500 | Some(e) => { |
| 501 | if let Some(p) = self.c(e)? { |
| 502 | break p; |
| 503 | } |
| 504 | } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 505 | } |
| 506 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 507 | for e in exprs { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 508 | if let Some(p) = self.c(e)? { |
| 509 | self.fill(hole, p.entry); |
| 510 | hole = p.hole; |
| 511 | } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 512 | } |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 513 | Ok(Some(Patch { hole: hole, entry: entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 514 | } |
| 515 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 516 | fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 517 | debug_assert!( |
| 518 | exprs.len() >= 2, |
| 519 | "alternates must have at least 2 exprs" |
| 520 | ); |
| 521 | |
| 522 | // Initial entry point is always the first split. |
| 523 | let first_split_entry = self.insts.len(); |
| 524 | |
| 525 | // Save up all of the holes from each alternate. They will all get |
| 526 | // patched to point to the same location. |
| 527 | let mut holes = vec![]; |
| 528 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 529 | // true indicates that the hole is a split where we want to fill |
| 530 | // the second branch. |
| 531 | let mut prev_hole = (Hole::None, false); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 532 | for e in &exprs[0..exprs.len() - 1] { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 533 | if prev_hole.1 { |
| 534 | let next = self.insts.len(); |
| 535 | self.fill_split(prev_hole.0, None, Some(next)); |
| 536 | } else { |
| 537 | self.fill_to_next(prev_hole.0); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 538 | } |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 539 | let split = self.push_split_hole(); |
| 540 | if let Some(Patch { hole, entry }) = self.c(e)? { |
| 541 | holes.push(hole); |
| 542 | prev_hole = (self.fill_split(split, Some(entry), None), false); |
| 543 | } else { |
| 544 | let (split1, split2) = split.dup_one(); |
| 545 | holes.push(split1); |
| 546 | prev_hole = (split2, true); |
| 547 | } |
| 548 | } |
| 549 | if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 550 | holes.push(hole); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 551 | if prev_hole.1 { |
| 552 | self.fill_split(prev_hole.0, None, Some(entry)); |
| 553 | } else { |
| 554 | self.fill(prev_hole.0, entry); |
| 555 | } |
| 556 | } else { |
| 557 | // We ignore prev_hole.1. When it's true, it means we have two |
| 558 | // empty branches both pushing prev_hole.0 into holes, so both |
| 559 | // branches will go to the same place anyway. |
| 560 | holes.push(prev_hole.0); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 561 | } |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 562 | Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 563 | } |
| 564 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 565 | fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 566 | use regex_syntax::hir::RepetitionKind::*; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 567 | match rep.kind { |
| 568 | ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), |
| 569 | ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), |
| 570 | OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), |
| 571 | Range(hir::RepetitionRange::Exactly(min_max)) => { |
| 572 | self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) |
| 573 | } |
| 574 | Range(hir::RepetitionRange::AtLeast(min)) => { |
| 575 | self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) |
| 576 | } |
| 577 | Range(hir::RepetitionRange::Bounded(min, max)) => { |
| 578 | self.c_repeat_range(&rep.hir, rep.greedy, min, max) |
| 579 | } |
| 580 | } |
| 581 | } |
| 582 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 583 | fn c_repeat_zero_or_one( |
| 584 | &mut self, |
| 585 | expr: &Hir, |
| 586 | greedy: bool, |
| 587 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 588 | let split_entry = self.insts.len(); |
| 589 | let split = self.push_split_hole(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 590 | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { |
| 591 | Some(p) => p, |
| 592 | None => return self.pop_split_hole(), |
| 593 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 594 | let split_hole = if greedy { |
| 595 | self.fill_split(split, Some(entry_rep), None) |
| 596 | } else { |
| 597 | self.fill_split(split, None, Some(entry_rep)) |
| 598 | }; |
| 599 | let holes = vec![hole_rep, split_hole]; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 600 | Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 601 | } |
| 602 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 603 | fn c_repeat_zero_or_more( |
| 604 | &mut self, |
| 605 | expr: &Hir, |
| 606 | greedy: bool, |
| 607 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 608 | let split_entry = self.insts.len(); |
| 609 | let split = self.push_split_hole(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 610 | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { |
| 611 | Some(p) => p, |
| 612 | None => return self.pop_split_hole(), |
| 613 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 614 | |
| 615 | self.fill(hole_rep, split_entry); |
| 616 | let split_hole = if greedy { |
| 617 | self.fill_split(split, Some(entry_rep), None) |
| 618 | } else { |
| 619 | self.fill_split(split, None, Some(entry_rep)) |
| 620 | }; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 621 | Ok(Some(Patch { hole: split_hole, entry: split_entry })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 622 | } |
| 623 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 624 | fn c_repeat_one_or_more( |
| 625 | &mut self, |
| 626 | expr: &Hir, |
| 627 | greedy: bool, |
| 628 | ) -> ResultOrEmpty { |
| 629 | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { |
| 630 | Some(p) => p, |
| 631 | None => return Ok(None), |
| 632 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 633 | self.fill_to_next(hole_rep); |
| 634 | let split = self.push_split_hole(); |
| 635 | |
| 636 | let split_hole = if greedy { |
| 637 | self.fill_split(split, Some(entry_rep), None) |
| 638 | } else { |
| 639 | self.fill_split(split, None, Some(entry_rep)) |
| 640 | }; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 641 | Ok(Some(Patch { hole: split_hole, entry: entry_rep })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 642 | } |
| 643 | |
| 644 | fn c_repeat_range_min_or_more( |
| 645 | &mut self, |
| 646 | expr: &Hir, |
| 647 | greedy: bool, |
| 648 | min: u32, |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 649 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 650 | let min = u32_to_usize(min); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 651 | // Using next_inst() is ok, because we can't return it (concat would |
| 652 | // have to return Some(_) while c_repeat_range_min_or_more returns |
| 653 | // None). |
| 654 | let patch_concat = self |
| 655 | .c_concat(iter::repeat(expr).take(min))? |
| 656 | .unwrap_or(self.next_inst()); |
| 657 | if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { |
| 658 | self.fill(patch_concat.hole, patch_rep.entry); |
| 659 | Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) |
| 660 | } else { |
| 661 | Ok(None) |
| 662 | } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 663 | } |
| 664 | |
| 665 | fn c_repeat_range( |
| 666 | &mut self, |
| 667 | expr: &Hir, |
| 668 | greedy: bool, |
| 669 | min: u32, |
| 670 | max: u32, |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 671 | ) -> ResultOrEmpty { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 672 | let (min, max) = (u32_to_usize(min), u32_to_usize(max)); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 673 | debug_assert!(min <= max); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 674 | let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 675 | if min == max { |
| 676 | return Ok(patch_concat); |
| 677 | } |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 678 | // Same reasoning as in c_repeat_range_min_or_more (we know that min < |
| 679 | // max at this point). |
| 680 | let patch_concat = patch_concat.unwrap_or(self.next_inst()); |
| 681 | let initial_entry = patch_concat.entry; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 682 | // It is much simpler to compile, e.g., `a{2,5}` as: |
| 683 | // |
| 684 | // aaa?a?a? |
| 685 | // |
| 686 | // But you end up with a sequence of instructions like this: |
| 687 | // |
| 688 | // 0: 'a' |
| 689 | // 1: 'a', |
| 690 | // 2: split(3, 4) |
| 691 | // 3: 'a' |
| 692 | // 4: split(5, 6) |
| 693 | // 5: 'a' |
| 694 | // 6: split(7, 8) |
| 695 | // 7: 'a' |
| 696 | // 8: MATCH |
| 697 | // |
| 698 | // This is *incredibly* inefficient because the splits end |
| 699 | // up forming a chain, which has to be resolved everything a |
| 700 | // transition is followed. |
| 701 | let mut holes = vec![]; |
| 702 | let mut prev_hole = patch_concat.hole; |
| 703 | for _ in min..max { |
| 704 | self.fill_to_next(prev_hole); |
| 705 | let split = self.push_split_hole(); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 706 | let Patch { hole, entry } = match self.c(expr)? { |
| 707 | Some(p) => p, |
| 708 | None => return self.pop_split_hole(), |
| 709 | }; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 710 | prev_hole = hole; |
| 711 | if greedy { |
| 712 | holes.push(self.fill_split(split, Some(entry), None)); |
| 713 | } else { |
| 714 | holes.push(self.fill_split(split, None, Some(entry))); |
| 715 | } |
| 716 | } |
| 717 | holes.push(prev_hole); |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 718 | Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry })) |
| 719 | } |
| 720 | |
| 721 | /// Can be used as a default value for the c_* functions when the call to |
| 722 | /// c_function is followed by inserting at least one instruction that is |
| 723 | /// always executed after the ones written by the c* function. |
| 724 | fn next_inst(&self) -> Patch { |
| 725 | Patch { hole: Hole::None, entry: self.insts.len() } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 726 | } |
| 727 | |
| 728 | fn fill(&mut self, hole: Hole, goto: InstPtr) { |
| 729 | match hole { |
| 730 | Hole::None => {} |
| 731 | Hole::One(pc) => { |
| 732 | self.insts[pc].fill(goto); |
| 733 | } |
| 734 | Hole::Many(holes) => { |
| 735 | for hole in holes { |
| 736 | self.fill(hole, goto); |
| 737 | } |
| 738 | } |
| 739 | } |
| 740 | } |
| 741 | |
| 742 | fn fill_to_next(&mut self, hole: Hole) { |
| 743 | let next = self.insts.len(); |
| 744 | self.fill(hole, next); |
| 745 | } |
| 746 | |
| 747 | fn fill_split( |
| 748 | &mut self, |
| 749 | hole: Hole, |
| 750 | goto1: Option<InstPtr>, |
| 751 | goto2: Option<InstPtr>, |
| 752 | ) -> Hole { |
| 753 | match hole { |
| 754 | Hole::None => Hole::None, |
| 755 | Hole::One(pc) => match (goto1, goto2) { |
| 756 | (Some(goto1), Some(goto2)) => { |
| 757 | self.insts[pc].fill_split(goto1, goto2); |
| 758 | Hole::None |
| 759 | } |
| 760 | (Some(goto1), None) => { |
| 761 | self.insts[pc].half_fill_split_goto1(goto1); |
| 762 | Hole::One(pc) |
| 763 | } |
| 764 | (None, Some(goto2)) => { |
| 765 | self.insts[pc].half_fill_split_goto2(goto2); |
| 766 | Hole::One(pc) |
| 767 | } |
| 768 | (None, None) => unreachable!( |
| 769 | "at least one of the split \ |
| 770 | holes must be filled" |
| 771 | ), |
| 772 | }, |
| 773 | Hole::Many(holes) => { |
| 774 | let mut new_holes = vec![]; |
| 775 | for hole in holes { |
| 776 | new_holes.push(self.fill_split(hole, goto1, goto2)); |
| 777 | } |
| 778 | if new_holes.is_empty() { |
| 779 | Hole::None |
| 780 | } else if new_holes.len() == 1 { |
| 781 | new_holes.pop().unwrap() |
| 782 | } else { |
| 783 | Hole::Many(new_holes) |
| 784 | } |
| 785 | } |
| 786 | } |
| 787 | } |
| 788 | |
| 789 | fn push_compiled(&mut self, inst: Inst) { |
| 790 | self.insts.push(MaybeInst::Compiled(inst)); |
| 791 | } |
| 792 | |
| 793 | fn push_hole(&mut self, inst: InstHole) -> Hole { |
| 794 | let hole = self.insts.len(); |
| 795 | self.insts.push(MaybeInst::Uncompiled(inst)); |
| 796 | Hole::One(hole) |
| 797 | } |
| 798 | |
| 799 | fn push_split_hole(&mut self) -> Hole { |
| 800 | let hole = self.insts.len(); |
| 801 | self.insts.push(MaybeInst::Split); |
| 802 | Hole::One(hole) |
| 803 | } |
| 804 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 805 | fn pop_split_hole(&mut self) -> ResultOrEmpty { |
| 806 | self.insts.pop(); |
| 807 | Ok(None) |
| 808 | } |
| 809 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 810 | fn check_size(&self) -> result::Result<(), Error> { |
| 811 | use std::mem::size_of; |
| 812 | |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 813 | let size = |
| 814 | self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>()); |
| 815 | if size > self.size_limit { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 816 | Err(Error::CompiledTooBig(self.size_limit)) |
| 817 | } else { |
| 818 | Ok(()) |
| 819 | } |
| 820 | } |
| 821 | } |
| 822 | |
| 823 | #[derive(Debug)] |
| 824 | enum Hole { |
| 825 | None, |
| 826 | One(InstPtr), |
| 827 | Many(Vec<Hole>), |
| 828 | } |
| 829 | |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 830 | impl Hole { |
| 831 | fn dup_one(self) -> (Self, Self) { |
| 832 | match self { |
| 833 | Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), |
| 834 | Hole::None | Hole::Many(_) => { |
| 835 | unreachable!("must be called on single hole") |
| 836 | } |
| 837 | } |
| 838 | } |
| 839 | } |
| 840 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 841 | #[derive(Clone, Debug)] |
| 842 | enum MaybeInst { |
| 843 | Compiled(Inst), |
| 844 | Uncompiled(InstHole), |
| 845 | Split, |
| 846 | Split1(InstPtr), |
| 847 | Split2(InstPtr), |
| 848 | } |
| 849 | |
| 850 | impl MaybeInst { |
| 851 | fn fill(&mut self, goto: InstPtr) { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 852 | let maybeinst = match *self { |
| 853 | MaybeInst::Split => MaybeInst::Split1(goto), |
| 854 | MaybeInst::Uncompiled(ref inst) => { |
| 855 | MaybeInst::Compiled(inst.fill(goto)) |
| 856 | } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 857 | MaybeInst::Split1(goto1) => { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 858 | MaybeInst::Compiled(Inst::Split(InstSplit { |
| 859 | goto1: goto1, |
| 860 | goto2: goto, |
| 861 | })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 862 | } |
| 863 | MaybeInst::Split2(goto2) => { |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 864 | MaybeInst::Compiled(Inst::Split(InstSplit { |
| 865 | goto1: goto, |
| 866 | goto2: goto2, |
| 867 | })) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 868 | } |
| 869 | _ => unreachable!( |
| 870 | "not all instructions were compiled! \ |
| 871 | found uncompiled instruction: {:?}", |
| 872 | self |
| 873 | ), |
| 874 | }; |
Haibo Huang | 49cbe5f | 2020-05-28 20:14:24 -0700 | [diff] [blame] | 875 | *self = maybeinst; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 876 | } |
| 877 | |
| 878 | fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) { |
| 879 | let filled = match *self { |
| 880 | MaybeInst::Split => { |
| 881 | Inst::Split(InstSplit { goto1: goto1, goto2: goto2 }) |
| 882 | } |
| 883 | _ => unreachable!( |
| 884 | "must be called on Split instruction, \ |
| 885 | instead it was called on: {:?}", |
| 886 | self |
| 887 | ), |
| 888 | }; |
| 889 | *self = MaybeInst::Compiled(filled); |
| 890 | } |
| 891 | |
| 892 | fn half_fill_split_goto1(&mut self, goto1: InstPtr) { |
| 893 | let half_filled = match *self { |
| 894 | MaybeInst::Split => goto1, |
| 895 | _ => unreachable!( |
| 896 | "must be called on Split instruction, \ |
| 897 | instead it was called on: {:?}", |
| 898 | self |
| 899 | ), |
| 900 | }; |
| 901 | *self = MaybeInst::Split1(half_filled); |
| 902 | } |
| 903 | |
| 904 | fn half_fill_split_goto2(&mut self, goto2: InstPtr) { |
| 905 | let half_filled = match *self { |
| 906 | MaybeInst::Split => goto2, |
| 907 | _ => unreachable!( |
| 908 | "must be called on Split instruction, \ |
| 909 | instead it was called on: {:?}", |
| 910 | self |
| 911 | ), |
| 912 | }; |
| 913 | *self = MaybeInst::Split2(half_filled); |
| 914 | } |
| 915 | |
| 916 | fn unwrap(self) -> Inst { |
| 917 | match self { |
| 918 | MaybeInst::Compiled(inst) => inst, |
| 919 | _ => unreachable!( |
| 920 | "must be called on a compiled instruction, \ |
| 921 | instead it was called on: {:?}", |
| 922 | self |
| 923 | ), |
| 924 | } |
| 925 | } |
| 926 | } |
| 927 | |
| 928 | #[derive(Clone, Debug)] |
| 929 | enum InstHole { |
| 930 | Save { slot: usize }, |
| 931 | EmptyLook { look: EmptyLook }, |
| 932 | Char { c: char }, |
| 933 | Ranges { ranges: Vec<(char, char)> }, |
| 934 | Bytes { start: u8, end: u8 }, |
| 935 | } |
| 936 | |
| 937 | impl InstHole { |
| 938 | fn fill(&self, goto: InstPtr) -> Inst { |
| 939 | match *self { |
| 940 | InstHole::Save { slot } => { |
| 941 | Inst::Save(InstSave { goto: goto, slot: slot }) |
| 942 | } |
| 943 | InstHole::EmptyLook { look } => { |
| 944 | Inst::EmptyLook(InstEmptyLook { goto: goto, look: look }) |
| 945 | } |
| 946 | InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }), |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 947 | InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges { |
| 948 | goto: goto, |
| 949 | ranges: ranges.clone().into_boxed_slice(), |
| 950 | }), |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 951 | InstHole::Bytes { start, end } => { |
| 952 | Inst::Bytes(InstBytes { goto: goto, start: start, end: end }) |
| 953 | } |
| 954 | } |
| 955 | } |
| 956 | } |
| 957 | |
| 958 | struct CompileClass<'a, 'b> { |
| 959 | c: &'a mut Compiler, |
| 960 | ranges: &'b [hir::ClassUnicodeRange], |
| 961 | } |
| 962 | |
| 963 | impl<'a, 'b> CompileClass<'a, 'b> { |
| 964 | fn compile(mut self) -> Result { |
| 965 | let mut holes = vec![]; |
| 966 | let mut initial_entry = None; |
| 967 | let mut last_split = Hole::None; |
| 968 | let mut utf8_seqs = self.c.utf8_seqs.take().unwrap(); |
| 969 | self.c.suffix_cache.clear(); |
| 970 | |
| 971 | for (i, range) in self.ranges.iter().enumerate() { |
| 972 | let is_last_range = i + 1 == self.ranges.len(); |
| 973 | utf8_seqs.reset(range.start(), range.end()); |
| 974 | let mut it = (&mut utf8_seqs).peekable(); |
| 975 | loop { |
| 976 | let utf8_seq = match it.next() { |
| 977 | None => break, |
| 978 | Some(utf8_seq) => utf8_seq, |
| 979 | }; |
| 980 | if is_last_range && it.peek().is_none() { |
| 981 | let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; |
| 982 | holes.push(hole); |
| 983 | self.c.fill(last_split, entry); |
| 984 | last_split = Hole::None; |
| 985 | if initial_entry.is_none() { |
| 986 | initial_entry = Some(entry); |
| 987 | } |
| 988 | } else { |
| 989 | if initial_entry.is_none() { |
| 990 | initial_entry = Some(self.c.insts.len()); |
| 991 | } |
| 992 | self.c.fill_to_next(last_split); |
| 993 | last_split = self.c.push_split_hole(); |
| 994 | let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; |
| 995 | holes.push(hole); |
| 996 | last_split = |
| 997 | self.c.fill_split(last_split, Some(entry), None); |
| 998 | } |
| 999 | } |
| 1000 | } |
| 1001 | self.c.utf8_seqs = Some(utf8_seqs); |
| 1002 | Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() }) |
| 1003 | } |
| 1004 | |
| 1005 | fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result { |
| 1006 | if self.c.compiled.is_reverse { |
| 1007 | self.c_utf8_seq_(seq) |
| 1008 | } else { |
| 1009 | self.c_utf8_seq_(seq.into_iter().rev()) |
| 1010 | } |
| 1011 | } |
| 1012 | |
| 1013 | fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result |
| 1014 | where |
| 1015 | I: IntoIterator<Item = &'r Utf8Range>, |
| 1016 | { |
| 1017 | // The initial instruction for each UTF-8 sequence should be the same. |
| 1018 | let mut from_inst = ::std::usize::MAX; |
| 1019 | let mut last_hole = Hole::None; |
| 1020 | for byte_range in seq { |
| 1021 | let key = SuffixCacheKey { |
| 1022 | from_inst: from_inst, |
| 1023 | start: byte_range.start, |
| 1024 | end: byte_range.end, |
| 1025 | }; |
| 1026 | { |
| 1027 | let pc = self.c.insts.len(); |
| 1028 | if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) { |
| 1029 | from_inst = cached_pc; |
| 1030 | continue; |
| 1031 | } |
| 1032 | } |
| 1033 | self.c.byte_classes.set_range(byte_range.start, byte_range.end); |
| 1034 | if from_inst == ::std::usize::MAX { |
| 1035 | last_hole = self.c.push_hole(InstHole::Bytes { |
| 1036 | start: byte_range.start, |
| 1037 | end: byte_range.end, |
| 1038 | }); |
| 1039 | } else { |
| 1040 | self.c.push_compiled(Inst::Bytes(InstBytes { |
| 1041 | goto: from_inst, |
| 1042 | start: byte_range.start, |
| 1043 | end: byte_range.end, |
| 1044 | })); |
| 1045 | } |
| 1046 | from_inst = self.c.insts.len().checked_sub(1).unwrap(); |
| 1047 | debug_assert!(from_inst < ::std::usize::MAX); |
| 1048 | } |
| 1049 | debug_assert!(from_inst < ::std::usize::MAX); |
| 1050 | Ok(Patch { hole: last_hole, entry: from_inst }) |
| 1051 | } |
| 1052 | } |
| 1053 | |
| 1054 | /// `SuffixCache` is a simple bounded hash map for caching suffix entries in |
| 1055 | /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}. |
| 1056 | /// The set of byte ranges looks like this: |
| 1057 | /// |
| 1058 | /// [0-7F] |
| 1059 | /// [C2-DF][80-BF] |
| 1060 | /// [E0][A0-BF][80-BF] |
| 1061 | /// [E1-EC][80-BF][80-BF] |
| 1062 | /// [ED][80-9F][80-BF] |
| 1063 | /// [EE-EF][80-BF][80-BF] |
| 1064 | /// |
| 1065 | /// Each line above translates to one alternate in the compiled regex program. |
| 1066 | /// However, all but one of the alternates end in the same suffix, which is |
| 1067 | /// a waste of an instruction. The suffix cache facilitates reusing them across |
| 1068 | /// alternates. |
| 1069 | /// |
| 1070 | /// Note that a HashMap could be trivially used for this, but we don't need its |
| 1071 | /// overhead. Some small bounded space (LRU style) is more than enough. |
| 1072 | /// |
| 1073 | /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html), |
| 1074 | /// except it uses hashes as original indices and then compares full keys for |
| 1075 | /// validation against `dense` array. |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 1076 | #[derive(Debug)] |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 1077 | struct SuffixCache { |
| 1078 | sparse: Box<[usize]>, |
| 1079 | dense: Vec<SuffixCacheEntry>, |
| 1080 | } |
| 1081 | |
| 1082 | #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] |
| 1083 | struct SuffixCacheEntry { |
| 1084 | key: SuffixCacheKey, |
| 1085 | pc: InstPtr, |
| 1086 | } |
| 1087 | |
| 1088 | #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] |
| 1089 | struct SuffixCacheKey { |
| 1090 | from_inst: InstPtr, |
| 1091 | start: u8, |
| 1092 | end: u8, |
| 1093 | } |
| 1094 | |
| 1095 | impl SuffixCache { |
| 1096 | fn new(size: usize) -> Self { |
| 1097 | SuffixCache { |
| 1098 | sparse: vec![0usize; size].into(), |
| 1099 | dense: Vec::with_capacity(size), |
| 1100 | } |
| 1101 | } |
| 1102 | |
| 1103 | fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> { |
| 1104 | let hash = self.hash(&key); |
| 1105 | let pos = &mut self.sparse[hash]; |
| 1106 | if let Some(entry) = self.dense.get(*pos) { |
| 1107 | if entry.key == key { |
| 1108 | return Some(entry.pc); |
| 1109 | } |
| 1110 | } |
| 1111 | *pos = self.dense.len(); |
| 1112 | self.dense.push(SuffixCacheEntry { key: key, pc: pc }); |
| 1113 | None |
| 1114 | } |
| 1115 | |
| 1116 | fn clear(&mut self) { |
| 1117 | self.dense.clear(); |
| 1118 | } |
| 1119 | |
| 1120 | fn hash(&self, suffix: &SuffixCacheKey) -> usize { |
| 1121 | // Basic FNV-1a hash as described: |
| 1122 | // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function |
| 1123 | const FNV_PRIME: u64 = 1099511628211; |
| 1124 | let mut h = 14695981039346656037; |
| 1125 | h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME); |
| 1126 | h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME); |
| 1127 | h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME); |
| 1128 | (h as usize) % self.sparse.len() |
| 1129 | } |
| 1130 | } |
| 1131 | |
| 1132 | struct ByteClassSet([bool; 256]); |
| 1133 | |
| 1134 | impl ByteClassSet { |
| 1135 | fn new() -> Self { |
| 1136 | ByteClassSet([false; 256]) |
| 1137 | } |
| 1138 | |
| 1139 | fn set_range(&mut self, start: u8, end: u8) { |
| 1140 | debug_assert!(start <= end); |
| 1141 | if start > 0 { |
| 1142 | self.0[start as usize - 1] = true; |
| 1143 | } |
| 1144 | self.0[end as usize] = true; |
| 1145 | } |
| 1146 | |
| 1147 | fn set_word_boundary(&mut self) { |
| 1148 | // We need to mark all ranges of bytes whose pairs result in |
| 1149 | // evaluating \b differently. |
| 1150 | let iswb = is_word_byte; |
| 1151 | let mut b1: u16 = 0; |
| 1152 | let mut b2: u16; |
| 1153 | while b1 <= 255 { |
| 1154 | b2 = b1 + 1; |
| 1155 | while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { |
| 1156 | b2 += 1; |
| 1157 | } |
| 1158 | self.set_range(b1 as u8, (b2 - 1) as u8); |
| 1159 | b1 = b2; |
| 1160 | } |
| 1161 | } |
| 1162 | |
| 1163 | fn byte_classes(&self) -> Vec<u8> { |
| 1164 | // N.B. If you're debugging the DFA, it's useful to simply return |
| 1165 | // `(0..256).collect()`, which effectively removes the byte classes |
| 1166 | // and makes the transitions easier to read. |
| 1167 | // (0usize..256).map(|x| x as u8).collect() |
| 1168 | let mut byte_classes = vec![0; 256]; |
| 1169 | let mut class = 0u8; |
| 1170 | let mut i = 0; |
| 1171 | loop { |
| 1172 | byte_classes[i] = class as u8; |
| 1173 | if i >= 255 { |
| 1174 | break; |
| 1175 | } |
| 1176 | if self.0[i] { |
| 1177 | class = class.checked_add(1).unwrap(); |
| 1178 | } |
| 1179 | i += 1; |
| 1180 | } |
| 1181 | byte_classes |
| 1182 | } |
| 1183 | } |
| 1184 | |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 1185 | impl fmt::Debug for ByteClassSet { |
| 1186 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 1187 | f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish() |
| 1188 | } |
| 1189 | } |
| 1190 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 1191 | fn u32_to_usize(n: u32) -> usize { |
| 1192 | // In case usize is less than 32 bits, we need to guard against overflow. |
| 1193 | // On most platforms this compiles to nothing. |
| 1194 | // TODO Use `std::convert::TryFrom` once it's stable. |
| 1195 | if (n as u64) > (::std::usize::MAX as u64) { |
| 1196 | panic!("BUG: {} is too big to be pointer sized", n) |
| 1197 | } |
| 1198 | n as usize |
| 1199 | } |
| 1200 | |
| 1201 | #[cfg(test)] |
| 1202 | mod tests { |
| 1203 | use super::ByteClassSet; |
| 1204 | |
| 1205 | #[test] |
| 1206 | fn byte_classes() { |
| 1207 | let mut set = ByteClassSet::new(); |
| 1208 | set.set_range(b'a', b'z'); |
| 1209 | let classes = set.byte_classes(); |
| 1210 | assert_eq!(classes[0], 0); |
| 1211 | assert_eq!(classes[1], 0); |
| 1212 | assert_eq!(classes[2], 0); |
| 1213 | assert_eq!(classes[b'a' as usize - 1], 0); |
| 1214 | assert_eq!(classes[b'a' as usize], 1); |
| 1215 | assert_eq!(classes[b'm' as usize], 1); |
| 1216 | assert_eq!(classes[b'z' as usize], 1); |
| 1217 | assert_eq!(classes[b'z' as usize + 1], 2); |
| 1218 | assert_eq!(classes[254], 2); |
| 1219 | assert_eq!(classes[255], 2); |
| 1220 | |
| 1221 | let mut set = ByteClassSet::new(); |
| 1222 | set.set_range(0, 2); |
| 1223 | set.set_range(4, 6); |
| 1224 | let classes = set.byte_classes(); |
| 1225 | assert_eq!(classes[0], 0); |
| 1226 | assert_eq!(classes[1], 0); |
| 1227 | assert_eq!(classes[2], 0); |
| 1228 | assert_eq!(classes[3], 1); |
| 1229 | assert_eq!(classes[4], 2); |
| 1230 | assert_eq!(classes[5], 2); |
| 1231 | assert_eq!(classes[6], 2); |
| 1232 | assert_eq!(classes[7], 3); |
| 1233 | assert_eq!(classes[255], 3); |
| 1234 | } |
| 1235 | |
| 1236 | #[test] |
| 1237 | fn full_byte_classes() { |
| 1238 | let mut set = ByteClassSet::new(); |
| 1239 | for i in 0..256u16 { |
| 1240 | set.set_range(i as u8, i as u8); |
| 1241 | } |
| 1242 | assert_eq!(set.byte_classes().len(), 256); |
| 1243 | } |
| 1244 | } |