| 1 | // This module provides an NFA compiler using Thompson's construction |
| 2 | // algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA |
| 3 | // graph as output. The NFA graph is structured in a way that permits it to be |
| 4 | // executed by a virtual machine and also used to efficiently build a DFA. |
| 5 | // |
| 6 | // The compiler deals with a slightly expanded set of NFA states that notably |
| 7 | // includes an empty node that has exactly one epsilon transition to the next |
| 8 | // state. In other words, it's a "goto" instruction if one views Thompson's NFA |
| 9 | // as a set of bytecode instructions. These goto instructions are removed in |
| 10 | // a subsequent phase before returning the NFA to the caller. The purpose of |
| 11 | // these empty nodes is that they make the construction algorithm substantially |
| 12 | // simpler to implement. We remove them before returning to the caller because |
| 13 | // they can represent substantial overhead when traversing the NFA graph |
| 14 | // (either while searching using the NFA directly or while building a DFA). |
| 15 | // |
| 16 | // In the future, it would be nice to provide a Glushkov compiler as well, |
| 17 | // as it would work well as a bit-parallel NFA for smaller regexes. But |
| 18 | // the Thompson construction is one I'm more familiar with and seems more |
| 19 | // straight-forward to deal with when it comes to large Unicode character |
| 20 | // classes. |
| 21 | // |
| 22 | // Internally, the compiler uses interior mutability to improve composition |
| 23 | // in the face of the borrow checker. In particular, we'd really like to be |
| 24 | // able to write things like this: |
| 25 | // |
| 26 | // self.c_concat(exprs.iter().map(|e| self.c(e))) |
| 27 | // |
| 28 | // Which elegantly uses iterators to build up a sequence of compiled regex |
| 29 | // sub-expressions and then hands it off to the concatenating compiler |
| 30 | // routine. Without interior mutability, the borrow checker won't let us |
| 31 | // borrow `self` mutably both inside and outside the closure at the same |
| 32 | // time. |
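//
// A minimal, self-contained sketch of that pattern (using hypothetical
// names, not this module's actual types) may help illustrate the point:
//
//     use std::cell::RefCell;
//
//     struct Sketch {
//         states: RefCell<Vec<u32>>,
//     }
//
//     impl Sketch {
//         // Taking `&self` here (instead of `&mut self`) is what lets the
//         // closure below call `push` while `concat` consumes the iterator.
//         fn push(&self, state: u32) -> usize {
//             let mut states = self.states.borrow_mut();
//             states.push(state);
//             states.len() - 1
//         }
//
//         fn concat<I: Iterator<Item = usize>>(&self, it: I) -> Vec<usize> {
//             it.collect()
//         }
//
//         fn build(&self, inputs: &[u32]) -> Vec<usize> {
//             // With `&mut self` methods, this single line would need two
//             // overlapping mutable borrows of `self` and would not compile.
//             self.concat(inputs.iter().map(|&s| self.push(s)))
//         }
//     }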
| 33 | |
| 34 | use std::cell::RefCell; |
| 35 | use std::mem; |
| 36 | |
| 37 | use regex_syntax::hir::{self, Hir, HirKind}; |
| 38 | use regex_syntax::utf8::{Utf8Range, Utf8Sequences}; |
| 39 | |
| 40 | use classes::ByteClassSet; |
| 41 | use error::{Error, Result}; |
| 42 | use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}; |
| 43 | use nfa::range_trie::RangeTrie; |
| 44 | use nfa::{State, StateID, Transition, NFA}; |
| 45 | |
| 46 | /// Config knobs for the NFA compiler. See the builder's methods for more |
| 47 | /// docs on each one. |
| 48 | #[derive(Clone, Copy, Debug)] |
| 49 | struct Config { |
| 50 | anchored: bool, |
| 51 | allow_invalid_utf8: bool, |
| 52 | reverse: bool, |
| 53 | shrink: bool, |
| 54 | } |
| 55 | |
| 56 | impl Default for Config { |
| 57 | fn default() -> Config { |
| 58 | Config { |
| 59 | anchored: false, |
| 60 | allow_invalid_utf8: false, |
| 61 | reverse: false, |
| 62 | shrink: true, |
| 63 | } |
| 64 | } |
| 65 | } |
| 66 | |
| 67 | /// A builder for compiling an NFA. |
| 68 | #[derive(Clone, Debug)] |
| 69 | pub struct Builder { |
| 70 | config: Config, |
| 71 | } |
| 72 | |
| 73 | impl Builder { |
| 74 | /// Create a new NFA builder with its default configuration. |
| 75 | pub fn new() -> Builder { |
| 76 | Builder { config: Config::default() } |
| 77 | } |
| 78 | |
| 79 | /// Compile the given high level intermediate representation of a regular |
| 80 | /// expression into an NFA. |
| 81 | /// |
| 82 | /// If there was a problem building the NFA, then an error is returned. |
| 83 | /// For example, if the regex uses unsupported features (such as zero-width |
| 84 | /// assertions), then an error is returned. |
| 85 | pub fn build(&self, expr: &Hir) -> Result<NFA> { |
| 86 | let mut nfa = NFA::always_match(); |
| 87 | self.build_with(&mut Compiler::new(), &mut nfa, expr)?; |
| 88 | Ok(nfa) |
| 89 | } |
| 90 | |
| 91 | /// Compile the given high level intermediate representation of a regular |
| 92 | /// expression into the given NFA using the given compiler. Callers may |
| 93 | /// prefer this over `build` if they would like to reuse allocations while |
| 94 | /// compiling many regular expressions. |
| 95 | /// |
| 96 | /// On success, the given NFA is completely overwritten with the NFA |
| 97 | /// produced by the compiler. |
| 98 | /// |
| 99 | /// If there was a problem building the NFA, then an error is returned. For |
| 100 | /// example, if the regex uses unsupported features (such as zero-width |
| 101 | /// assertions), then an error is returned. When an error is returned, |
| 102 | /// the contents of `nfa` are unspecified and should not be relied upon. |
| 103 | /// However, it can still be reused in subsequent calls to this method. |
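///
/// A sketch of the intended reuse pattern (illustrative only; `exprs` is a
/// hypothetical collection of parsed `Hir` values):
///
/// ```ignore
/// let builder = Builder::new();
/// let mut compiler = Compiler::new();
/// let mut nfa = NFA::always_match();
/// for expr in &exprs {
///     builder.build_with(&mut compiler, &mut nfa, expr)?;
///     // ... use `nfa` here; the next iteration overwrites it ...
/// }
/// ```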
| 104 | pub fn build_with( |
| 105 | &self, |
| 106 | compiler: &mut Compiler, |
| 107 | nfa: &mut NFA, |
| 108 | expr: &Hir, |
| 109 | ) -> Result<()> { |
| 110 | compiler.clear(); |
| 111 | compiler.configure(self.config); |
| 112 | compiler.compile(nfa, expr) |
| 113 | } |
| 114 | |
| 115 | /// Set whether matching must be anchored at the beginning of the input. |
| 116 | /// |
| 117 | /// When enabled, a match must begin at the start of the input. When |
| 118 | /// disabled, the NFA will act as if the pattern started with a `.*?`, |
| 119 | /// which enables a match to appear anywhere. |
| 120 | /// |
| 121 | /// By default this is disabled. |
| 122 | pub fn anchored(&mut self, yes: bool) -> &mut Builder { |
| 123 | self.config.anchored = yes; |
| 124 | self |
| 125 | } |
| 126 | |
| 127 | /// When enabled, the builder will permit the construction of an NFA that |
| 128 | /// may match invalid UTF-8. |
| 129 | /// |
| 130 | /// When disabled (the default), the builder is guaranteed to produce a |
| 131 | /// regex that will only ever match valid UTF-8 (otherwise, the builder |
| 132 | /// will return an error). |
| 133 | pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder { |
| 134 | self.config.allow_invalid_utf8 = yes; |
| 135 | self |
| 136 | } |
| 137 | |
| 138 | /// Reverse the NFA. |
| 139 | /// |
| 140 | /// An NFA reversal is performed by reversing all of the concatenated |
| 141 | /// sub-expressions in the original pattern, recursively. The resulting |
| 142 | /// NFA can be used to match the pattern starting from the end of a string |
| 143 | /// instead of the beginning of a string. |
| 144 | /// |
| 145 | /// Reversing the NFA is useful for building a reverse DFA, which is most |
| 146 | /// useful for finding the start of a match. |
| 147 | pub fn reverse(&mut self, yes: bool) -> &mut Builder { |
| 148 | self.config.reverse = yes; |
| 149 | self |
| 150 | } |
| 151 | |
| 152 | /// Apply best effort heuristics to shrink the NFA at the expense of more |
| 153 | /// time/memory. |
| 154 | /// |
| 155 | /// This is enabled by default. Generally speaking, if one is using an NFA |
| 156 | /// to compile a DFA, then the extra time used to shrink the NFA will be |
| 157 | /// more than made up for during DFA construction (potentially by a lot). |
| 158 | /// In other words, enabling this can substantially decrease the overall |
| 159 | /// amount of time it takes to build a DFA. |
| 160 | /// |
| 161 | /// The only reason to disable this is if you want to compile an NFA and start |
| 162 | /// using it as quickly as possible without needing to build a DFA. |
| 163 | pub fn shrink(&mut self, yes: bool) -> &mut Builder { |
| 164 | self.config.shrink = yes; |
| 165 | self |
| 166 | } |
| 167 | } |
| 168 | |
| 169 | /// A compiler that converts a regex abstract syntax to an NFA via Thompson's |
| 170 | /// construction. Namely, this compiler permits epsilon transitions between |
| 171 | /// states. |
| 172 | /// |
| 173 | /// Users of this crate cannot use a compiler directly. Instead, all one can |
| 174 | /// do is create one and use it via the |
| 175 | /// [`Builder::build_with`](struct.Builder.html#method.build_with) |
| 176 | /// method. This permits callers to reuse compilers in order to amortize |
| 177 | /// allocations. |
| 178 | #[derive(Clone, Debug)] |
| 179 | pub struct Compiler { |
| 180 | /// The set of compiled NFA states. Once a state is compiled, it is |
| 181 | /// assigned a state ID equivalent to its index in this list. Subsequent |
| 182 | /// compilation can modify previous states by adding new transitions. |
| 183 | states: RefCell<Vec<CState>>, |
| 184 | /// The configuration from the builder. |
| 185 | config: Config, |
| 186 | /// State used for compiling character classes to UTF-8 byte automata. |
| 187 | /// State is not retained between character class compilations. This just |
| 188 | /// serves to amortize allocation to the extent possible. |
| 189 | utf8_state: RefCell<Utf8State>, |
| 190 | /// State used for arranging character classes in reverse into a trie. |
| 191 | trie_state: RefCell<RangeTrie>, |
| 192 | /// State used for caching common suffixes when compiling reverse UTF-8 |
| 193 | /// automata (for Unicode character classes). |
| 194 | utf8_suffix: RefCell<Utf8SuffixMap>, |
| 195 | /// A map used to re-map state IDs when translating the compiler's internal |
| 196 | /// NFA state representation to the external NFA representation. |
| 197 | remap: RefCell<Vec<StateID>>, |
| 198 | /// A set of compiler internal state IDs that correspond to states that are |
| 199 | /// exclusively epsilon transitions, i.e., goto instructions, combined with |
| 200 | /// the state that they point to. This is used to record said states while |
| 201 | /// transforming the compiler's internal NFA representation to the external |
| 202 | /// form. |
| 203 | empties: RefCell<Vec<(StateID, StateID)>>, |
| 204 | } |
| 205 | |
| 206 | /// A compiler intermediate state representation for an NFA that is only used |
| 207 | /// during compilation. Once compilation is done, `CState`s are converted to |
| 208 | /// `State`s, which have a much simpler representation. |
| 209 | #[derive(Clone, Debug, Eq, PartialEq)] |
| 210 | enum CState { |
| 211 | /// An empty state whose only purpose is to forward the automaton to |
| 212 | /// another state via an epsilon transition. These are useful during |
| 213 | /// compilation but are otherwise removed at the end. |
| 214 | Empty { next: StateID }, |
| 215 | /// A state that only transitions to `next` if the current input byte is |
| 216 | /// in the range `[start, end]` (inclusive on both ends). |
| 217 | Range { range: Transition }, |
| 218 | /// A state with possibly many transitions, represented in a sparse |
| 219 | /// fashion. Transitions are ordered lexicographically by input range. |
| 220 | /// As such, this may only be used when every transition has equal |
| 221 | /// priority. (In practice, this is only used for encoding large UTF-8 |
| 222 | /// automata.) |
| 223 | Sparse { ranges: Vec<Transition> }, |
| 224 | /// An alternation such that there exists an epsilon transition to all |
| 225 | /// states in `alternates`, where matches found via earlier transitions |
| 226 | /// are preferred over later transitions. |
| 227 | Union { alternates: Vec<StateID> }, |
| 228 | /// An alternation such that there exists an epsilon transition to all |
| 229 | /// states in `alternates`, where matches found via later transitions |
| 230 | /// are preferred over earlier transitions. |
| 231 | /// |
| 232 | /// This "reverse" state exists for convenience during compilation that |
| 233 | /// permits easy construction of non-greedy combinations of NFA states. |
| 234 | /// At the end of compilation, Union and UnionReverse states are merged |
| 235 | /// into one Union type of state, where the latter has its epsilon |
| 236 | /// transitions reversed to reflect the priority inversion. |
| 237 | UnionReverse { alternates: Vec<StateID> }, |
| 238 | /// A match state. There is exactly one such occurrence of this state in |
| 239 | /// an NFA. |
| 240 | Match, |
| 241 | } |
| 242 | |
| 243 | /// A value that represents the result of compiling a sub-expression of a |
| 244 | /// regex's HIR. Specifically, this represents a sub-graph of the NFA that |
| 245 | /// has an initial state at `start` and a final state at `end`. |
| 246 | #[derive(Clone, Copy, Debug)] |
| 247 | pub struct ThompsonRef { |
| 248 | start: StateID, |
| 249 | end: StateID, |
| 250 | } |
| 251 | |
| 252 | impl Compiler { |
| 253 | /// Create a new compiler. |
| 254 | pub fn new() -> Compiler { |
| 255 | Compiler { |
| 256 | states: RefCell::new(vec![]), |
| 257 | config: Config::default(), |
| 258 | utf8_state: RefCell::new(Utf8State::new()), |
| 259 | trie_state: RefCell::new(RangeTrie::new()), |
| 260 | utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), |
| 261 | remap: RefCell::new(vec![]), |
| 262 | empties: RefCell::new(vec![]), |
| 263 | } |
| 264 | } |
| 265 | |
| 266 | /// Clear any memory used by this compiler such that it is ready to compile |
| 267 | /// a new regex. |
| 268 | /// |
| 269 | /// It is preferable to reuse a compiler if possible in order to reuse |
| 270 | /// allocations. |
| 271 | fn clear(&self) { |
| 272 | self.states.borrow_mut().clear(); |
| 273 | // We don't need to clear anything else since they are cleared on |
| 274 | // their own and only when they are used. |
| 275 | } |
| 276 | |
| 277 | /// Configure this compiler from the builder's knobs. |
| 278 | /// |
| 279 | /// The compiler is always reconfigured by the builder before using it to |
| 280 | /// build an NFA. |
| 281 | fn configure(&mut self, config: Config) { |
| 282 | self.config = config; |
| 283 | } |
| 284 | |
| 285 | /// Compile the given expression and populate the given NFA with the result. |
| 286 | fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> { |
| 287 | nfa.anchored = self.config.anchored; |
| 288 | |
| 289 | let mut start = self.add_empty(); |
| 290 | if !nfa.anchored { |
| 291 | let compiled = if self.config.allow_invalid_utf8 { |
| 292 | self.c_unanchored_prefix_invalid_utf8()? |
| 293 | } else { |
| 294 | self.c_unanchored_prefix_valid_utf8()? |
| 295 | }; |
| 296 | self.patch(start, compiled.start); |
| 297 | start = compiled.end; |
| 298 | } |
| 299 | let compiled = self.c(&expr)?; |
| 300 | let match_id = self.add_match(); |
| 301 | self.patch(start, compiled.start); |
| 302 | self.patch(compiled.end, match_id); |
| 303 | self.finish(nfa); |
| 304 | Ok(()) |
| 305 | } |
| 306 | |
| 307 | /// Finishes the compilation process and populates the provided NFA with |
| 308 | /// the final graph. |
| 309 | fn finish(&self, nfa: &mut NFA) { |
| 310 | let mut bstates = self.states.borrow_mut(); |
| 311 | let mut remap = self.remap.borrow_mut(); |
| 312 | remap.resize(bstates.len(), 0); |
| 313 | let mut empties = self.empties.borrow_mut(); |
| 314 | empties.clear(); |
| 315 | |
| 316 | // We don't reuse allocations here because this is what we're |
| 317 | // returning. |
| 318 | nfa.states.clear(); |
| 319 | let mut byteset = ByteClassSet::new(); |
| 320 | |
| 321 | // The idea here is to convert our intermediate states to their final |
| 322 | // form. The only real complexity here is the process of converting |
| 323 | // transitions, which are expressed in terms of state IDs. The new |
| 324 | // set of states will be smaller because of partial epsilon removal, |
| 325 | // so the state IDs will not be the same. |
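//
// As a concrete example, the anchored pattern `a` reaches this point with
// three intermediate states: [Empty { next: 1 }, Range { 'a' => 2 }, Match].
// The loop below assigns remap[1] = 0 and remap[2] = 1, the pass over
// `empties` then sets remap[0] = remap[1] = 0, and after re-mapping the
// transitions the final NFA is [Range { 'a' => 1 }, Match] with start = 0
// (exactly what the `compile_literal` test below expects).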
| 326 | for (id, bstate) in bstates.iter_mut().enumerate() { |
| 327 | match *bstate { |
| 328 | CState::Empty { next } => { |
| 329 | // Since we're removing empty states, we need to handle |
| 330 | // them later since we don't yet know which new state this |
| 331 | // empty state will be mapped to. |
| 332 | empties.push((id, next)); |
| 333 | } |
| 334 | CState::Range { ref range } => { |
| 335 | remap[id] = nfa.states.len(); |
| 336 | byteset.set_range(range.start, range.end); |
| 337 | nfa.states.push(State::Range { range: range.clone() }); |
| 338 | } |
| 339 | CState::Sparse { ref mut ranges } => { |
| 340 | remap[id] = nfa.states.len(); |
| 341 | |
| 342 | let ranges = mem::replace(ranges, vec![]); |
| 343 | for r in &ranges { |
| 344 | byteset.set_range(r.start, r.end); |
| 345 | } |
| 346 | nfa.states.push(State::Sparse { |
| 347 | ranges: ranges.into_boxed_slice(), |
| 348 | }); |
| 349 | } |
| 350 | CState::Union { ref mut alternates } => { |
| 351 | remap[id] = nfa.states.len(); |
| 352 | |
| 353 | let alternates = mem::replace(alternates, vec![]); |
| 354 | nfa.states.push(State::Union { |
| 355 | alternates: alternates.into_boxed_slice(), |
| 356 | }); |
| 357 | } |
| 358 | CState::UnionReverse { ref mut alternates } => { |
| 359 | remap[id] = nfa.states.len(); |
| 360 | |
| 361 | let mut alternates = mem::replace(alternates, vec![]); |
| 362 | alternates.reverse(); |
| 363 | nfa.states.push(State::Union { |
| 364 | alternates: alternates.into_boxed_slice(), |
| 365 | }); |
| 366 | } |
| 367 | CState::Match => { |
| 368 | remap[id] = nfa.states.len(); |
| 369 | nfa.states.push(State::Match); |
| 370 | } |
| 371 | } |
| 372 | } |
| 373 | for &(empty_id, mut empty_next) in empties.iter() { |
| 374 | // empty states can point to other empty states, forming a chain. |
| 375 | // So we must follow the chain until the end, which must end at |
| 376 | // a non-empty state, and therefore, a state that is correctly |
| 377 | // remapped. We are guaranteed to terminate because our compiler |
| 378 | // never builds a loop among empty states. |
| 379 | while let CState::Empty { next } = bstates[empty_next] { |
| 380 | empty_next = next; |
| 381 | } |
| 382 | remap[empty_id] = remap[empty_next]; |
| 383 | } |
| 384 | for state in &mut nfa.states { |
| 385 | state.remap(&remap); |
| 386 | } |
| 387 | // The compiler always begins the NFA at the first state. |
| 388 | nfa.start = remap[0]; |
| 389 | nfa.byte_classes = byteset.byte_classes(); |
| 390 | } |
| 391 | |
| 392 | fn c(&self, expr: &Hir) -> Result<ThompsonRef> { |
| 393 | match *expr.kind() { |
| 394 | HirKind::Empty => { |
| 395 | let id = self.add_empty(); |
| 396 | Ok(ThompsonRef { start: id, end: id }) |
| 397 | } |
| 398 | HirKind::Literal(hir::Literal::Unicode(ch)) => { |
| 399 | let mut buf = [0; 4]; |
| 400 | let it = ch |
| 401 | .encode_utf8(&mut buf) |
| 402 | .as_bytes() |
| 403 | .iter() |
| 404 | .map(|&b| Ok(self.c_range(b, b))); |
| 405 | self.c_concat(it) |
| 406 | } |
| 407 | HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)), |
| 408 | HirKind::Class(hir::Class::Bytes(ref cls)) => { |
| 409 | self.c_byte_class(cls) |
| 410 | } |
| 411 | HirKind::Class(hir::Class::Unicode(ref cls)) => { |
| 412 | self.c_unicode_class(cls) |
| 413 | } |
| 414 | HirKind::Repetition(ref rep) => self.c_repetition(rep), |
| 415 | HirKind::Group(ref group) => self.c(&*group.hir), |
| 416 | HirKind::Concat(ref exprs) => { |
| 417 | self.c_concat(exprs.iter().map(|e| self.c(e))) |
| 418 | } |
| 419 | HirKind::Alternation(ref exprs) => { |
| 420 | self.c_alternation(exprs.iter().map(|e| self.c(e))) |
| 421 | } |
| 422 | HirKind::Anchor(_) => Err(Error::unsupported_anchor()), |
| 423 | HirKind::WordBoundary(_) => Err(Error::unsupported_word()), |
| 424 | } |
| 425 | } |
| 426 | |
| 427 | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef> |
| 428 | where |
| 429 | I: DoubleEndedIterator<Item = Result<ThompsonRef>>, |
| 430 | { |
| 431 | let first = |
| 432 | if self.config.reverse { it.next_back() } else { it.next() }; |
| 433 | let ThompsonRef { start, mut end } = match first { |
| 434 | Some(result) => result?, |
| 435 | None => return Ok(self.c_empty()), |
| 436 | }; |
| 437 | loop { |
| 438 | let next = |
| 439 | if self.config.reverse { it.next_back() } else { it.next() }; |
| 440 | let compiled = match next { |
| 441 | Some(result) => result?, |
| 442 | None => break, |
| 443 | }; |
| 444 | self.patch(end, compiled.start); |
| 445 | end = compiled.end; |
| 446 | } |
| 447 | Ok(ThompsonRef { start, end }) |
| 448 | } |
| 449 | |
| 450 | fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef> |
| 451 | where |
| 452 | I: Iterator<Item = Result<ThompsonRef>>, |
| 453 | { |
| 454 | let first = it.next().expect("alternations must be non-empty")?; |
| 455 | let second = match it.next() { |
| 456 | None => return Ok(first), |
| 457 | Some(result) => result?, |
| 458 | }; |
| 459 | |
| 460 | let union = self.add_union(); |
| 461 | let end = self.add_empty(); |
| 462 | self.patch(union, first.start); |
| 463 | self.patch(first.end, end); |
| 464 | self.patch(union, second.start); |
| 465 | self.patch(second.end, end); |
| 466 | for result in it { |
| 467 | let compiled = result?; |
| 468 | self.patch(union, compiled.start); |
| 469 | self.patch(compiled.end, end); |
| 470 | } |
| 471 | Ok(ThompsonRef { start: union, end }) |
| 472 | } |
| 473 | |
| 474 | fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> { |
| 475 | match rep.kind { |
| 476 | hir::RepetitionKind::ZeroOrOne => { |
| 477 | self.c_zero_or_one(&rep.hir, rep.greedy) |
| 478 | } |
| 479 | hir::RepetitionKind::ZeroOrMore => { |
| 480 | self.c_at_least(&rep.hir, rep.greedy, 0) |
| 481 | } |
| 482 | hir::RepetitionKind::OneOrMore => { |
| 483 | self.c_at_least(&rep.hir, rep.greedy, 1) |
| 484 | } |
| 485 | hir::RepetitionKind::Range(ref rng) => match *rng { |
| 486 | hir::RepetitionRange::Exactly(count) => { |
| 487 | self.c_exactly(&rep.hir, count) |
| 488 | } |
| 489 | hir::RepetitionRange::AtLeast(m) => { |
| 490 | self.c_at_least(&rep.hir, rep.greedy, m) |
| 491 | } |
| 492 | hir::RepetitionRange::Bounded(min, max) => { |
| 493 | self.c_bounded(&rep.hir, rep.greedy, min, max) |
| 494 | } |
| 495 | }, |
| 496 | } |
| 497 | } |
| 498 | |
| 499 | fn c_bounded( |
| 500 | &self, |
| 501 | expr: &Hir, |
| 502 | greedy: bool, |
| 503 | min: u32, |
| 504 | max: u32, |
| 505 | ) -> Result<ThompsonRef> { |
| 506 | let prefix = self.c_exactly(expr, min)?; |
| 507 | if min == max { |
| 508 | return Ok(prefix); |
| 509 | } |
| 510 | |
| 511 | // It is tempting to compile the rest here as a concatenation |
| 512 | // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it |
| 513 | // were `aaa?a?a?`. The problem here is that it leads to this program: |
| 514 | // |
| 515 | // >000000: 61 => 01 |
| 516 | // 000001: 61 => 02 |
| 517 | // 000002: alt(03, 04) |
| 518 | // 000003: 61 => 04 |
| 519 | // 000004: alt(05, 06) |
| 520 | // 000005: 61 => 06 |
| 521 | // 000006: alt(07, 08) |
| 522 | // 000007: 61 => 08 |
| 523 | // 000008: MATCH |
| 524 | // |
| 525 | // And effectively, once you hit state 2, the epsilon closure will |
| 526 | // include states 3, 4, 5, 6, 7 and 8, which is quite a bit. It is |
| 527 | // better to instead compile it like so: |
| 528 | // |
| 529 | // >000000: 61 => 01 |
| 530 | // 000001: 61 => 02 |
| 531 | // 000002: alt(03, 08) |
| 532 | // 000003: 61 => 04 |
| 533 | // 000004: alt(05, 08) |
| 534 | // 000005: 61 => 06 |
| 535 | // 000006: alt(07, 08) |
| 536 | // 000007: 61 => 08 |
| 537 | // 000008: MATCH |
| 538 | // |
| 539 | // So that the epsilon closure of state 2 is now just 3 and 8. |
| 540 | let empty = self.add_empty(); |
| 541 | let mut prev_end = prefix.end; |
| 542 | for _ in min..max { |
| 543 | let union = if greedy { |
| 544 | self.add_union() |
| 545 | } else { |
| 546 | self.add_reverse_union() |
| 547 | }; |
| 548 | let compiled = self.c(expr)?; |
| 549 | self.patch(prev_end, union); |
| 550 | self.patch(union, compiled.start); |
| 551 | self.patch(union, empty); |
| 552 | prev_end = compiled.end; |
| 553 | } |
| 554 | self.patch(prev_end, empty); |
| 555 | Ok(ThompsonRef { start: prefix.start, end: empty }) |
| 556 | } |
| 557 | |
| 558 | fn c_at_least( |
| 559 | &self, |
| 560 | expr: &Hir, |
| 561 | greedy: bool, |
| 562 | n: u32, |
| 563 | ) -> Result<ThompsonRef> { |
| 564 | if n == 0 { |
| 565 | let union = if greedy { |
| 566 | self.add_union() |
| 567 | } else { |
| 568 | self.add_reverse_union() |
| 569 | }; |
| 570 | let compiled = self.c(expr)?; |
| 571 | self.patch(union, compiled.start); |
| 572 | self.patch(compiled.end, union); |
| 573 | Ok(ThompsonRef { start: union, end: union }) |
| 574 | } else if n == 1 { |
| 575 | let compiled = self.c(expr)?; |
| 576 | let union = if greedy { |
| 577 | self.add_union() |
| 578 | } else { |
| 579 | self.add_reverse_union() |
| 580 | }; |
| 581 | self.patch(compiled.end, union); |
| 582 | self.patch(union, compiled.start); |
| 583 | Ok(ThompsonRef { start: compiled.start, end: union }) |
| 584 | } else { |
| 585 | let prefix = self.c_exactly(expr, n - 1)?; |
| 586 | let last = self.c(expr)?; |
| 587 | let union = if greedy { |
| 588 | self.add_union() |
| 589 | } else { |
| 590 | self.add_reverse_union() |
| 591 | }; |
| 592 | self.patch(prefix.end, last.start); |
| 593 | self.patch(last.end, union); |
| 594 | self.patch(union, last.start); |
| 595 | Ok(ThompsonRef { start: prefix.start, end: union }) |
| 596 | } |
| 597 | } |
| 598 | |
| 599 | fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> { |
| 600 | let union = |
| 601 | if greedy { self.add_union() } else { self.add_reverse_union() }; |
| 602 | let compiled = self.c(expr)?; |
| 603 | let empty = self.add_empty(); |
| 604 | self.patch(union, compiled.start); |
| 605 | self.patch(union, empty); |
| 606 | self.patch(compiled.end, empty); |
| 607 | Ok(ThompsonRef { start: union, end: empty }) |
| 608 | } |
| 609 | |
| 610 | fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> { |
| 611 | let it = (0..n).map(|_| self.c(expr)); |
| 612 | self.c_concat(it) |
| 613 | } |
| 614 | |
| 615 | fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> { |
| 616 | let end = self.add_empty(); |
| 617 | let mut trans = Vec::with_capacity(cls.ranges().len()); |
| 618 | for r in cls.iter() { |
| 619 | trans.push(Transition { |
| 620 | start: r.start(), |
| 621 | end: r.end(), |
| 622 | next: end, |
| 623 | }); |
| 624 | } |
| 625 | Ok(ThompsonRef { start: self.add_sparse(trans), end }) |
| 626 | } |
| 627 | |
| 628 | fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> { |
| 629 | // If all we have are ASCII ranges wrapped in a Unicode package, then |
| 630 | // there is zero reason to bring out the big guns. We can fit all ASCII |
| 631 | // ranges within a single sparse state. |
| 632 | if cls.is_all_ascii() { |
| 633 | let end = self.add_empty(); |
| 634 | let mut trans = Vec::with_capacity(cls.ranges().len()); |
| 635 | for r in cls.iter() { |
| 636 | assert!(r.start() <= '\x7F'); |
| 637 | assert!(r.end() <= '\x7F'); |
| 638 | trans.push(Transition { |
| 639 | start: r.start() as u8, |
| 640 | end: r.end() as u8, |
| 641 | next: end, |
| 642 | }); |
| 643 | } |
| 644 | Ok(ThompsonRef { start: self.add_sparse(trans), end }) |
| 645 | } else if self.config.reverse { |
| 646 | if !self.config.shrink { |
| 647 | // When we don't want to spend the extra time shrinking, we |
| 648 | // compile the UTF-8 automaton in reverse using something like |
| 649 | // the "naive" approach, but will attempt to re-use common |
| 650 | // suffixes. |
| 651 | self.c_unicode_class_reverse_with_suffix(cls) |
| 652 | } else { |
| 653 | // When we want to shrink our NFA for reverse UTF-8 automata, |
| 654 | // we cannot feed UTF-8 sequences directly to the UTF-8 |
| 655 | // compiler, since the UTF-8 compiler requires all sequences |
| 656 | // to be lexicographically sorted. Instead, we organize our |
| 657 | // sequences into a range trie, which can then output our |
| 658 | // sequences in the correct order. Unfortunately, building the |
| 659 | // range trie is fairly expensive (but not nearly as expensive |
| 660 | // as building a DFA). Hence the reason why the 'shrink' option |
| 661 | // exists, so that this path can be toggled off. |
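//
// As a small illustration: the class [α-δ] (U+03B1..U+03B4) yields the
// single UTF-8 sequence [CE-CE][B1-B4]; after `seq.reverse()` below it
// becomes [B1-B4][CE-CE], and it is that reversed form which gets
// inserted into the trie.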
| 662 | let mut trie = self.trie_state.borrow_mut(); |
| 663 | trie.clear(); |
| 664 | |
| 665 | for rng in cls.iter() { |
| 666 | for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { |
| 667 | seq.reverse(); |
| 668 | trie.insert(seq.as_slice()); |
| 669 | } |
| 670 | } |
| 671 | let mut utf8_state = self.utf8_state.borrow_mut(); |
| 672 | let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); |
| 673 | trie.iter(|seq| { |
| 674 | utf8c.add(&seq); |
| 675 | }); |
| 676 | Ok(utf8c.finish()) |
| 677 | } |
| 678 | } else { |
| 679 | // In the forward direction, we always shrink our UTF-8 automata |
| 680 | // because we can stream it right into the UTF-8 compiler. There |
| 681 | // is almost no downside (in either memory or time) to using this |
| 682 | // approach. |
| 683 | let mut utf8_state = self.utf8_state.borrow_mut(); |
| 684 | let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); |
| 685 | for rng in cls.iter() { |
| 686 | for seq in Utf8Sequences::new(rng.start(), rng.end()) { |
| 687 | utf8c.add(seq.as_slice()); |
| 688 | } |
| 689 | } |
| 690 | Ok(utf8c.finish()) |
| 691 | } |
| 692 | |
| 693 | // For reference, the code below is the "naive" version of compiling a |
| 694 | // UTF-8 automaton. It is deliciously simple (and works for both the |
| 695 | // forward and reverse cases), but will unfortunately produce very |
| 696 | // large NFAs. When compiling a forward automaton, the size difference |
| 697 | // can sometimes be an order of magnitude. For example, the '\w' regex |
| 698 | // will generate about ~3000 NFA states using the naive approach below, |
| 699 | // but only 283 states when using the approach above. This is because |
| 700 | // the approach above actually compiles a *minimal* (or near minimal, |
| 701 | // because of the bounded hashmap) UTF-8 automaton. |
| 702 | // |
| 703 | // The code below is kept as a reference point in order to make it |
| 704 | // easier to understand the higher level goal here. |
| 705 | /* |
| 706 | let it = cls |
| 707 | .iter() |
| 708 | .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) |
| 709 | .map(|seq| { |
| 710 | let it = seq |
| 711 | .as_slice() |
| 712 | .iter() |
| 713 | .map(|rng| Ok(self.c_range(rng.start, rng.end))); |
| 714 | self.c_concat(it) |
| 715 | }); |
| 716 | self.c_alternation(it); |
| 717 | */ |
| 718 | } |
| 719 | |
| 720 | fn c_unicode_class_reverse_with_suffix( |
| 721 | &self, |
| 722 | cls: &hir::ClassUnicode, |
| 723 | ) -> Result<ThompsonRef> { |
| 724 | // N.B. It would likely be better to cache common *prefixes* in the |
| 725 | // reverse direction, but it's not quite clear how to do that. The |
| 726 | // advantage of caching suffixes is that it does give us a win, and |
| 727 | // has a very small additional overhead. |
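//
// As an illustration of the win: for a class like [α-βε-ζ], every scalar
// value involved encodes as 0xCE followed by one trailing byte, so each
// reversed path ends with the same `CE => alt_end` transition. The cache
// below collapses those identical tails into a single state instead of
// compiling one per sequence.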
| 728 | let mut cache = self.utf8_suffix.borrow_mut(); |
| 729 | cache.clear(); |
| 730 | |
| 731 | let union = self.add_union(); |
| 732 | let alt_end = self.add_empty(); |
| 733 | for urng in cls.iter() { |
| 734 | for seq in Utf8Sequences::new(urng.start(), urng.end()) { |
| 735 | let mut end = alt_end; |
| 736 | for brng in seq.as_slice() { |
| 737 | let key = Utf8SuffixKey { |
| 738 | from: end, |
| 739 | start: brng.start, |
| 740 | end: brng.end, |
| 741 | }; |
| 742 | let hash = cache.hash(&key); |
| 743 | if let Some(id) = cache.get(&key, hash) { |
| 744 | end = id; |
| 745 | continue; |
| 746 | } |
| 747 | |
| 748 | let compiled = self.c_range(brng.start, brng.end); |
| 749 | self.patch(compiled.end, end); |
| 750 | end = compiled.start; |
| 751 | cache.set(key, hash, end); |
| 752 | } |
| 753 | self.patch(union, end); |
| 754 | } |
| 755 | } |
| 756 | Ok(ThompsonRef { start: union, end: alt_end }) |
| 757 | } |
| 758 | |
| 759 | fn c_range(&self, start: u8, end: u8) -> ThompsonRef { |
| 760 | let id = self.add_range(start, end); |
| 761 | ThompsonRef { start: id, end: id } |
| 762 | } |
| 763 | |
| 764 | fn c_empty(&self) -> ThompsonRef { |
| 765 | let id = self.add_empty(); |
| 766 | ThompsonRef { start: id, end: id } |
| 767 | } |
| 768 | |
| 769 | fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> { |
| 770 | self.c(&Hir::repetition(hir::Repetition { |
| 771 | kind: hir::RepetitionKind::ZeroOrMore, |
| 772 | greedy: false, |
| 773 | hir: Box::new(Hir::any(false)), |
| 774 | })) |
| 775 | } |
| 776 | |
| 777 | fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> { |
| 778 | self.c(&Hir::repetition(hir::Repetition { |
| 779 | kind: hir::RepetitionKind::ZeroOrMore, |
| 780 | greedy: false, |
| 781 | hir: Box::new(Hir::any(true)), |
| 782 | })) |
| 783 | } |
| 784 | |
| 785 | fn patch(&self, from: StateID, to: StateID) { |
| 786 | match self.states.borrow_mut()[from] { |
| 787 | CState::Empty { ref mut next } => { |
| 788 | *next = to; |
| 789 | } |
| 790 | CState::Range { ref mut range } => { |
| 791 | range.next = to; |
| 792 | } |
| 793 | CState::Sparse { .. } => { |
| 794 | panic!("cannot patch from a sparse NFA state") |
| 795 | } |
| 796 | CState::Union { ref mut alternates } => { |
| 797 | alternates.push(to); |
| 798 | } |
| 799 | CState::UnionReverse { ref mut alternates } => { |
| 800 | alternates.push(to); |
| 801 | } |
| 802 | CState::Match => {} |
| 803 | } |
| 804 | } |
| 805 | |
| 806 | fn add_empty(&self) -> StateID { |
| 807 | let id = self.states.borrow().len(); |
| 808 | self.states.borrow_mut().push(CState::Empty { next: 0 }); |
| 809 | id |
| 810 | } |
| 811 | |
| 812 | fn add_range(&self, start: u8, end: u8) -> StateID { |
| 813 | let id = self.states.borrow().len(); |
| 814 | let trans = Transition { start, end, next: 0 }; |
| 815 | let state = CState::Range { range: trans }; |
| 816 | self.states.borrow_mut().push(state); |
| 817 | id |
| 818 | } |
| 819 | |
| 820 | fn add_sparse(&self, ranges: Vec<Transition>) -> StateID { |
| 821 | if ranges.len() == 1 { |
| 822 | let id = self.states.borrow().len(); |
| 823 | let state = CState::Range { range: ranges[0] }; |
| 824 | self.states.borrow_mut().push(state); |
| 825 | return id; |
| 826 | } |
| 827 | let id = self.states.borrow().len(); |
| 828 | let state = CState::Sparse { ranges }; |
| 829 | self.states.borrow_mut().push(state); |
| 830 | id |
| 831 | } |
| 832 | |
| 833 | fn add_union(&self) -> StateID { |
| 834 | let id = self.states.borrow().len(); |
| 835 | let state = CState::Union { alternates: vec![] }; |
| 836 | self.states.borrow_mut().push(state); |
| 837 | id |
| 838 | } |
| 839 | |
| 840 | fn add_reverse_union(&self) -> StateID { |
| 841 | let id = self.states.borrow().len(); |
| 842 | let state = CState::UnionReverse { alternates: vec![] }; |
| 843 | self.states.borrow_mut().push(state); |
| 844 | id |
| 845 | } |
| 846 | |
| 847 | fn add_match(&self) -> StateID { |
| 848 | let id = self.states.borrow().len(); |
| 849 | self.states.borrow_mut().push(CState::Match); |
| 850 | id |
| 851 | } |
| 852 | } |
| 853 | |
| 854 | #[derive(Debug)] |
| 855 | struct Utf8Compiler<'a> { |
| 856 | nfac: &'a Compiler, |
| 857 | state: &'a mut Utf8State, |
| 858 | target: StateID, |
| 859 | } |
| 860 | |
| 861 | #[derive(Clone, Debug)] |
| 862 | struct Utf8State { |
| 863 | compiled: Utf8BoundedMap, |
| 864 | uncompiled: Vec<Utf8Node>, |
| 865 | } |
| 866 | |
| 867 | #[derive(Clone, Debug)] |
| 868 | struct Utf8Node { |
| 869 | trans: Vec<Transition>, |
| 870 | last: Option<Utf8LastTransition>, |
| 871 | } |
| 872 | |
| 873 | #[derive(Clone, Debug)] |
| 874 | struct Utf8LastTransition { |
| 875 | start: u8, |
| 876 | end: u8, |
| 877 | } |
| 878 | |
| 879 | impl Utf8State { |
| 880 | fn new() -> Utf8State { |
| 881 | Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] } |
| 882 | } |
| 883 | |
| 884 | fn clear(&mut self) { |
| 885 | self.compiled.clear(); |
| 886 | self.uncompiled.clear(); |
| 887 | } |
| 888 | } |
| 889 | |
| 890 | impl<'a> Utf8Compiler<'a> { |
| 891 | fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> { |
| 892 | let target = nfac.add_empty(); |
| 893 | state.clear(); |
| 894 | let mut utf8c = Utf8Compiler { nfac, state, target }; |
| 895 | utf8c.add_empty(); |
| 896 | utf8c |
| 897 | } |
| 898 | |
| 899 | fn finish(&mut self) -> ThompsonRef { |
| 900 | self.compile_from(0); |
| 901 | let node = self.pop_root(); |
| 902 | let start = self.compile(node); |
| 903 | ThompsonRef { start, end: self.target } |
| 904 | } |
| 905 | |
| 906 | fn add(&mut self, ranges: &[Utf8Range]) { |
| 907 | let prefix_len = ranges |
| 908 | .iter() |
| 909 | .zip(&self.state.uncompiled) |
| 910 | .take_while(|&(range, node)| { |
| 911 | node.last.as_ref().map_or(false, |t| { |
| 912 | (t.start, t.end) == (range.start, range.end) |
| 913 | }) |
| 914 | }) |
| 915 | .count(); |
| 916 | assert!(prefix_len < ranges.len()); |
| 917 | self.compile_from(prefix_len); |
| 918 | self.add_suffix(&ranges[prefix_len..]); |
| 919 | } |
| 920 | |
| 921 | fn compile_from(&mut self, from: usize) { |
| 922 | let mut next = self.target; |
| 923 | while from + 1 < self.state.uncompiled.len() { |
| 924 | let node = self.pop_freeze(next); |
| 925 | next = self.compile(node); |
| 926 | } |
| 927 | self.top_last_freeze(next); |
| 928 | } |
| 929 | |
| 930 | fn compile(&mut self, node: Vec<Transition>) -> StateID { |
| 931 | let hash = self.state.compiled.hash(&node); |
| 932 | if let Some(id) = self.state.compiled.get(&node, hash) { |
| 933 | return id; |
| 934 | } |
| 935 | let id = self.nfac.add_sparse(node.clone()); |
| 936 | self.state.compiled.set(node, hash, id); |
| 937 | id |
| 938 | } |
| 939 | |
| 940 | fn add_suffix(&mut self, ranges: &[Utf8Range]) { |
| 941 | assert!(!ranges.is_empty()); |
| 942 | let last = self |
| 943 | .state |
| 944 | .uncompiled |
| 945 | .len() |
| 946 | .checked_sub(1) |
| 947 | .expect("non-empty nodes"); |
| 948 | assert!(self.state.uncompiled[last].last.is_none()); |
| 949 | self.state.uncompiled[last].last = Some(Utf8LastTransition { |
| 950 | start: ranges[0].start, |
| 951 | end: ranges[0].end, |
| 952 | }); |
| 953 | for r in &ranges[1..] { |
| 954 | self.state.uncompiled.push(Utf8Node { |
| 955 | trans: vec![], |
| 956 | last: Some(Utf8LastTransition { start: r.start, end: r.end }), |
| 957 | }); |
| 958 | } |
| 959 | } |
| 960 | |
| 961 | fn add_empty(&mut self) { |
| 962 | self.state.uncompiled.push(Utf8Node { trans: vec![], last: None }); |
| 963 | } |
| 964 | |
| 965 | fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> { |
| 966 | let mut uncompiled = self.state.uncompiled.pop().unwrap(); |
| 967 | uncompiled.set_last_transition(next); |
| 968 | uncompiled.trans |
| 969 | } |
| 970 | |
| 971 | fn pop_root(&mut self) -> Vec<Transition> { |
| 972 | assert_eq!(self.state.uncompiled.len(), 1); |
| 973 | assert!(self.state.uncompiled[0].last.is_none()); |
| 974 | self.state.uncompiled.pop().expect("non-empty nodes").trans |
| 975 | } |
| 976 | |
| 977 | fn top_last_freeze(&mut self, next: StateID) { |
| 978 | let last = self |
| 979 | .state |
| 980 | .uncompiled |
| 981 | .len() |
| 982 | .checked_sub(1) |
| 983 | .expect("non-empty nodes"); |
| 984 | self.state.uncompiled[last].set_last_transition(next); |
| 985 | } |
| 986 | } |
| 987 | |
| 988 | impl Utf8Node { |
| 989 | fn set_last_transition(&mut self, next: StateID) { |
| 990 | if let Some(last) = self.last.take() { |
| 991 | self.trans.push(Transition { |
| 992 | start: last.start, |
| 993 | end: last.end, |
| 994 | next, |
| 995 | }); |
| 996 | } |
| 997 | } |
| 998 | } |
| 999 | |
| 1000 | #[cfg(test)] |
| 1001 | mod tests { |
| 1002 | use regex_syntax::hir::Hir; |
| 1003 | use regex_syntax::ParserBuilder; |
| 1004 | |
| 1005 | use super::{Builder, State, StateID, Transition, NFA}; |
| 1006 | |
| 1007 | fn parse(pattern: &str) -> Hir { |
| 1008 | ParserBuilder::new().build().parse(pattern).unwrap() |
| 1009 | } |
| 1010 | |
| 1011 | fn build(pattern: &str) -> NFA { |
| 1012 | Builder::new().anchored(true).build(&parse(pattern)).unwrap() |
| 1013 | } |
| 1014 | |
| 1015 | fn s_byte(byte: u8, next: StateID) -> State { |
| 1016 | let trans = Transition { start: byte, end: byte, next }; |
| 1017 | State::Range { range: trans } |
| 1018 | } |
| 1019 | |
| 1020 | fn s_range(start: u8, end: u8, next: StateID) -> State { |
| 1021 | let trans = Transition { start, end, next }; |
| 1022 | State::Range { range: trans } |
| 1023 | } |
| 1024 | |
| 1025 | fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State { |
| 1026 | let ranges = ranges |
| 1027 | .iter() |
| 1028 | .map(|&(start, end, next)| Transition { start, end, next }) |
| 1029 | .collect(); |
| 1030 | State::Sparse { ranges } |
| 1031 | } |
| 1032 | |
| 1033 | fn s_union(alts: &[StateID]) -> State { |
| 1034 | State::Union { alternates: alts.to_vec().into_boxed_slice() } |
| 1035 | } |
| 1036 | |
| 1037 | fn s_match() -> State { |
| 1038 | State::Match |
| 1039 | } |
| 1040 | |
| 1041 | #[test] |
| 1042 | fn errors() { |
| 1043 | // unsupported anchors |
| 1044 | assert!(Builder::new().build(&parse(r"^")).is_err()); |
| 1045 | assert!(Builder::new().build(&parse(r"$")).is_err()); |
| 1046 | assert!(Builder::new().build(&parse(r"\A")).is_err()); |
| 1047 | assert!(Builder::new().build(&parse(r"\z")).is_err()); |
| 1048 | |
| 1049 | // unsupported word boundaries |
| 1050 | assert!(Builder::new().build(&parse(r"\b")).is_err()); |
| 1051 | assert!(Builder::new().build(&parse(r"\B")).is_err()); |
| 1052 | assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err()); |
| 1053 | } |
| 1054 | |
| 1055 | // Test that building an unanchored NFA has an appropriate `.*?` prefix. |
| 1056 | #[test] |
| 1057 | fn compile_unanchored_prefix() { |
| 1058 | // When the machine can only match valid UTF-8. |
| 1059 | let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap(); |
| 1060 | // There should be many states since the `.` in `.*?` matches any |
| 1061 | // Unicode scalar value. |
| 1062 | assert_eq!(11, nfa.len()); |
| 1063 | assert_eq!(nfa.states[10], s_match()); |
| 1064 | assert_eq!(nfa.states[9], s_byte(b'a', 10)); |
| 1065 | |
| 1066 | // When the machine can match invalid UTF-8. |
| 1067 | let nfa = Builder::new() |
| 1068 | .anchored(false) |
| 1069 | .allow_invalid_utf8(true) |
| 1070 | .build(&parse(r"a")) |
| 1071 | .unwrap(); |
| 1072 | assert_eq!( |
| 1073 | nfa.states, |
| 1074 | &[ |
| 1075 | s_union(&[2, 1]), |
| 1076 | s_range(0, 255, 0), |
| 1077 | s_byte(b'a', 3), |
| 1078 | s_match(), |
| 1079 | ] |
| 1080 | ); |
| 1081 | } |
| 1082 | |
| 1083 | #[test] |
| 1084 | fn compile_empty() { |
| 1085 | assert_eq!(build("").states, &[s_match(),]); |
| 1086 | } |
| 1087 | |
| 1088 | #[test] |
| 1089 | fn compile_literal() { |
| 1090 | assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]); |
| 1091 | assert_eq!( |
| 1092 | build("ab").states, |
| 1093 | &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] |
| 1094 | ); |
| 1095 | assert_eq!( |
| 1096 | build("ā").states, |
| 1097 | &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),] |
| 1098 | ); |
| 1099 | |
| 1100 | // Check that non-UTF-8 literals work. |
| 1101 | let hir = ParserBuilder::new() |
| 1102 | .allow_invalid_utf8(true) |
| 1103 | .build() |
| 1104 | .parse(r"(?-u)\xFF") |
| 1105 | .unwrap(); |
| 1106 | let nfa = Builder::new() |
| 1107 | .anchored(true) |
| 1108 | .allow_invalid_utf8(true) |
| 1109 | .build(&hir) |
| 1110 | .unwrap(); |
| 1111 | assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]); |
| 1112 | } |
| 1113 | |
| 1114 | #[test] |
| 1115 | fn compile_class() { |
| 1116 | assert_eq!( |
| 1117 | build(r"[a-z]").states, |
| 1118 | &[s_range(b'a', b'z', 1), s_match(),] |
| 1119 | ); |
| 1120 | assert_eq!( |
| 1121 | build(r"[x-za-c]").states, |
| 1122 | &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()] |
| 1123 | ); |
| 1124 | assert_eq!( |
| 1125 | build(r"[\u03B1-\u03B4]").states, |
| 1126 | &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()] |
| 1127 | ); |
| 1128 | assert_eq!( |
| 1129 | build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states, |
| 1130 | &[ |
| 1131 | s_range(0xB1, 0xB4, 5), |
| 1132 | s_range(0x99, 0x9E, 5), |
| 1133 | s_byte(0xA4, 1), |
| 1134 | s_byte(0x9F, 2), |
| 1135 | s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]), |
| 1136 | s_match(), |
| 1137 | ] |
| 1138 | ); |
| 1139 | assert_eq!( |
| 1140 | build(r"[a-zā]").states, |
| 1141 | &[ |
| 1142 | s_byte(0x83, 3), |
| 1143 | s_byte(0x98, 0), |
| 1144 | s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]), |
| 1145 | s_match(), |
| 1146 | ] |
| 1147 | ); |
| 1148 | } |
| 1149 | |
| 1150 | #[test] |
| 1151 | fn compile_repetition() { |
| 1152 | assert_eq!( |
| 1153 | build(r"a?").states, |
| 1154 | &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),] |
| 1155 | ); |
| 1156 | assert_eq!( |
| 1157 | build(r"a??").states, |
| 1158 | &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),] |
| 1159 | ); |
| 1160 | } |
| 1161 | |
| 1162 | #[test] |
| 1163 | fn compile_group() { |
| 1164 | assert_eq!( |
| 1165 | build(r"ab+").states, |
| 1166 | &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),] |
| 1167 | ); |
| 1168 | assert_eq!( |
| 1169 | build(r"(ab)").states, |
| 1170 | &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] |
| 1171 | ); |
| 1172 | assert_eq!( |
| 1173 | build(r"(ab)+").states, |
| 1174 | &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),] |
| 1175 | ); |
| 1176 | } |
| 1177 | |
| 1178 | #[test] |
| 1179 | fn compile_alternation() { |
| 1180 | assert_eq!( |
| 1181 | build(r"a|b").states, |
| 1182 | &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),] |
| 1183 | ); |
| 1184 | assert_eq!( |
| 1185 | build(r"|b").states, |
| 1186 | &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),] |
| 1187 | ); |
| 1188 | assert_eq!( |
| 1189 | build(r"a|").states, |
| 1190 | &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),] |
| 1191 | ); |
| 1192 | } |
| 1193 | } |