| 1 | // This module provides an NFA compiler using Thompson's construction |
| 2 | // algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA |
| 3 | // graph as output. The NFA graph is structured in a way that permits it to be |
| 4 | // executed by a virtual machine and also used to efficiently build a DFA. |
| 5 | // |
| 6 | // The compiler deals with a slightly expanded set of NFA states that notably |
| 7 | // includes an empty node that has exactly one epsilon transition to the next |
| 8 | // state. In other words, it's a "goto" instruction if one views Thompson's NFA |
| 9 | // as a set of bytecode instructions. These goto instructions are removed in |
| 10 | // a subsequent phase before returning the NFA to the caller. The purpose of |
| 11 | // these empty nodes is that they make the construction algorithm substantially |
| 12 | // simpler to implement. We remove them before returning to the caller because |
| 13 | // they can represent substantial overhead when traversing the NFA graph |
| 14 | // (either while searching using the NFA directly or while building a DFA). |
| 15 | // |
| 16 | // In the future, it would be nice to provide a Glushkov compiler as well, |
| 17 | // as it would work well as a bit-parallel NFA for smaller regexes. But |
| 18 | // the Thompson construction is one I'm more familiar with and seems more |
| 19 | // straight-forward to deal with when it comes to large Unicode character |
| 20 | // classes. |
| 21 | // |
| 22 | // Internally, the compiler uses interior mutability to improve composition |
| 23 | // in the face of the borrow checker. In particular, we'd really like to be |
| 24 | // able to write things like this: |
| 25 | // |
| 26 | // self.c_concat(exprs.iter().map(|e| self.c(e))) |
| 27 | // |
| 28 | // Which elegantly uses iterators to build up a sequence of compiled regex |
| 29 | // sub-expressions and then hands it off to the concatenating compiler |
| 30 | // routine. Without interior mutability, the borrow checker won't let us |
| 31 | // borrow `self` mutably both inside and outside the closure at the same |
| 32 | // time. |
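//
// A minimal, self-contained sketch of that pattern (using hypothetical
// names, not this module's actual types) may help illustrate the point:
//
//     use std::cell::RefCell;
//
//     struct Sketch {
//         states: RefCell<Vec<u32>>,
//     }
//
//     impl Sketch {
//         // Taking `&self` here (instead of `&mut self`) is what lets the
//         // closure below call `push` while `concat` consumes the iterator.
//         fn push(&self, state: u32) -> usize {
//             let mut states = self.states.borrow_mut();
//             states.push(state);
//             states.len() - 1
//         }
//
//         fn concat<I: Iterator<Item = usize>>(&self, it: I) -> Vec<usize> {
//             it.collect()
//         }
//
//         fn build(&self, inputs: &[u32]) -> Vec<usize> {
//             // With `&mut self` methods, this single line would need two
//             // overlapping mutable borrows of `self` and would not compile.
//             self.concat(inputs.iter().map(|&s| self.push(s)))
//         }
//     }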
| 33 | |
| 34 | use std::cell::RefCell; |
| 35 | use std::mem; |
| 36 | |
| 37 | use regex_syntax::hir::{self, Hir, HirKind}; |
| 38 | use regex_syntax::utf8::{Utf8Range, Utf8Sequences}; |
| 39 | |
| 40 | use classes::ByteClassSet; |
| 41 | use error::{Error, Result}; |
| 42 | use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}; |
| 43 | use nfa::range_trie::RangeTrie; |
| 44 | use nfa::{State, StateID, Transition, NFA}; |
| 45 | |
| 46 | /// Config knobs for the NFA compiler. See the builder's methods for more |
| 47 | /// docs on each one. |
| 48 | #[derive(Clone, Copy, Debug)] |
| 49 | struct Config { |
| 50 | anchored: bool, |
| 51 | allow_invalid_utf8: bool, |
| 52 | reverse: bool, |
| 53 | shrink: bool, |
| 54 | } |
| 55 | |
| 56 | impl Default for Config { |
| 57 | fn default() -> Config { |
| 58 | Config { |
| 59 | anchored: false, |
| 60 | allow_invalid_utf8: false, |
| 61 | reverse: false, |
| 62 | shrink: true, |
| 63 | } |
| 64 | } |
| 65 | } |
| 66 | |
| 67 | /// A builder for compiling an NFA. |
| 68 | #[derive(Clone, Debug)] |
| 69 | pub struct Builder { |
| 70 | config: Config, |
| 71 | } |
| 72 | |
| 73 | impl Builder { |
| 74 | /// Create a new NFA builder with its default configuration. |
| 75 | pub fn new() -> Builder { |
| 76 | Builder { config: Config::default() } |
| 77 | } |
| 78 | |
| 79 | /// Compile the given high level intermediate representation of a regular |
| 80 | /// expression into an NFA. |
| 81 | /// |
| 82 | /// If there was a problem building the NFA, then an error is returned. |
| 83 | /// For example, if the regex uses unsupported features (such as zero-width |
| 84 | /// assertions), then an error is returned. |
| 85 | pub fn build(&self, expr: &Hir) -> Result<NFA> { |
| 86 | let mut nfa = NFA::always_match(); |
| 87 | self.build_with(&mut Compiler::new(), &mut nfa, expr)?; |
| 88 | Ok(nfa) |
| 89 | } |
| 90 | |
| 91 | /// Compile the given high level intermediate representation of a regular |
| 92 | /// expression into the given NFA using the given compiler. Callers may |
| 93 | /// prefer this over `build` if they would like to reuse allocations while |
| 94 | /// compiling many regular expressions. |
| 95 | /// |
| 96 | /// On success, the given NFA is completely overwritten with the NFA |
| 97 | /// produced by the compiler. |
| 98 | /// |
| 99 | /// If there was a problem building the NFA, then an error is returned. For |
| 100 | /// example, if the regex uses unsupported features (such as zero-width |
| 101 | /// assertions), then an error is returned. When an error is returned, |
| 102 | /// the contents of `nfa` are unspecified and should not be relied upon. |
| 103 | /// However, it can still be reused in subsequent calls to this method. |
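///
/// A sketch of the intended reuse pattern (illustrative only; `exprs` is a
/// hypothetical collection of parsed `Hir` values):
///
/// ```ignore
/// let builder = Builder::new();
/// let mut compiler = Compiler::new();
/// let mut nfa = NFA::always_match();
/// for expr in &exprs {
///     builder.build_with(&mut compiler, &mut nfa, expr)?;
///     // ... use `nfa` here; the next iteration overwrites it ...
/// }
/// ```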
| 104 | pub fn build_with( |
| 105 | &self, |
| 106 | compiler: &mut Compiler, |
| 107 | nfa: &mut NFA, |
| 108 | expr: &Hir, |
| 109 | ) -> Result<()> { |
| 110 | compiler.clear(); |
| 111 | compiler.configure(self.config); |
| 112 | compiler.compile(nfa, expr) |
| 113 | } |
| 114 | |
| 115 | /// Set whether matching must be anchored at the beginning of the input. |
| 116 | /// |
| 117 | /// When enabled, a match must begin at the start of the input. When |
| 118 | /// disabled, the NFA will act as if the pattern started with a `.*?`, |
| 119 | /// which enables a match to appear anywhere. |
| 120 | /// |
| 121 | /// By default this is disabled. |
| 122 | pub fn anchored(&mut self, yes: bool) -> &mut Builder { |
| 123 | self.config.anchored = yes; |
| 124 | self |
| 125 | } |
| 126 | |
| 127 | /// When enabled, the builder will permit the construction of an NFA that |
| 128 | /// may match invalid UTF-8. |
| 129 | /// |
| 130 | /// When disabled (the default), the builder is guaranteed to produce a |
| 131 | /// regex that will only ever match valid UTF-8 (otherwise, the builder |
| 132 | /// will return an error). |
| 133 | pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder { |
| 134 | self.config.allow_invalid_utf8 = yes; |
| 135 | self |
| 136 | } |
| 137 | |
| 138 | /// Reverse the NFA. |
| 139 | /// |
| 140 | /// An NFA reversal is performed by reversing all of the concatenated |
| 141 | /// sub-expressions in the original pattern, recursively. The resulting |
| 142 | /// NFA can be used to match the pattern starting from the end of a string |
| 143 | /// instead of the beginning of a string. |
| 144 | /// |
| 145 | /// Reversing the NFA is useful for building a reverse DFA, which is most |
| 146 | /// useful for finding the start of a match. |
| 147 | pub fn reverse(&mut self, yes: bool) -> &mut Builder { |
| 148 | self.config.reverse = yes; |
| 149 | self |
| 150 | } |
| 151 | |
| 152 | /// Apply best effort heuristics to shrink the NFA at the expense of more |
| 153 | /// time/memory. |
| 154 | /// |
| 155 | /// This is enabled by default. Generally speaking, if one is using an NFA |
| 156 | /// to compile a DFA, then the extra time used to shrink the NFA will be |
| 157 | /// more than made up for during DFA construction (potentially by a lot). |
| 158 | /// In other words, enabling this can substantially decrease the overall |
| 159 | /// amount of time it takes to build a DFA. |
| 160 | /// |
| 161 | /// The only reason to disable this is if you want to compile an NFA and start |
| 162 | /// using it as quickly as possible without needing to build a DFA. |
| 163 | pub fn shrink(&mut self, yes: bool) -> &mut Builder { |
| 164 | self.config.shrink = yes; |
| 165 | self |
| 166 | } |
| 167 | } |
| 168 | |
| 169 | /// A compiler that converts a regex abstract syntax to an NFA via Thompson's |
| 170 | /// construction. Namely, this compiler permits epsilon transitions between |
| 171 | /// states. |
| 172 | /// |
| 173 | /// Users of this crate cannot use a compiler directly. Instead, all one can |
| 174 | /// do is create one and use it via the |
| 175 | /// [`Builder::build_with`](struct.Builder.html#method.build_with) |
| 176 | /// method. This permits callers to reuse compilers in order to amortize |
| 177 | /// allocations. |
| 178 | #[derive(Clone, Debug)] |
| 179 | pub struct Compiler { |
| 180 | /// The set of compiled NFA states. Once a state is compiled, it is |
| 181 | /// assigned a state ID equivalent to its index in this list. Subsequent |
| 182 | /// compilation can modify previous states by adding new transitions. |
| 183 | states: RefCell<Vec<CState>>, |
| 184 | /// The configuration from the builder. |
| 185 | config: Config, |
| 186 | /// State used for compiling character classes to UTF-8 byte automata. |
| 187 | /// State is not retained between character class compilations. This just |
| 188 | /// serves to amortize allocation to the extent possible. |
| 189 | utf8_state: RefCell<Utf8State>, |
| 190 | /// State used for arranging character classes in reverse into a trie. |
| 191 | trie_state: RefCell<RangeTrie>, |
| 192 | /// State used for caching common suffixes when compiling reverse UTF-8 |
| 193 | /// automata (for Unicode character classes). |
| 194 | utf8_suffix: RefCell<Utf8SuffixMap>, |
| 195 | /// A map used to re-map state IDs when translating the compiler's internal |
| 196 | /// NFA state representation to the external NFA representation. |
| 197 | remap: RefCell<Vec<StateID>>, |
| 198 | /// A set of compiler internal state IDs that correspond to states that are |
| 199 | /// exclusively epsilon transitions, i.e., goto instructions, combined with |
| 200 | /// the state that they point to. This is used to record said states while |
| 201 | /// transforming the compiler's internal NFA representation to the external |
| 202 | /// form. |
| 203 | empties: RefCell<Vec<(StateID, StateID)>>, |
| 204 | } |
| 205 | |
| 206 | /// A compiler intermediate state representation for an NFA that is only used |
| 207 | /// during compilation. Once compilation is done, `CState`s are converted to |
| 208 | /// `State`s, which have a much simpler representation. |
| 209 | #[derive(Clone, Debug, Eq, PartialEq)] |
| 210 | enum CState { |
| 211 | /// An empty state whose only purpose is to forward the automaton to |
| 212 | /// another state via an epsilon transition. These are useful during |
| 213 | /// compilation but are otherwise removed at the end. |
| 214 | Empty { next: StateID }, |
| 215 | /// A state that only transitions to `next` if the current input byte is |
| 216 | /// in the range `[start, end]` (inclusive on both ends). |
| 217 | Range { range: Transition }, |
| 218 | /// A state with possibly many transitions, represented in a sparse |
| 219 | /// fashion. Transitions are ordered lexicographically by input range. |
| 220 | /// As such, this may only be used when every transition has equal |
| 221 | /// priority. (In practice, this is only used for encoding large UTF-8 |
| 222 | /// automata.) |
| 223 | Sparse { ranges: Vec<Transition> }, |
| 224 | /// An alternation such that there exists an epsilon transition to all |
| 225 | /// states in `alternates`, where matches found via earlier transitions |
| 226 | /// are preferred over later transitions. |
| 227 | Union { alternates: Vec<StateID> }, |
| 228 | /// An alternation such that there exists an epsilon transition to all |
| 229 | /// states in `alternates`, where matches found via later transitions |
| 230 | /// are preferred over earlier transitions. |
| 231 | /// |
| 232 | /// This "reverse" state exists for convenience during compilation that |
| 233 | /// permits easy construction of non-greedy combinations of NFA states. |
| 234 | /// At the end of compilation, Union and UnionReverse states are merged |
| 235 | /// into one Union type of state, where the latter has its epsilon |
| 236 | /// transitions reversed to reflect the priority inversion. |
| 237 | UnionReverse { alternates: Vec<StateID> }, |
| 238 | /// A match state. There is exactly one such occurrence of this state in |
| 239 | /// an NFA. |
| 240 | Match, |
| 241 | } |
| 242 | |
| 243 | /// A value that represents the result of compiling a sub-expression of a |
| 244 | /// regex's HIR. Specifically, this represents a sub-graph of the NFA that |
| 245 | /// has an initial state at `start` and a final state at `end`. |
| 246 | #[derive(Clone, Copy, Debug)] |
| 247 | pub struct ThompsonRef { |
| 248 | start: StateID, |
| 249 | end: StateID, |
| 250 | } |
| 251 | |
| 252 | impl Compiler { |
| 253 | /// Create a new compiler. |
| 254 | pub fn new() -> Compiler { |
| 255 | Compiler { |
| 256 | states: RefCell::new(vec![]), |
| 257 | config: Config::default(), |
| 258 | utf8_state: RefCell::new(Utf8State::new()), |
| 259 | trie_state: RefCell::new(RangeTrie::new()), |
| 260 | utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), |
| 261 | remap: RefCell::new(vec![]), |
| 262 | empties: RefCell::new(vec![]), |
| 263 | } |
| 264 | } |
| 265 | |
| 266 | /// Clear any memory used by this compiler such that it is ready to compile |
| 267 | /// a new regex. |
| 268 | /// |
| 269 | /// It is preferable to reuse a compiler if possible in order to reuse |
| 270 | /// allocations. |
| 271 | fn clear(&self) { |
| 272 | self.states.borrow_mut().clear(); |
| 273 | // We don't need to clear anything else since they are cleared on |
| 274 | // their own and only when they are used. |
| 275 | } |
| 276 | |
| 277 | /// Configure this compiler from the builder's knobs. |
| 278 | /// |
| 279 | /// The compiler is always reconfigured by the builder before using it to |
| 280 | /// build an NFA. |
| 281 | fn configure(&mut self, config: Config) { |
| 282 | self.config = config; |
| 283 | } |
| 284 | |
| 285 | /// Compile the given expression and populate the given NFA with the result. |
| 286 | fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> { |
| 287 | nfa.anchored = self.config.anchored; |
| 288 | |
| 289 | let mut start = self.add_empty(); |
| 290 | if !nfa.anchored { |
| 291 | let compiled = if self.config.allow_invalid_utf8 { |
| 292 | self.c_unanchored_prefix_invalid_utf8()? |
| 293 | } else { |
| 294 | self.c_unanchored_prefix_valid_utf8()? |
| 295 | }; |
| 296 | self.patch(start, compiled.start); |
| 297 | start = compiled.end; |
| 298 | } |
| 299 | let compiled = self.c(&expr)?; |
| 300 | let match_id = self.add_match(); |
| 301 | self.patch(start, compiled.start); |
| 302 | self.patch(compiled.end, match_id); |
| 303 | self.finish(nfa); |
| 304 | Ok(()) |
| 305 | } |
| 306 | |
| 307 | /// Finishes the compilation process and populates the provided NFA with |
| 308 | /// the final graph. |
| 309 | fn finish(&self, nfa: &mut NFA) { |
| 310 | let mut bstates = self.states.borrow_mut(); |
| 311 | let mut remap = self.remap.borrow_mut(); |
| 312 | remap.resize(bstates.len(), 0); |
| 313 | let mut empties = self.empties.borrow_mut(); |
| 314 | empties.clear(); |
| 315 | |
| 316 | // We don't reuse allocations here because this is what we're |
| 317 | // returning. |
| 318 | nfa.states.clear(); |
| 319 | let mut byteset = ByteClassSet::new(); |
| 320 | |
| 321 | // The idea here is to convert our intermediate states to their final |
| 322 | // form. The only real complexity here is the process of converting |
| 323 | // transitions, which are expressed in terms of state IDs. The new |
| 324 | // set of states will be smaller because of partial epsilon removal, |
| 325 | // so the state IDs will not be the same. |
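//
// As a concrete example, the anchored pattern `a` reaches this point with
// three intermediate states: [Empty { next: 1 }, Range { 'a' => 2 }, Match].
// The loop below assigns remap[1] = 0 and remap[2] = 1, the pass over
// `empties` then sets remap[0] = remap[1] = 0, and after re-mapping the
// transitions the final NFA is [Range { 'a' => 1 }, Match] with start = 0
// (exactly what the `compile_literal` test below expects).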
| 326 | for (id, bstate) in bstates.iter_mut().enumerate() { |
| 327 | match *bstate { |
| 328 | CState::Empty { next } => { |
| 329 | // Since we're removing empty states, we need to handle |
| 330 | // them later since we don't yet know which new state this |
| 331 | // empty state will be mapped to. |
| 332 | empties.push((id, next)); |
| 333 | } |
| 334 | CState::Range { ref range } => { |
| 335 | remap[id] = nfa.states.len(); |
| 336 | byteset.set_range(range.start, range.end); |
| 337 | nfa.states.push(State::Range { range: range.clone() }); |
| 338 | } |
| 339 | CState::Sparse { ref mut ranges } => { |
| 340 | remap[id] = nfa.states.len(); |
| 341 | |
| 342 | let ranges = mem::replace(ranges, vec![]); |
| 343 | for r in &ranges { |
| 344 | byteset.set_range(r.start, r.end); |
| 345 | } |
| 346 | nfa.states.push(State::Sparse { |
| 347 | ranges: ranges.into_boxed_slice(), |
| 348 | }); |
| 349 | } |
| 350 | CState::Union { ref mut alternates } => { |
| 351 | remap[id] = nfa.states.len(); |
| 352 | |
| 353 | let alternates = mem::replace(alternates, vec![]); |
| 354 | nfa.states.push(State::Union { |
| 355 | alternates: alternates.into_boxed_slice(), |
| 356 | }); |
| 357 | } |
| 358 | CState::UnionReverse { ref mut alternates } => { |
| 359 | remap[id] = nfa.states.len(); |
| 360 | |
| 361 | let mut alternates = mem::replace(alternates, vec![]); |
| 362 | alternates.reverse(); |
| 363 | nfa.states.push(State::Union { |
| 364 | alternates: alternates.into_boxed_slice(), |
| 365 | }); |
| 366 | } |
| 367 | CState::Match => { |
| 368 | remap[id] = nfa.states.len(); |
| 369 | nfa.states.push(State::Match); |
| 370 | } |
| 371 | } |
| 372 | } |
| 373 | for &(empty_id, mut empty_next) in empties.iter() { |
| 374 | // empty states can point to other empty states, forming a chain. |
| 375 | // So we must follow the chain until the end, which must end at |
| 376 | // a non-empty state, and therefore, a state that is correctly |
| 377 | // remapped. We are guaranteed to terminate because our compiler |
| 378 | // never builds a loop among empty states. |
| 379 | while let CState::Empty { next } = bstates[empty_next] { |
| 380 | empty_next = next; |
| 381 | } |
| 382 | remap[empty_id] = remap[empty_next]; |
| 383 | } |
| 384 | for state in &mut nfa.states { |
| 385 | state.remap(&remap); |
| 386 | } |
| 387 | // The compiler always begins the NFA at the first state. |
| 388 | nfa.start = remap[0]; |
| 389 | nfa.byte_classes = byteset.byte_classes(); |
| 390 | } |
| 391 | |
| 392 | fn c(&self, expr: &Hir) -> Result<ThompsonRef> { |
| 393 | match *expr.kind() { |
| 394 | HirKind::Empty => { |
| 395 | let id = self.add_empty(); |
| 396 | Ok(ThompsonRef { start: id, end: id }) |
| 397 | } |
| 398 | HirKind::Literal(hir::Literal::Unicode(ch)) => { |
| 399 | let mut buf = [0; 4]; |
| 400 | let it = ch |
| 401 | .encode_utf8(&mut buf) |
| 402 | .as_bytes() |
| 403 | .iter() |
| 404 | .map(|&b| Ok(self.c_range(b, b))); |
| 405 | self.c_concat(it) |
| 406 | } |
| 407 | HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)), |
| 408 | HirKind::Class(hir::Class::Bytes(ref cls)) => { |
| 409 | self.c_byte_class(cls) |
| 410 | } |
| 411 | HirKind::Class(hir::Class::Unicode(ref cls)) => { |
| 412 | self.c_unicode_class(cls) |
| 413 | } |
| 414 | HirKind::Repetition(ref rep) => self.c_repetition(rep), |
| 415 | HirKind::Group(ref group) => self.c(&*group.hir), |
| 416 | HirKind::Concat(ref exprs) => { |
| 417 | self.c_concat(exprs.iter().map(|e| self.c(e))) |
| 418 | } |
| 419 | HirKind::Alternation(ref exprs) => { |
| 420 | self.c_alternation(exprs.iter().map(|e| self.c(e))) |
| 421 | } |
| 422 | HirKind::Anchor(_) => Err(Error::unsupported_anchor()), |
| 423 | HirKind::WordBoundary(_) => Err(Error::unsupported_word()), |
| 424 | } |
| 425 | } |
| 426 | |
| 427 | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef> |
| 428 | where |
| 429 | I: DoubleEndedIterator<Item = Result<ThompsonRef>>, |
| 430 | { |
| 431 | let first = |
| 432 | if self.config.reverse { it.next_back() } else { it.next() }; |
| 433 | let ThompsonRef { start, mut end } = match first { |
| 434 | Some(result) => result?, |
| 435 | None => return Ok(self.c_empty()), |
| 436 | }; |
| 437 | loop { |
| 438 | let next = |
| 439 | if self.config.reverse { it.next_back() } else { it.next() }; |
| 440 | let compiled = match next { |
| 441 | Some(result) => result?, |
| 442 | None => break, |
| 443 | }; |
| 444 | self.patch(end, compiled.start); |
| 445 | end = compiled.end; |
| 446 | } |
| 447 | Ok(ThompsonRef { start, end }) |
| 448 | } |
| 449 | |
| 450 | fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef> |
| 451 | where |
| 452 | I: Iterator<Item = Result<ThompsonRef>>, |
| 453 | { |
| 454 | let first = it.next().expect("alternations must be non-empty")?; |
| 455 | let second = match it.next() { |
| 456 | None => return Ok(first), |
| 457 | Some(result) => result?, |
| 458 | }; |
| 459 | |
| 460 | let union = self.add_union(); |
| 461 | let end = self.add_empty(); |
| 462 | self.patch(union, first.start); |
| 463 | self.patch(first.end, end); |
| 464 | self.patch(union, second.start); |
| 465 | self.patch(second.end, end); |
| 466 | for result in it { |
| 467 | let compiled = result?; |
| 468 | self.patch(union, compiled.start); |
| 469 | self.patch(compiled.end, end); |
| 470 | } |
| 471 | Ok(ThompsonRef { start: union, end }) |
| 472 | } |
| 473 | |
| 474 | fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> { |
| 475 | match rep.kind { |
| 476 | hir::RepetitionKind::ZeroOrOne => { |
| 477 | self.c_zero_or_one(&rep.hir, rep.greedy) |
| 478 | } |
| 479 | hir::RepetitionKind::ZeroOrMore => { |
| 480 | self.c_at_least(&rep.hir, rep.greedy, 0) |
| 481 | } |
| 482 | hir::RepetitionKind::OneOrMore => { |
| 483 | self.c_at_least(&rep.hir, rep.greedy, 1) |
| 484 | } |
| 485 | hir::RepetitionKind::Range(ref rng) => match *rng { |
| 486 | hir::RepetitionRange::Exactly(count) => { |
| 487 | self.c_exactly(&rep.hir, count) |
| 488 | } |
| 489 | hir::RepetitionRange::AtLeast(m) => { |
| 490 | self.c_at_least(&rep.hir, rep.greedy, m) |
| 491 | } |
| 492 | hir::RepetitionRange::Bounded(min, max) => { |
| 493 | self.c_bounded(&rep.hir, rep.greedy, min, max) |
| 494 | } |
| 495 | }, |
| 496 | } |
| 497 | } |
| 498 | |
| 499 | fn c_bounded( |
| 500 | &self, |
| 501 | expr: &Hir, |
| 502 | greedy: bool, |
| 503 | min: u32, |
| 504 | max: u32, |
| 505 | ) -> Result<ThompsonRef> { |
| 506 | let prefix = self.c_exactly(expr, min)?; |
| 507 | if min == max { |
| 508 | return Ok(prefix); |
| 509 | } |
| 510 | |
| 511 | // It is tempting to compile the rest here as a concatenation |
| 512 | // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it |
| 513 | // were `aaa?a?a?`. The problem here is that it leads to this program: |
| 514 | // |
| 515 | // >000000: 61 => 01 |
| 516 | // 000001: 61 => 02 |
| 517 | // 000002: alt(03, 04) |
| 518 | // 000003: 61 => 04 |
| 519 | // 000004: alt(05, 06) |
| 520 | // 000005: 61 => 06 |
| 521 | // 000006: alt(07, 08) |
| 522 | // 000007: 61 => 08 |
| 523 | // 000008: MATCH |
| 524 | // |
| 525 | // And effectively, once you hit state 2, the epsilon closure will |
| 526 | // include states 3, 4, 5, 6, 7 and 8, which is quite a bit. It is |
| 527 | // better to instead compile it like so: |
| 528 | // |
| 529 | // >000000: 61 => 01 |
| 530 | // 000001: 61 => 02 |
| 531 | // 000002: alt(03, 08) |
| 532 | // 000003: 61 => 04 |
| 533 | // 000004: alt(05, 08) |
| 534 | // 000005: 61 => 06 |
| 535 | // 000006: alt(07, 08) |
| 536 | // 000007: 61 => 08 |
| 537 | // 000008: MATCH |
| 538 | // |
| 539 | // So that the epsilon closure of state 2 is now just 3 and 8. |
| 540 | let empty = self.add_empty(); |
| 541 | let mut prev_end = prefix.end; |
| 542 | for _ in min..max { |
| 543 | let union = if greedy { |
| 544 | self.add_union() |
| 545 | } else { |
| 546 | self.add_reverse_union() |
| 547 | }; |
| 548 | let compiled = self.c(expr)?; |
| 549 | self.patch(prev_end, union); |
| 550 | self.patch(union, compiled.start); |
| 551 | self.patch(union, empty); |
| 552 | prev_end = compiled.end; |
| 553 | } |
| 554 | self.patch(prev_end, empty); |
| 555 | Ok(ThompsonRef { start: prefix.start, end: empty }) |
| 556 | } |
| 557 | |
| 558 | fn c_at_least( |
| 559 | &self, |
| 560 | expr: &Hir, |
| 561 | greedy: bool, |
| 562 | n: u32, |
| 563 | ) -> Result<ThompsonRef> { |
| 564 | if n == 0 { |
| 565 | let union = if greedy { |
| 566 | self.add_union() |
| 567 | } else { |
| 568 | self.add_reverse_union() |
| 569 | }; |
| 570 | let compiled = self.c(expr)?; |
| 571 | self.patch(union, compiled.start); |
| 572 | self.patch(compiled.end, union); |
| 573 | Ok(ThompsonRef { start: union, end: union }) |
| 574 | } else if n == 1 { |
| 575 | let compiled = self.c(expr)?; |
| 576 | let union = if greedy { |
| 577 | self.add_union() |
| 578 | } else { |
| 579 | self.add_reverse_union() |
| 580 | }; |
| 581 | self.patch(compiled.end, union); |
| 582 | self.patch(union, compiled.start); |
| 583 | Ok(ThompsonRef { start: compiled.start, end: union }) |
| 584 | } else { |
| 585 | let prefix = self.c_exactly(expr, n - 1)?; |
| 586 | let last = self.c(expr)?; |
| 587 | let union = if greedy { |
| 588 | self.add_union() |
| 589 | } else { |
| 590 | self.add_reverse_union() |
| 591 | }; |
| 592 | self.patch(prefix.end, last.start); |
| 593 | self.patch(last.end, union); |
| 594 | self.patch(union, last.start); |
| 595 | Ok(ThompsonRef { start: prefix.start, end: union }) |
| 596 | } |
| 597 | } |
| 598 | |
| 599 | fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> { |
| 600 | let union = |
| 601 | if greedy { self.add_union() } else { self.add_reverse_union() }; |
| 602 | let compiled = self.c(expr)?; |
| 603 | let empty = self.add_empty(); |
| 604 | self.patch(union, compiled.start); |
| 605 | self.patch(union, empty); |
| 606 | self.patch(compiled.end, empty); |
| 607 | Ok(ThompsonRef { start: union, end: empty }) |
| 608 | } |
| 609 | |
| 610 | fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> { |
| 611 | let it = (0..n).map(|_| self.c(expr)); |
| 612 | self.c_concat(it) |
| 613 | } |
| 614 | |
| 615 | fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> { |
| 616 | let end = self.add_empty(); |
| 617 | let mut trans = Vec::with_capacity(cls.ranges().len()); |
| 618 | for r in cls.iter() { |
| 619 | trans.push(Transition { |
| 620 | start: r.start(), |
| 621 | end: r.end(), |
| 622 | next: end, |
| 623 | }); |
| 624 | } |
| 625 | Ok(ThompsonRef { start: self.add_sparse(trans), end }) |
| 626 | } |
| 627 | |
| 628 | fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> { |
| 629 | // If all we have are ASCII ranges wrapped in a Unicode package, then |
| 630 | // there is zero reason to bring out the big guns. We can fit all ASCII |
| 631 | // ranges within a single sparse state. |
| 632 | if cls.is_all_ascii() { |
| 633 | let end = self.add_empty(); |
| 634 | let mut trans = Vec::with_capacity(cls.ranges().len()); |
| 635 | for r in cls.iter() { |
| 636 | assert!(r.start() <= '\x7F'); |
| 637 | assert!(r.end() <= '\x7F'); |
| 638 | trans.push(Transition { |
| 639 | start: r.start() as u8, |
| 640 | end: r.end() as u8, |
| 641 | next: end, |
| 642 | }); |
| 643 | } |
| 644 | Ok(ThompsonRef { start: self.add_sparse(trans), end }) |
| 645 | } else if self.config.reverse { |
| 646 | if !self.config.shrink { |
| 647 | // When we don't want to spend the extra time shrinking, we |
| 648 | // compile the UTF-8 automaton in reverse using something like |
| 649 | // the "naive" approach, but will attempt to re-use common |
| 650 | // suffixes. |
| 651 | self.c_unicode_class_reverse_with_suffix(cls) |
| 652 | } else { |
| 653 | // When we want to shrink our NFA for reverse UTF-8 automata, |
| 654 | // we cannot feed UTF-8 sequences directly to the UTF-8 |
| 655 | // compiler, since the UTF-8 compiler requires all sequences |
| 656 | // to be lexicographically sorted. Instead, we organize our |
| 657 | // sequences into a range trie, which can then output our |
| 658 | // sequences in the correct order. Unfortunately, building the |
| 659 | // range trie is fairly expensive (but not nearly as expensive |
| 660 | // as building a DFA). Hence the reason why the 'shrink' option |
| 661 | // exists, so that this path can be toggled off. |
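//
// As a small illustration: the class [α-δ] (U+03B1..U+03B4) yields the
// single UTF-8 sequence [CE-CE][B1-B4]; after `seq.reverse()` below it
// becomes [B1-B4][CE-CE], and it is that reversed form which gets
// inserted into the trie.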
| 662 | let mut trie = self.trie_state.borrow_mut(); |
| 663 | trie.clear(); |
| 664 | |
| 665 | for rng in cls.iter() { |
| 666 | for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { |
| 667 | seq.reverse(); |
| 668 | trie.insert(seq.as_slice()); |
| 669 | } |
| 670 | } |
| 671 | let mut utf8_state = self.utf8_state.borrow_mut(); |
| 672 | let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); |
| 673 | trie.iter(|seq| { |
| 674 | utf8c.add(&seq); |
| 675 | }); |
| 676 | Ok(utf8c.finish()) |
| 677 | } |
| 678 | } else { |
| 679 | // In the forward direction, we always shrink our UTF-8 automata |
| 680 | // because we can stream it right into the UTF-8 compiler. There |
| 681 | // is almost no downside (in either memory or time) to using this |
| 682 | // approach. |
| 683 | let mut utf8_state = self.utf8_state.borrow_mut(); |
| 684 | let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); |
| 685 | for rng in cls.iter() { |
| 686 | for seq in Utf8Sequences::new(rng.start(), rng.end()) { |
| 687 | utf8c.add(seq.as_slice()); |
| 688 | } |
| 689 | } |
| 690 | Ok(utf8c.finish()) |
| 691 | } |
| 692 | |
| 693 | // For reference, the code below is the "naive" version of compiling a |
| 694 | // UTF-8 automaton. It is deliciously simple (and works for both the |
| 695 | // forward and reverse cases), but will unfortunately produce very |
| 696 | // large NFAs. When compiling a forward automaton, the size difference |
| 697 | // can sometimes be an order of magnitude. For example, the '\w' regex |
| 698 | // will generate about ~3000 NFA states using the naive approach below, |
| 699 | // but only 283 states when using the approach above. This is because |
| 700 | // the approach above actually compiles a *minimal* (or near minimal, |
| 701 | // because of the bounded hashmap) UTF-8 automaton. |
| 702 | // |
| 703 | // The code below is kept as a reference point in order to make it |
| 704 | // easier to understand the higher level goal here. |
| 705 | /* |
| 706 | let it = cls |
| 707 | .iter() |
| 708 | .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) |
| 709 | .map(|seq| { |
| 710 | let it = seq |
| 711 | .as_slice() |
| 712 | .iter() |
| 713 | .map(|rng| Ok(self.c_range(rng.start, rng.end))); |
| 714 | self.c_concat(it) |
| 715 | }); |
| 716 | self.c_alternation(it); |
| 717 | */ |
| 718 | } |
| 719 | |
| 720 | fn c_unicode_class_reverse_with_suffix( |
| 721 | &self, |
| 722 | cls: &hir::ClassUnicode, |
| 723 | ) -> Result<ThompsonRef> { |
| 724 | // N.B. It would likely be better to cache common *prefixes* in the |
| 725 | // reverse direction, but it's not quite clear how to do that. The |
| 726 | // advantage of caching suffixes is that it does give us a win, and |
| 727 | // has a very small additional overhead. |
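//
// As an illustration of the win: for a class like [α-βε-ζ], every scalar
// value involved encodes as 0xCE followed by one trailing byte, so each
// reversed path ends with the same `CE => alt_end` transition. The cache
// below collapses those identical tails into a single state instead of
// compiling one per sequence.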
| 728 | let mut cache = self.utf8_suffix.borrow_mut(); |
| 729 | cache.clear(); |
| 730 | |
| 731 | let union = self.add_union(); |
| 732 | let alt_end = self.add_empty(); |
| 733 | for urng in cls.iter() { |
| 734 | for seq in Utf8Sequences::new(urng.start(), urng.end()) { |
| 735 | let mut end = alt_end; |
| 736 | for brng in seq.as_slice() { |
| 737 | let key = Utf8SuffixKey { |
| 738 | from: end, |
| 739 | start: brng.start, |
| 740 | end: brng.end, |
| 741 | }; |
| 742 | let hash = cache.hash(&key); |
| 743 | if let Some(id) = cache.get(&key, hash) { |
| 744 | end = id; |
| 745 | continue; |
| 746 | } |
| 747 | |
| 748 | let compiled = self.c_range(brng.start, brng.end); |
| 749 | self.patch(compiled.end, end); |
| 750 | end = compiled.start; |
| 751 | cache.set(key, hash, end); |
| 752 | } |
| 753 | self.patch(union, end); |
| 754 | } |
| 755 | } |
| 756 | Ok(ThompsonRef { start: union, end: alt_end }) |
| 757 | } |
| 758 | |
| 759 | fn c_range(&self, start: u8, end: u8) -> ThompsonRef { |
| 760 | let id = self.add_range(start, end); |
| 761 | ThompsonRef { start: id, end: id } |
| 762 | } |
| 763 | |
| 764 | fn c_empty(&self) -> ThompsonRef { |
| 765 | let id = self.add_empty(); |
| 766 | ThompsonRef { start: id, end: id } |
| 767 | } |
| 768 | |
| 769 | fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> { |
| 770 | self.c(&Hir::repetition(hir::Repetition { |
| 771 | kind: hir::RepetitionKind::ZeroOrMore, |
| 772 | greedy: false, |
| 773 | hir: Box::new(Hir::any(false)), |
| 774 | })) |
| 775 | } |
| 776 | |
| 777 | fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> { |
| 778 | self.c(&Hir::repetition(hir::Repetition { |
| 779 | kind: hir::RepetitionKind::ZeroOrMore, |
| 780 | greedy: false, |
| 781 | hir: Box::new(Hir::any(true)), |
| 782 | })) |
| 783 | } |
| 784 | |
| 785 | fn patch(&self, from: StateID, to: StateID) { |
| 786 | match self.states.borrow_mut()[from] { |
| 787 | CState::Empty { ref mut next } => { |
| 788 | *next = to; |
| 789 | } |
| 790 | CState::Range { ref mut range } => { |
| 791 | range.next = to; |
| 792 | } |
| 793 | CState::Sparse { .. } => { |
| 794 | panic!("cannot patch from a sparse NFA state") |
| 795 | } |
| 796 | CState::Union { ref mut alternates } => { |
| 797 | alternates.push(to); |
| 798 | } |
| 799 | CState::UnionReverse { ref mut alternates } => { |
| 800 | alternates.push(to); |
| 801 | } |
| 802 | CState::Match => {} |
| 803 | } |
| 804 | } |
| 805 | |
| 806 | fn add_empty(&self) -> StateID { |
| 807 | let id = self.states.borrow().len(); |
| 808 | self.states.borrow_mut().push(CState::Empty { next: 0 }); |
| 809 | id |
| 810 | } |
| 811 | |
| 812 | fn add_range(&self, start: u8, end: u8) -> StateID { |
| 813 | let id = self.states.borrow().len(); |
| 814 | let trans = Transition { start, end, next: 0 }; |
| 815 | let state = CState::Range { range: trans }; |
| 816 | self.states.borrow_mut().push(state); |
| 817 | id |
| 818 | } |
| 819 | |
| 820 | fn add_sparse(&self, ranges: Vec<Transition>) -> StateID { |
| 821 | if ranges.len() == 1 { |
| 822 | let id = self.states.borrow().len(); |
| 823 | let state = CState::Range { range: ranges[0] }; |
| 824 | self.states.borrow_mut().push(state); |
| 825 | return id; |
| 826 | } |
| 827 | let id = self.states.borrow().len(); |
| 828 | let state = CState::Sparse { ranges }; |
| 829 | self.states.borrow_mut().push(state); |
| 830 | id |
| 831 | } |
| 832 | |
| 833 | fn add_union(&self) -> StateID { |
| 834 | let id = self.states.borrow().len(); |
| 835 | let state = CState::Union { alternates: vec![] }; |
| 836 | self.states.borrow_mut().push(state); |
| 837 | id |
| 838 | } |
| 839 | |
| 840 | fn add_reverse_union(&self) -> StateID { |
| 841 | let id = self.states.borrow().len(); |
| 842 | let state = CState::UnionReverse { alternates: vec![] }; |
| 843 | self.states.borrow_mut().push(state); |
| 844 | id |
| 845 | } |
| 846 | |
| 847 | fn add_match(&self) -> StateID { |
| 848 | let id = self.states.borrow().len(); |
| 849 | self.states.borrow_mut().push(CState::Match); |
| 850 | id |
| 851 | } |
| 852 | } |
| 853 | |
| 854 | #[derive(Debug)] |
| 855 | struct Utf8Compiler<'a> { |
| 856 | nfac: &'a Compiler, |
| 857 | state: &'a mut Utf8State, |
| 858 | target: StateID, |
| 859 | } |
| 860 | |
| 861 | #[derive(Clone, Debug)] |
| 862 | struct Utf8State { |
| 863 | compiled: Utf8BoundedMap, |
| 864 | uncompiled: Vec<Utf8Node>, |
| 865 | } |
| 866 | |
| 867 | #[derive(Clone, Debug)] |
| 868 | struct Utf8Node { |
| 869 | trans: Vec<Transition>, |
| 870 | last: Option<Utf8LastTransition>, |
| 871 | } |
| 872 | |
| 873 | #[derive(Clone, Debug)] |
| 874 | struct Utf8LastTransition { |
| 875 | start: u8, |
| 876 | end: u8, |
| 877 | } |
| 878 | |
| 879 | impl Utf8State { |
| 880 | fn new() -> Utf8State { |
| 881 | Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] } |
| 882 | } |
| 883 | |
| 884 | fn clear(&mut self) { |
| 885 | self.compiled.clear(); |
| 886 | self.uncompiled.clear(); |
| 887 | } |
| 888 | } |
| 889 | |
| 890 | impl<'a> Utf8Compiler<'a> { |
| 891 | fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> { |
| 892 | let target = nfac.add_empty(); |
| 893 | state.clear(); |
| 894 | let mut utf8c = Utf8Compiler { nfac, state, target }; |
| 895 | utf8c.add_empty(); |
| 896 | utf8c |
| 897 | } |
| 898 | |
| 899 | fn finish(&mut self) -> ThompsonRef { |
| 900 | self.compile_from(0); |
| 901 | let node = self.pop_root(); |
| 902 | let start = self.compile(node); |
| 903 | ThompsonRef { start, end: self.target } |
| 904 | } |
| 905 | |
| 906 | fn add(&mut self, ranges: &[Utf8Range]) { |
| 907 | let prefix_len = ranges |
| 908 | .iter() |
| 909 | .zip(&self.state.uncompiled) |
| 910 | .take_while(|&(range, node)| { |
| 911 | node.last.as_ref().map_or(false, |t| { |
| 912 | (t.start, t.end) == (range.start, range.end) |
| 913 | }) |
| 914 | }) |
| 915 | .count(); |
| 916 | assert!(prefix_len < ranges.len()); |
| 917 | self.compile_from(prefix_len); |
| 918 | self.add_suffix(&ranges[prefix_len..]); |
| 919 | } |
| 920 | |
| 921 | fn compile_from(&mut self, from: usize) { |
| 922 | let mut next = self.target; |
| 923 | while from + 1 < self.state.uncompiled.len() { |
| 924 | let node = self.pop_freeze(next); |
| 925 | next = self.compile(node); |
| 926 | } |
| 927 | self.top_last_freeze(next); |
| 928 | } |
| 929 | |
| 930 | fn compile(&mut self, node: Vec<Transition>) -> StateID { |
| 931 | let hash = self.state.compiled.hash(&node); |
| 932 | if let Some(id) = self.state.compiled.get(&node, hash) { |
| 933 | return id; |
| 934 | } |
| 935 | let id = self.nfac.add_sparse(node.clone()); |
| 936 | self.state.compiled.set(node, hash, id); |
| 937 | id |
| 938 | } |
| 939 | |
| 940 | fn add_suffix(&mut self, ranges: &[Utf8Range]) { |
| 941 | assert!(!ranges.is_empty()); |
| 942 | let last = self |
| 943 | .state |
| 944 | .uncompiled |
| 945 | .len() |
| 946 | .checked_sub(1) |
| 947 | .expect("non-empty nodes"); |
| 948 | assert!(self.state.uncompiled[last].last.is_none()); |
| 949 | self.state.uncompiled[last].last = Some(Utf8LastTransition { |
| 950 | start: ranges[0].start, |
| 951 | end: ranges[0].end, |
| 952 | }); |
| 953 | for r in &ranges[1..] { |
| 954 | self.state.uncompiled.push(Utf8Node { |
| 955 | trans: vec![], |
| 956 | last: Some(Utf8LastTransition { start: r.start, end: r.end }), |
| 957 | }); |
| 958 | } |
| 959 | } |
| 960 | |
| 961 | fn add_empty(&mut self) { |
| 962 | self.state.uncompiled.push(Utf8Node { trans: vec![], last: None }); |
| 963 | } |
| 964 | |
| 965 | fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> { |
| 966 | let mut uncompiled = self.state.uncompiled.pop().unwrap(); |
| 967 | uncompiled.set_last_transition(next); |
| 968 | uncompiled.trans |
| 969 | } |
| 970 | |
| 971 | fn pop_root(&mut self) -> Vec<Transition> { |
| 972 | assert_eq!(self.state.uncompiled.len(), 1); |
| 973 | assert!(self.state.uncompiled[0].last.is_none()); |
| 974 | self.state.uncompiled.pop().expect("non-empty nodes").trans |
| 975 | } |
| 976 | |
| 977 | fn top_last_freeze(&mut self, next: StateID) { |
| 978 | let last = self |
| 979 | .state |
| 980 | .uncompiled |
| 981 | .len() |
| 982 | .checked_sub(1) |
| 983 | .expect("non-empty nodes"); |
| 984 | self.state.uncompiled[last].set_last_transition(next); |
| 985 | } |
| 986 | } |
| 987 | |
| 988 | impl Utf8Node { |
| 989 | fn set_last_transition(&mut self, next: StateID) { |
| 990 | if let Some(last) = self.last.take() { |
| 991 | self.trans.push(Transition { |
| 992 | start: last.start, |
| 993 | end: last.end, |
| 994 | next, |
| 995 | }); |
| 996 | } |
| 997 | } |
| 998 | } |
| 999 | |
| 1000 | #[cfg(test)] |
| 1001 | mod tests { |
| 1002 | use regex_syntax::hir::Hir; |
| 1003 | use regex_syntax::ParserBuilder; |
| 1004 | |
| 1005 | use super::{Builder, State, StateID, Transition, NFA}; |
| 1006 | |
| 1007 | fn parse(pattern: &str) -> Hir { |
| 1008 | ParserBuilder::new().build().parse(pattern).unwrap() |
| 1009 | } |
| 1010 | |
| 1011 | fn build(pattern: &str) -> NFA { |
| 1012 | Builder::new().anchored(true).build(&parse(pattern)).unwrap() |
| 1013 | } |
| 1014 | |
| 1015 | fn s_byte(byte: u8, next: StateID) -> State { |
| 1016 | let trans = Transition { start: byte, end: byte, next }; |
| 1017 | State::Range { range: trans } |
| 1018 | } |
| 1019 | |
| 1020 | fn s_range(start: u8, end: u8, next: StateID) -> State { |
| 1021 | let trans = Transition { start, end, next }; |
| 1022 | State::Range { range: trans } |
| 1023 | } |
| 1024 | |
| 1025 | fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State { |
| 1026 | let ranges = ranges |
| 1027 | .iter() |
| 1028 | .map(|&(start, end, next)| Transition { start, end, next }) |
| 1029 | .collect(); |
| 1030 | State::Sparse { ranges } |
| 1031 | } |
| 1032 | |
| 1033 | fn s_union(alts: &[StateID]) -> State { |
| 1034 | State::Union { alternates: alts.to_vec().into_boxed_slice() } |
| 1035 | } |
| 1036 | |
| 1037 | fn s_match() -> State { |
| 1038 | State::Match |
| 1039 | } |
| 1040 | |
| 1041 | #[test] |
| 1042 | fn errors() { |
| 1043 | // unsupported anchors |
| 1044 | assert!(Builder::new().build(&parse(r"^")).is_err()); |
| 1045 | assert!(Builder::new().build(&parse(r"$")).is_err()); |
| 1046 | assert!(Builder::new().build(&parse(r"\A")).is_err()); |
| 1047 | assert!(Builder::new().build(&parse(r"\z")).is_err()); |
| 1048 | |
| 1049 | // unsupported word boundaries |
| 1050 | assert!(Builder::new().build(&parse(r"\b")).is_err()); |
| 1051 | assert!(Builder::new().build(&parse(r"\B")).is_err()); |
| 1052 | assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err()); |
| 1053 | } |
| 1054 | |
| 1055 | // Test that building an unanchored NFA has an appropriate `.*?` prefix. |
| 1056 | #[test] |
| 1057 | fn compile_unanchored_prefix() { |
| 1058 | // When the machine can only match valid UTF-8. |
| 1059 | let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap(); |
| 1060 | // There should be many states since the `.` in `.*?` matches any |
| 1061 | // Unicode scalar value. |
| 1062 | assert_eq!(11, nfa.len()); |
| 1063 | assert_eq!(nfa.states[10], s_match()); |
| 1064 | assert_eq!(nfa.states[9], s_byte(b'a', 10)); |
| 1065 | |
| 1066 | // When the machine can match invalid UTF-8. |
| 1067 | let nfa = Builder::new() |
| 1068 | .anchored(false) |
| 1069 | .allow_invalid_utf8(true) |
| 1070 | .build(&parse(r"a")) |
| 1071 | .unwrap(); |
| 1072 | assert_eq!( |
| 1073 | nfa.states, |
| 1074 | &[ |
| 1075 | s_union(&[2, 1]), |
| 1076 | s_range(0, 255, 0), |
| 1077 | s_byte(b'a', 3), |
| 1078 | s_match(), |
| 1079 | ] |
| 1080 | ); |
| 1081 | } |
| 1082 | |
| 1083 | #[test] |
| 1084 | fn compile_empty() { |
| 1085 | assert_eq!(build("").states, &[s_match(),]); |
| 1086 | } |
| 1087 | |
| 1088 | #[test] |
| 1089 | fn compile_literal() { |
| 1090 | assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]); |
| 1091 | assert_eq!( |
| 1092 | build("ab").states, |
| 1093 | &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] |
| 1094 | ); |
| 1095 | assert_eq!( |
| 1096 | build("ā").states, |
| 1097 | &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),] |
| 1098 | ); |
| 1099 | |
| 1100 | // Check that non-UTF-8 literals work. |
| 1101 | let hir = ParserBuilder::new() |
| 1102 | .allow_invalid_utf8(true) |
| 1103 | .build() |
| 1104 | .parse(r"(?-u)\xFF") |
| 1105 | .unwrap(); |
| 1106 | let nfa = Builder::new() |
| 1107 | .anchored(true) |
| 1108 | .allow_invalid_utf8(true) |
| 1109 | .build(&hir) |
| 1110 | .unwrap(); |
| 1111 | assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]); |
| 1112 | } |
| 1113 | |
| 1114 | #[test] |
| 1115 | fn compile_class() { |
| 1116 | assert_eq!( |
| 1117 | build(r"[a-z]").states, |
| 1118 | &[s_range(b'a', b'z', 1), s_match(),] |
| 1119 | ); |
| 1120 | assert_eq!( |
| 1121 | build(r"[x-za-c]").states, |
| 1122 | &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()] |
| 1123 | ); |
| 1124 | assert_eq!( |
| 1125 | build(r"[\u03B1-\u03B4]").states, |
| 1126 | &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()] |
| 1127 | ); |
| 1128 | assert_eq!( |
| 1129 | build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states, |
| 1130 | &[ |
| 1131 | s_range(0xB1, 0xB4, 5), |
| 1132 | s_range(0x99, 0x9E, 5), |
| 1133 | s_byte(0xA4, 1), |
| 1134 | s_byte(0x9F, 2), |
| 1135 | s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]), |
| 1136 | s_match(), |
| 1137 | ] |
| 1138 | ); |
| 1139 | assert_eq!( |
| 1140 | build(r"[a-zā]").states, |
| 1141 | &[ |
| 1142 | s_byte(0x83, 3), |
| 1143 | s_byte(0x98, 0), |
| 1144 | s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]), |
| 1145 | s_match(), |
| 1146 | ] |
| 1147 | ); |
| 1148 | } |
| 1149 | |
| 1150 | #[test] |
| 1151 | fn compile_repetition() { |
| 1152 | assert_eq!( |
| 1153 | build(r"a?").states, |
| 1154 | &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),] |
| 1155 | ); |
| 1156 | assert_eq!( |
| 1157 | build(r"a??").states, |
| 1158 | &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),] |
| 1159 | ); |
| 1160 | } |
| 1161 | |
| 1162 | #[test] |
| 1163 | fn compile_group() { |
| 1164 | assert_eq!( |
| 1165 | build(r"ab+").states, |
| 1166 | &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),] |
| 1167 | ); |
| 1168 | assert_eq!( |
| 1169 | build(r"(ab)").states, |
| 1170 | &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] |
| 1171 | ); |
| 1172 | assert_eq!( |
| 1173 | build(r"(ab)+").states, |
| 1174 | &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),] |
| 1175 | ); |
| 1176 | } |
| 1177 | |
| 1178 | #[test] |
| 1179 | fn compile_alternation() { |
| 1180 | assert_eq!( |
| 1181 | build(r"a|b").states, |
| 1182 | &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),] |
| 1183 | ); |
| 1184 | assert_eq!( |
| 1185 | build(r"|b").states, |
| 1186 | &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),] |
| 1187 | ); |
| 1188 | assert_eq!( |
| 1189 | build(r"a|").states, |
| 1190 | &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),] |
| 1191 | ); |
| 1192 | } |
| 1193 | } |