Blame - src/compile.rs - platform/external/rust/crates/regex

blob: cdc583c5d74c0078ac46e27ea919b859972857e4 [file] [log] [blame]

Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1	use std::collections::HashMap;
				2	use std::iter;
				3	use std::result;
				4	use std::sync::Arc;
				5
				6	use syntax::hir::{self, Hir};
				7	use syntax::is_word_byte;
				8	use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
				9
				10	use prog::{
				11	EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
				12	InstSave, InstSplit, Program,
				13	};
				14
				15	use Error;
				16
				17	type Result = result::Result<Patch, Error>;
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	18	type ResultOrEmpty = result::Result<Option<Patch>, Error>;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	19
				20	#[derive(Debug)]
				21	struct Patch {
				22	hole: Hole,
				23	entry: InstPtr,
				24	}
				25
				26	/// A compiler translates a regular expression AST to a sequence of
				27	/// instructions. The sequence of instructions represents an NFA.
				28	pub struct Compiler {
				29	insts: Vec<MaybeInst>,
				30	compiled: Program,
				31	capture_name_idx: HashMap<String, usize>,
				32	num_exprs: usize,
				33	size_limit: usize,
				34	suffix_cache: SuffixCache,
				35	utf8_seqs: Option<Utf8Sequences>,
				36	byte_classes: ByteClassSet,
				37	}
				38
				39	impl Compiler {
				40	/// Create a new regular expression compiler.
				41	///
				42	/// Various options can be set before calling `compile` on an expression.
				43	pub fn new() -> Self {
				44	Compiler {
				45	insts: vec![],
				46	compiled: Program::new(),
				47	capture_name_idx: HashMap::new(),
				48	num_exprs: 0,
				49	size_limit: 10 * (1 << 20),
				50	suffix_cache: SuffixCache::new(1000),
				51	utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
				52	byte_classes: ByteClassSet::new(),
				53	}
				54	}
				55
				56	/// The size of the resulting program is limited by size_limit. If
				57	/// the program approximately exceeds the given size (in bytes), then
				58	/// compilation will stop and return an error.
				59	pub fn size_limit(mut self, size_limit: usize) -> Self {
				60	self.size_limit = size_limit;
				61	self
				62	}
				63
				64	/// If bytes is true, then the program is compiled as a byte based
				65	/// automaton, which incorporates UTF-8 decoding into the machine. If it's
				66	/// false, then the automaton is Unicode scalar value based, e.g., an
				67	/// engine utilizing such an automaton is responsible for UTF-8 decoding.
				68	///
				69	/// The specific invariant is that when returning a byte based machine,
				70	/// the neither the `Char` nor `Ranges` instructions are produced.
				71	/// Conversely, when producing a Unicode scalar value machine, the `Bytes`
				72	/// instruction is never produced.
				73	///
				74	/// Note that `dfa(true)` implies `bytes(true)`.
				75	pub fn bytes(mut self, yes: bool) -> Self {
				76	self.compiled.is_bytes = yes;
				77	self
				78	}
				79
				80	/// When disabled, the program compiled may match arbitrary bytes.
				81	///
				82	/// When enabled (the default), all compiled programs exclusively match
				83	/// valid UTF-8 bytes.
				84	pub fn only_utf8(mut self, yes: bool) -> Self {
				85	self.compiled.only_utf8 = yes;
				86	self
				87	}
				88
				89	/// When set, the machine returned is suitable for use in the DFA matching
				90	/// engine.
				91	///
				92	/// In particular, this ensures that if the regex is not anchored in the
				93	/// beginning, then a preceding `.*?` is included in the program. (The NFA
				94	/// based engines handle the preceding `.*?` explicitly, which is difficult
				95	/// or impossible in the DFA engine.)
				96	pub fn dfa(mut self, yes: bool) -> Self {
				97	self.compiled.is_dfa = yes;
				98	self
				99	}
				100
				101	/// When set, the machine returned is suitable for matching text in
				102	/// reverse. In particular, all concatenations are flipped.
				103	pub fn reverse(mut self, yes: bool) -> Self {
				104	self.compiled.is_reverse = yes;
				105	self
				106	}
				107
				108	/// Compile a regular expression given its AST.
				109	///
				110	/// The compiler is guaranteed to succeed unless the program exceeds the
				111	/// specified size limit. If the size limit is exceeded, then compilation
				112	/// stops and returns an error.
				113	pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
				114	debug_assert!(!exprs.is_empty());
				115	self.num_exprs = exprs.len();
				116	if exprs.len() == 1 {
				117	self.compile_one(&exprs[0])
				118	} else {
				119	self.compile_many(exprs)
				120	}
				121	}
				122
				123	fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
				124	// If we're compiling a forward DFA and we aren't anchored, then
				125	// add a `.*?` before the first capture group.
				126	// Other matching engines handle this by baking the logic into the
				127	// matching engine itself.
				128	let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
				129	self.compiled.is_anchored_start = expr.is_anchored_start();
				130	self.compiled.is_anchored_end = expr.is_anchored_end();
				131	if self.compiled.needs_dotstar() {
				132	dotstar_patch = self.c_dotstar()?;
				133	self.compiled.start = dotstar_patch.entry;
				134	}
				135	self.compiled.captures = vec![None];
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	136	let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	137	if self.compiled.needs_dotstar() {
				138	self.fill(dotstar_patch.hole, patch.entry);
				139	} else {
				140	self.compiled.start = patch.entry;
				141	}
				142	self.fill_to_next(patch.hole);
				143	self.compiled.matches = vec![self.insts.len()];
				144	self.push_compiled(Inst::Match(0));
				145	self.compile_finish()
				146	}
				147
				148	fn compile_many(
				149	mut self,
				150	exprs: &[Hir],
				151	) -> result::Result<Program, Error> {
				152	debug_assert!(exprs.len() > 1);
				153
				154	self.compiled.is_anchored_start =
				155	exprs.iter().all(\|e\| e.is_anchored_start());
				156	self.compiled.is_anchored_end =
				157	exprs.iter().all(\|e\| e.is_anchored_end());
				158	let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
				159	if self.compiled.needs_dotstar() {
				160	dotstar_patch = self.c_dotstar()?;
				161	self.compiled.start = dotstar_patch.entry;
				162	} else {
				163	self.compiled.start = 0; // first instruction is always split
				164	}
				165	self.fill_to_next(dotstar_patch.hole);
				166
				167	let mut prev_hole = Hole::None;
				168	for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
				169	self.fill_to_next(prev_hole);
				170	let split = self.push_split_hole();
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	171	let Patch { hole, entry } =
				172	self.c_capture(0, expr)?.unwrap_or(self.next_inst());
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	173	self.fill_to_next(hole);
				174	self.compiled.matches.push(self.insts.len());
				175	self.push_compiled(Inst::Match(i));
				176	prev_hole = self.fill_split(split, Some(entry), None);
				177	}
				178	let i = exprs.len() - 1;
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	179	let Patch { hole, entry } =
				180	self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	181	self.fill(prev_hole, entry);
				182	self.fill_to_next(hole);
				183	self.compiled.matches.push(self.insts.len());
				184	self.push_compiled(Inst::Match(i));
				185	self.compile_finish()
				186	}
				187
				188	fn compile_finish(mut self) -> result::Result<Program, Error> {
				189	self.compiled.insts =
				190	self.insts.into_iter().map(\|inst\| inst.unwrap()).collect();
				191	self.compiled.byte_classes = self.byte_classes.byte_classes();
				192	self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
				193	Ok(self.compiled)
				194	}
				195
				196	/// Compile expr into self.insts, returning a patch on success,
				197	/// or an error if we run out of memory.
				198	///
				199	/// All of the c_* methods of the compiler share the contract outlined
				200	/// here.
				201	///
				202	/// The main thing that a c_* method does is mutate `self.insts`
				203	/// to add a list of mostly compiled instructions required to execute
				204	/// the given expression. `self.insts` contains MaybeInsts rather than
				205	/// Insts because there is some backpatching required.
				206	///
				207	/// The `Patch` value returned by each c_* method provides metadata
				208	/// about the compiled instructions emitted to `self.insts`. The
				209	/// `entry` member of the patch refers to the first instruction
				210	/// (the entry point), while the `hole` member contains zero or
				211	/// more offsets to partial instructions that need to be backpatched.
				212	/// The c_* routine can't know where its list of instructions are going to
				213	/// jump to after execution, so it is up to the caller to patch
				214	/// these jumps to point to the right place. So compiling some
				215	/// expression, e, we would end up with a situation that looked like:
				216	///
				217	/// ```text
				218	/// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
				219	/// ^ ^ ^
				220	/// \| \ /
				221	/// entry \ /
				222	/// hole
				223	/// ```
				224	///
Chih-Hung Hsieh	849e445	2020-10-26 13:16:47 -0700	[diff] [blame]	225	/// To compile two expressions, e1 and e2, concatenated together we
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	226	/// would do:
				227	///
				228	/// ```ignore
				229	/// let patch1 = self.c(e1);
				230	/// let patch2 = self.c(e2);
				231	/// ```
				232	///
				233	/// while leaves us with a situation that looks like
				234	///
				235	/// ```text
				236	/// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
				237	/// ^ ^ ^ ^
				238	/// \| \| \| \|
				239	/// entry1 hole1 entry2 hole2
				240	/// ```
				241	///
				242	/// Then to merge the two patches together into one we would backpatch
				243	/// hole1 with entry2 and return a new patch that enters at entry1
				244	/// and has hole2 for a hole. In fact, if you look at the c_concat
				245	/// method you will see that it does exactly this, though it handles
				246	/// a list of expressions rather than just the two that we use for
				247	/// an example.
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	248	///
				249	/// Ok(None) is returned when an expression is compiled to no
				250	/// instruction, and so no patch.entry value makes sense.
				251	fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	252	use prog;
				253	use syntax::hir::HirKind::*;
				254
				255	self.check_size()?;
				256	match *expr.kind() {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	257	Empty => Ok(None),
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	258	Literal(hir::Literal::Unicode(c)) => self.c_char(c),
				259	Literal(hir::Literal::Byte(b)) => {
				260	assert!(self.compiled.uses_bytes());
				261	self.c_byte(b)
				262	}
				263	Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
				264	Class(hir::Class::Bytes(ref cls)) => {
				265	if self.compiled.uses_bytes() {
				266	self.c_class_bytes(cls.ranges())
				267	} else {
				268	assert!(cls.is_all_ascii());
				269	let mut char_ranges = vec![];
				270	for r in cls.iter() {
				271	let (s, e) = (r.start() as char, r.end() as char);
				272	char_ranges.push(hir::ClassUnicodeRange::new(s, e));
				273	}
				274	self.c_class(&char_ranges)
				275	}
				276	}
				277	Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
				278	self.byte_classes.set_range(b'\n', b'\n');
				279	self.c_empty_look(prog::EmptyLook::EndLine)
				280	}
				281	Anchor(hir::Anchor::StartLine) => {
				282	self.byte_classes.set_range(b'\n', b'\n');
				283	self.c_empty_look(prog::EmptyLook::StartLine)
				284	}
				285	Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
				286	self.byte_classes.set_range(b'\n', b'\n');
				287	self.c_empty_look(prog::EmptyLook::StartLine)
				288	}
				289	Anchor(hir::Anchor::EndLine) => {
				290	self.byte_classes.set_range(b'\n', b'\n');
				291	self.c_empty_look(prog::EmptyLook::EndLine)
				292	}
				293	Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
				294	self.c_empty_look(prog::EmptyLook::EndText)
				295	}
				296	Anchor(hir::Anchor::StartText) => {
				297	self.c_empty_look(prog::EmptyLook::StartText)
				298	}
				299	Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
				300	self.c_empty_look(prog::EmptyLook::StartText)
				301	}
				302	Anchor(hir::Anchor::EndText) => {
				303	self.c_empty_look(prog::EmptyLook::EndText)
				304	}
				305	WordBoundary(hir::WordBoundary::Unicode) => {
				306	if !cfg!(feature = "unicode-perl") {
				307	return Err(Error::Syntax(
				308	"Unicode word boundaries are unavailable when \
				309	the unicode-perl feature is disabled"
				310	.to_string(),
				311	));
				312	}
				313	self.compiled.has_unicode_word_boundary = true;
				314	self.byte_classes.set_word_boundary();
				315	self.c_empty_look(prog::EmptyLook::WordBoundary)
				316	}
				317	WordBoundary(hir::WordBoundary::UnicodeNegate) => {
				318	if !cfg!(feature = "unicode-perl") {
				319	return Err(Error::Syntax(
				320	"Unicode word boundaries are unavailable when \
				321	the unicode-perl feature is disabled"
				322	.to_string(),
				323	));
				324	}
				325	self.compiled.has_unicode_word_boundary = true;
				326	self.byte_classes.set_word_boundary();
				327	self.c_empty_look(prog::EmptyLook::NotWordBoundary)
				328	}
				329	WordBoundary(hir::WordBoundary::Ascii) => {
				330	self.byte_classes.set_word_boundary();
				331	self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
				332	}
				333	WordBoundary(hir::WordBoundary::AsciiNegate) => {
				334	self.byte_classes.set_word_boundary();
				335	self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
				336	}
				337	Group(ref g) => match g.kind {
				338	hir::GroupKind::NonCapturing => self.c(&g.hir),
				339	hir::GroupKind::CaptureIndex(index) => {
				340	if index as usize >= self.compiled.captures.len() {
				341	self.compiled.captures.push(None);
				342	}
				343	self.c_capture(2 * index as usize, &g.hir)
				344	}
				345	hir::GroupKind::CaptureName { index, ref name } => {
				346	if index as usize >= self.compiled.captures.len() {
				347	let n = name.to_string();
				348	self.compiled.captures.push(Some(n.clone()));
				349	self.capture_name_idx.insert(n, index as usize);
				350	}
				351	self.c_capture(2 * index as usize, &g.hir)
				352	}
				353	},
				354	Concat(ref es) => {
				355	if self.compiled.is_reverse {
				356	self.c_concat(es.iter().rev())
				357	} else {
				358	self.c_concat(es)
				359	}
				360	}
				361	Alternation(ref es) => self.c_alternate(&**es),
				362	Repetition(ref rep) => self.c_repeat(rep),
				363	}
				364	}
				365
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	366	fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	367	if self.num_exprs > 1 \|\| self.compiled.is_dfa {
				368	// Don't ever compile Save instructions for regex sets because
				369	// they are never used. They are also never used in DFA programs
				370	// because DFAs can't handle captures.
				371	self.c(expr)
				372	} else {
				373	let entry = self.insts.len();
				374	let hole = self.push_hole(InstHole::Save { slot: first_slot });
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	375	let patch = self.c(expr)?.unwrap_or(self.next_inst());
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	376	self.fill(hole, patch.entry);
				377	self.fill_to_next(patch.hole);
				378	let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	379	Ok(Some(Patch { hole: hole, entry: entry }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	380	}
				381	}
				382
				383	fn c_dotstar(&mut self) -> Result {
				384	Ok(if !self.compiled.only_utf8() {
				385	self.c(&Hir::repetition(hir::Repetition {
				386	kind: hir::RepetitionKind::ZeroOrMore,
				387	greedy: false,
				388	hir: Box::new(Hir::any(true)),
				389	}))?
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	390	.unwrap()
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	391	} else {
				392	self.c(&Hir::repetition(hir::Repetition {
				393	kind: hir::RepetitionKind::ZeroOrMore,
				394	greedy: false,
				395	hir: Box::new(Hir::any(false)),
				396	}))?
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	397	.unwrap()
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	398	})
				399	}
				400
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	401	fn c_char(&mut self, c: char) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	402	if self.compiled.uses_bytes() {
				403	if c.is_ascii() {
				404	let b = c as u8;
				405	let hole =
				406	self.push_hole(InstHole::Bytes { start: b, end: b });
				407	self.byte_classes.set_range(b, b);
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	408	Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	409	} else {
				410	self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
				411	}
				412	} else {
				413	let hole = self.push_hole(InstHole::Char { c: c });
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	414	Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	415	}
				416	}
				417
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	418	fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	419	assert!(!ranges.is_empty());
				420	if self.compiled.uses_bytes() {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	421	Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	422	} else {
				423	let ranges: Vec<(char, char)> =
				424	ranges.iter().map(\|r\| (r.start(), r.end())).collect();
				425	let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
				426	self.push_hole(InstHole::Char { c: ranges[0].0 })
				427	} else {
				428	self.push_hole(InstHole::Ranges { ranges: ranges })
				429	};
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	430	Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	431	}
				432	}
				433
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	434	fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	435	self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
				436	}
				437
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	438	fn c_class_bytes(
				439	&mut self,
				440	ranges: &[hir::ClassBytesRange],
				441	) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	442	debug_assert!(!ranges.is_empty());
				443
				444	let first_split_entry = self.insts.len();
				445	let mut holes = vec![];
				446	let mut prev_hole = Hole::None;
				447	for r in &ranges[0..ranges.len() - 1] {
				448	self.fill_to_next(prev_hole);
				449	let split = self.push_split_hole();
				450	let next = self.insts.len();
				451	self.byte_classes.set_range(r.start(), r.end());
				452	holes.push(self.push_hole(InstHole::Bytes {
				453	start: r.start(),
				454	end: r.end(),
				455	}));
				456	prev_hole = self.fill_split(split, Some(next), None);
				457	}
				458	let next = self.insts.len();
				459	let r = &ranges[ranges.len() - 1];
				460	self.byte_classes.set_range(r.start(), r.end());
				461	holes.push(
				462	self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
				463	);
				464	self.fill(prev_hole, next);
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	465	Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	466	}
				467
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	468	fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	469	let hole = self.push_hole(InstHole::EmptyLook { look: look });
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	470	Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	471	}
				472
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	473	fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	474	where
				475	I: IntoIterator<Item = &'a Hir>,
				476	{
				477	let mut exprs = exprs.into_iter();
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	478	let Patch { mut hole, entry } = loop {
				479	match exprs.next() {
				480	None => return Ok(None),
				481	Some(e) => {
				482	if let Some(p) = self.c(e)? {
				483	break p;
				484	}
				485	}
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	486	}
				487	};
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	488	for e in exprs {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	489	if let Some(p) = self.c(e)? {
				490	self.fill(hole, p.entry);
				491	hole = p.hole;
				492	}
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	493	}
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	494	Ok(Some(Patch { hole: hole, entry: entry }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	495	}
				496
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	497	fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	498	debug_assert!(
				499	exprs.len() >= 2,
				500	"alternates must have at least 2 exprs"
				501	);
				502
				503	// Initial entry point is always the first split.
				504	let first_split_entry = self.insts.len();
				505
				506	// Save up all of the holes from each alternate. They will all get
				507	// patched to point to the same location.
				508	let mut holes = vec![];
				509
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	510	// true indicates that the hole is a split where we want to fill
				511	// the second branch.
				512	let mut prev_hole = (Hole::None, false);
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	513	for e in &exprs[0..exprs.len() - 1] {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	514	if prev_hole.1 {
				515	let next = self.insts.len();
				516	self.fill_split(prev_hole.0, None, Some(next));
				517	} else {
				518	self.fill_to_next(prev_hole.0);
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	519	}
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	520	let split = self.push_split_hole();
				521	if let Some(Patch { hole, entry }) = self.c(e)? {
				522	holes.push(hole);
				523	prev_hole = (self.fill_split(split, Some(entry), None), false);
				524	} else {
				525	let (split1, split2) = split.dup_one();
				526	holes.push(split1);
				527	prev_hole = (split2, true);
				528	}
				529	}
				530	if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	531	holes.push(hole);
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	532	if prev_hole.1 {
				533	self.fill_split(prev_hole.0, None, Some(entry));
				534	} else {
				535	self.fill(prev_hole.0, entry);
				536	}
				537	} else {
				538	// We ignore prev_hole.1. When it's true, it means we have two
				539	// empty branches both pushing prev_hole.0 into holes, so both
				540	// branches will go to the same place anyway.
				541	holes.push(prev_hole.0);
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	542	}
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	543	Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	544	}
				545
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	546	fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	547	use syntax::hir::RepetitionKind::*;
				548	match rep.kind {
				549	ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
				550	ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
				551	OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
				552	Range(hir::RepetitionRange::Exactly(min_max)) => {
				553	self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
				554	}
				555	Range(hir::RepetitionRange::AtLeast(min)) => {
				556	self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
				557	}
				558	Range(hir::RepetitionRange::Bounded(min, max)) => {
				559	self.c_repeat_range(&rep.hir, rep.greedy, min, max)
				560	}
				561	}
				562	}
				563
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	564	fn c_repeat_zero_or_one(
				565	&mut self,
				566	expr: &Hir,
				567	greedy: bool,
				568	) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	569	let split_entry = self.insts.len();
				570	let split = self.push_split_hole();
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	571	let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
				572	Some(p) => p,
				573	None => return self.pop_split_hole(),
				574	};
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	575	let split_hole = if greedy {
				576	self.fill_split(split, Some(entry_rep), None)
				577	} else {
				578	self.fill_split(split, None, Some(entry_rep))
				579	};
				580	let holes = vec![hole_rep, split_hole];
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	581	Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	582	}
				583
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	584	fn c_repeat_zero_or_more(
				585	&mut self,
				586	expr: &Hir,
				587	greedy: bool,
				588	) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	589	let split_entry = self.insts.len();
				590	let split = self.push_split_hole();
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	591	let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
				592	Some(p) => p,
				593	None => return self.pop_split_hole(),
				594	};
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	595
				596	self.fill(hole_rep, split_entry);
				597	let split_hole = if greedy {
				598	self.fill_split(split, Some(entry_rep), None)
				599	} else {
				600	self.fill_split(split, None, Some(entry_rep))
				601	};
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	602	Ok(Some(Patch { hole: split_hole, entry: split_entry }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	603	}
				604
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	605	fn c_repeat_one_or_more(
				606	&mut self,
				607	expr: &Hir,
				608	greedy: bool,
				609	) -> ResultOrEmpty {
				610	let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
				611	Some(p) => p,
				612	None => return Ok(None),
				613	};
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	614	self.fill_to_next(hole_rep);
				615	let split = self.push_split_hole();
				616
				617	let split_hole = if greedy {
				618	self.fill_split(split, Some(entry_rep), None)
				619	} else {
				620	self.fill_split(split, None, Some(entry_rep))
				621	};
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	622	Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	623	}
				624
				625	fn c_repeat_range_min_or_more(
				626	&mut self,
				627	expr: &Hir,
				628	greedy: bool,
				629	min: u32,
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	630	) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	631	let min = u32_to_usize(min);
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	632	// Using next_inst() is ok, because we can't return it (concat would
				633	// have to return Some(_) while c_repeat_range_min_or_more returns
				634	// None).
				635	let patch_concat = self
				636	.c_concat(iter::repeat(expr).take(min))?
				637	.unwrap_or(self.next_inst());
				638	if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
				639	self.fill(patch_concat.hole, patch_rep.entry);
				640	Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
				641	} else {
				642	Ok(None)
				643	}
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	644	}
				645
				646	fn c_repeat_range(
				647	&mut self,
				648	expr: &Hir,
				649	greedy: bool,
				650	min: u32,
				651	max: u32,
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	652	) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	653	let (min, max) = (u32_to_usize(min), u32_to_usize(max));
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	654	debug_assert!(min <= max);
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	655	let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	656	if min == max {
				657	return Ok(patch_concat);
				658	}
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	659	// Same reasoning as in c_repeat_range_min_or_more (we know that min <
				660	// max at this point).
				661	let patch_concat = patch_concat.unwrap_or(self.next_inst());
				662	let initial_entry = patch_concat.entry;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	663	// It is much simpler to compile, e.g., `a{2,5}` as:
				664	//
				665	// aaa?a?a?
				666	//
				667	// But you end up with a sequence of instructions like this:
				668	//
				669	// 0: 'a'
				670	// 1: 'a',
				671	// 2: split(3, 4)
				672	// 3: 'a'
				673	// 4: split(5, 6)
				674	// 5: 'a'
				675	// 6: split(7, 8)
				676	// 7: 'a'
				677	// 8: MATCH
				678	//
				679	// This is incredibly inefficient because the splits end
				680	// up forming a chain, which has to be resolved everything a
				681	// transition is followed.
				682	let mut holes = vec![];
				683	let mut prev_hole = patch_concat.hole;
				684	for _ in min..max {
				685	self.fill_to_next(prev_hole);
				686	let split = self.push_split_hole();
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	687	let Patch { hole, entry } = match self.c(expr)? {
				688	Some(p) => p,
				689	None => return self.pop_split_hole(),
				690	};
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	691	prev_hole = hole;
				692	if greedy {
				693	holes.push(self.fill_split(split, Some(entry), None));
				694	} else {
				695	holes.push(self.fill_split(split, None, Some(entry)));
				696	}
				697	}
				698	holes.push(prev_hole);
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	699	Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
				700	}
				701
				702	/// Can be used as a default value for the c_* functions when the call to
				703	/// c_function is followed by inserting at least one instruction that is
				704	/// always executed after the ones written by the c* function.
				705	fn next_inst(&self) -> Patch {
				706	Patch { hole: Hole::None, entry: self.insts.len() }
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	707	}
				708
				709	fn fill(&mut self, hole: Hole, goto: InstPtr) {
				710	match hole {
				711	Hole::None => {}
				712	Hole::One(pc) => {
				713	self.insts[pc].fill(goto);
				714	}
				715	Hole::Many(holes) => {
				716	for hole in holes {
				717	self.fill(hole, goto);
				718	}
				719	}
				720	}
				721	}
				722
				723	fn fill_to_next(&mut self, hole: Hole) {
				724	let next = self.insts.len();
				725	self.fill(hole, next);
				726	}
				727
				728	fn fill_split(
				729	&mut self,
				730	hole: Hole,
				731	goto1: Option<InstPtr>,
				732	goto2: Option<InstPtr>,
				733	) -> Hole {
				734	match hole {
				735	Hole::None => Hole::None,
				736	Hole::One(pc) => match (goto1, goto2) {
				737	(Some(goto1), Some(goto2)) => {
				738	self.insts[pc].fill_split(goto1, goto2);
				739	Hole::None
				740	}
				741	(Some(goto1), None) => {
				742	self.insts[pc].half_fill_split_goto1(goto1);
				743	Hole::One(pc)
				744	}
				745	(None, Some(goto2)) => {
				746	self.insts[pc].half_fill_split_goto2(goto2);
				747	Hole::One(pc)
				748	}
				749	(None, None) => unreachable!(
				750	"at least one of the split \
				751	holes must be filled"
				752	),
				753	},
				754	Hole::Many(holes) => {
				755	let mut new_holes = vec![];
				756	for hole in holes {
				757	new_holes.push(self.fill_split(hole, goto1, goto2));
				758	}
				759	if new_holes.is_empty() {
				760	Hole::None
				761	} else if new_holes.len() == 1 {
				762	new_holes.pop().unwrap()
				763	} else {
				764	Hole::Many(new_holes)
				765	}
				766	}
				767	}
				768	}
				769
				770	fn push_compiled(&mut self, inst: Inst) {
				771	self.insts.push(MaybeInst::Compiled(inst));
				772	}
				773
				774	fn push_hole(&mut self, inst: InstHole) -> Hole {
				775	let hole = self.insts.len();
				776	self.insts.push(MaybeInst::Uncompiled(inst));
				777	Hole::One(hole)
				778	}
				779
				780	fn push_split_hole(&mut self) -> Hole {
				781	let hole = self.insts.len();
				782	self.insts.push(MaybeInst::Split);
				783	Hole::One(hole)
				784	}
				785
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	786	fn pop_split_hole(&mut self) -> ResultOrEmpty {
				787	self.insts.pop();
				788	Ok(None)
				789	}
				790
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	791	fn check_size(&self) -> result::Result<(), Error> {
				792	use std::mem::size_of;
				793
				794	if self.insts.len() * size_of::<Inst>() > self.size_limit {
				795	Err(Error::CompiledTooBig(self.size_limit))
				796	} else {
				797	Ok(())
				798	}
				799	}
				800	}
				801
				802	#[derive(Debug)]
				803	enum Hole {
				804	None,
				805	One(InstPtr),
				806	Many(Vec<Hole>),
				807	}
				808
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	809	impl Hole {
				810	fn dup_one(self) -> (Self, Self) {
				811	match self {
				812	Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
				813	Hole::None \| Hole::Many(_) => {
				814	unreachable!("must be called on single hole")
				815	}
				816	}
				817	}
				818	}
				819
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	820	#[derive(Clone, Debug)]
				821	enum MaybeInst {
				822	Compiled(Inst),
				823	Uncompiled(InstHole),
				824	Split,
				825	Split1(InstPtr),
				826	Split2(InstPtr),
				827	}
				828
				829	impl MaybeInst {
				830	fn fill(&mut self, goto: InstPtr) {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	831	let maybeinst = match *self {
				832	MaybeInst::Split => MaybeInst::Split1(goto),
				833	MaybeInst::Uncompiled(ref inst) => {
				834	MaybeInst::Compiled(inst.fill(goto))
				835	}
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	836	MaybeInst::Split1(goto1) => {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	837	MaybeInst::Compiled(Inst::Split(InstSplit {
				838	goto1: goto1,
				839	goto2: goto,
				840	}))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	841	}
				842	MaybeInst::Split2(goto2) => {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	843	MaybeInst::Compiled(Inst::Split(InstSplit {
				844	goto1: goto,
				845	goto2: goto2,
				846	}))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	847	}
				848	_ => unreachable!(
				849	"not all instructions were compiled! \
				850	found uncompiled instruction: {:?}",
				851	self
				852	),
				853	};
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	854	*self = maybeinst;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	855	}
				856
				857	fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
				858	let filled = match *self {
				859	MaybeInst::Split => {
				860	Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
				861	}
				862	_ => unreachable!(
				863	"must be called on Split instruction, \
				864	instead it was called on: {:?}",
				865	self
				866	),
				867	};
				868	*self = MaybeInst::Compiled(filled);
				869	}
				870
				871	fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
				872	let half_filled = match *self {
				873	MaybeInst::Split => goto1,
				874	_ => unreachable!(
				875	"must be called on Split instruction, \
				876	instead it was called on: {:?}",
				877	self
				878	),
				879	};
				880	*self = MaybeInst::Split1(half_filled);
				881	}
				882
				883	fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
				884	let half_filled = match *self {
				885	MaybeInst::Split => goto2,
				886	_ => unreachable!(
				887	"must be called on Split instruction, \
				888	instead it was called on: {:?}",
				889	self
				890	),
				891	};
				892	*self = MaybeInst::Split2(half_filled);
				893	}
				894
				895	fn unwrap(self) -> Inst {
				896	match self {
				897	MaybeInst::Compiled(inst) => inst,
				898	_ => unreachable!(
				899	"must be called on a compiled instruction, \
				900	instead it was called on: {:?}",
				901	self
				902	),
				903	}
				904	}
				905	}
				906
				907	#[derive(Clone, Debug)]
				908	enum InstHole {
				909	Save { slot: usize },
				910	EmptyLook { look: EmptyLook },
				911	Char { c: char },
				912	Ranges { ranges: Vec<(char, char)> },
				913	Bytes { start: u8, end: u8 },
				914	}
				915
				916	impl InstHole {
				917	fn fill(&self, goto: InstPtr) -> Inst {
				918	match *self {
				919	InstHole::Save { slot } => {
				920	Inst::Save(InstSave { goto: goto, slot: slot })
				921	}
				922	InstHole::EmptyLook { look } => {
				923	Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
				924	}
				925	InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
				926	InstHole::Ranges { ref ranges } => {
				927	Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() })
				928	}
				929	InstHole::Bytes { start, end } => {
				930	Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
				931	}
				932	}
				933	}
				934	}
				935
				936	struct CompileClass<'a, 'b> {
				937	c: &'a mut Compiler,
				938	ranges: &'b [hir::ClassUnicodeRange],
				939	}
				940
				941	impl<'a, 'b> CompileClass<'a, 'b> {
				942	fn compile(mut self) -> Result {
				943	let mut holes = vec![];
				944	let mut initial_entry = None;
				945	let mut last_split = Hole::None;
				946	let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
				947	self.c.suffix_cache.clear();
				948
				949	for (i, range) in self.ranges.iter().enumerate() {
				950	let is_last_range = i + 1 == self.ranges.len();
				951	utf8_seqs.reset(range.start(), range.end());
				952	let mut it = (&mut utf8_seqs).peekable();
				953	loop {
				954	let utf8_seq = match it.next() {
				955	None => break,
				956	Some(utf8_seq) => utf8_seq,
				957	};
				958	if is_last_range && it.peek().is_none() {
				959	let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
				960	holes.push(hole);
				961	self.c.fill(last_split, entry);
				962	last_split = Hole::None;
				963	if initial_entry.is_none() {
				964	initial_entry = Some(entry);
				965	}
				966	} else {
				967	if initial_entry.is_none() {
				968	initial_entry = Some(self.c.insts.len());
				969	}
				970	self.c.fill_to_next(last_split);
				971	last_split = self.c.push_split_hole();
				972	let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
				973	holes.push(hole);
				974	last_split =
				975	self.c.fill_split(last_split, Some(entry), None);
				976	}
				977	}
				978	}
				979	self.c.utf8_seqs = Some(utf8_seqs);
				980	Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
				981	}
				982
				983	fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
				984	if self.c.compiled.is_reverse {
				985	self.c_utf8_seq_(seq)
				986	} else {
				987	self.c_utf8_seq_(seq.into_iter().rev())
				988	}
				989	}
				990
				991	fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
				992	where
				993	I: IntoIterator<Item = &'r Utf8Range>,
				994	{
				995	// The initial instruction for each UTF-8 sequence should be the same.
				996	let mut from_inst = ::std::usize::MAX;
				997	let mut last_hole = Hole::None;
				998	for byte_range in seq {
				999	let key = SuffixCacheKey {
				1000	from_inst: from_inst,
				1001	start: byte_range.start,
				1002	end: byte_range.end,
				1003	};
				1004	{
				1005	let pc = self.c.insts.len();
				1006	if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
				1007	from_inst = cached_pc;
				1008	continue;
				1009	}
				1010	}
				1011	self.c.byte_classes.set_range(byte_range.start, byte_range.end);
				1012	if from_inst == ::std::usize::MAX {
				1013	last_hole = self.c.push_hole(InstHole::Bytes {
				1014	start: byte_range.start,
				1015	end: byte_range.end,
				1016	});
				1017	} else {
				1018	self.c.push_compiled(Inst::Bytes(InstBytes {
				1019	goto: from_inst,
				1020	start: byte_range.start,
				1021	end: byte_range.end,
				1022	}));
				1023	}
				1024	from_inst = self.c.insts.len().checked_sub(1).unwrap();
				1025	debug_assert!(from_inst < ::std::usize::MAX);
				1026	}
				1027	debug_assert!(from_inst < ::std::usize::MAX);
				1028	Ok(Patch { hole: last_hole, entry: from_inst })
				1029	}
				1030	}
				1031
				1032	/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
				1033	/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
				1034	/// The set of byte ranges looks like this:
				1035	///
				1036	/// [0-7F]
				1037	/// [C2-DF][80-BF]
				1038	/// [E0][A0-BF][80-BF]
				1039	/// [E1-EC][80-BF][80-BF]
				1040	/// [ED][80-9F][80-BF]
				1041	/// [EE-EF][80-BF][80-BF]
				1042	///
				1043	/// Each line above translates to one alternate in the compiled regex program.
				1044	/// However, all but one of the alternates end in the same suffix, which is
				1045	/// a waste of an instruction. The suffix cache facilitates reusing them across
				1046	/// alternates.
				1047	///
				1048	/// Note that a HashMap could be trivially used for this, but we don't need its
				1049	/// overhead. Some small bounded space (LRU style) is more than enough.
				1050	///
				1051	/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
				1052	/// except it uses hashes as original indices and then compares full keys for
				1053	/// validation against `dense` array.
				1054	struct SuffixCache {
				1055	sparse: Box<[usize]>,
				1056	dense: Vec<SuffixCacheEntry>,
				1057	}
				1058
				1059	#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
				1060	struct SuffixCacheEntry {
				1061	key: SuffixCacheKey,
				1062	pc: InstPtr,
				1063	}
				1064
				1065	#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
				1066	struct SuffixCacheKey {
				1067	from_inst: InstPtr,
				1068	start: u8,
				1069	end: u8,
				1070	}
				1071
				1072	impl SuffixCache {
				1073	fn new(size: usize) -> Self {
				1074	SuffixCache {
				1075	sparse: vec![0usize; size].into(),
				1076	dense: Vec::with_capacity(size),
				1077	}
				1078	}
				1079
				1080	fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
				1081	let hash = self.hash(&key);
				1082	let pos = &mut self.sparse[hash];
				1083	if let Some(entry) = self.dense.get(*pos) {
				1084	if entry.key == key {
				1085	return Some(entry.pc);
				1086	}
				1087	}
				1088	*pos = self.dense.len();
				1089	self.dense.push(SuffixCacheEntry { key: key, pc: pc });
				1090	None
				1091	}
				1092
				1093	fn clear(&mut self) {
				1094	self.dense.clear();
				1095	}
				1096
				1097	fn hash(&self, suffix: &SuffixCacheKey) -> usize {
				1098	// Basic FNV-1a hash as described:
				1099	// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
				1100	const FNV_PRIME: u64 = 1099511628211;
				1101	let mut h = 14695981039346656037;
				1102	h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
				1103	h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
				1104	h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
				1105	(h as usize) % self.sparse.len()
				1106	}
				1107	}
				1108
				1109	struct ByteClassSet([bool; 256]);
				1110
				1111	impl ByteClassSet {
				1112	fn new() -> Self {
				1113	ByteClassSet([false; 256])
				1114	}
				1115
				1116	fn set_range(&mut self, start: u8, end: u8) {
				1117	debug_assert!(start <= end);
				1118	if start > 0 {
				1119	self.0[start as usize - 1] = true;
				1120	}
				1121	self.0[end as usize] = true;
				1122	}
				1123
				1124	fn set_word_boundary(&mut self) {
				1125	// We need to mark all ranges of bytes whose pairs result in
				1126	// evaluating \b differently.
				1127	let iswb = is_word_byte;
				1128	let mut b1: u16 = 0;
				1129	let mut b2: u16;
				1130	while b1 <= 255 {
				1131	b2 = b1 + 1;
				1132	while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
				1133	b2 += 1;
				1134	}
				1135	self.set_range(b1 as u8, (b2 - 1) as u8);
				1136	b1 = b2;
				1137	}
				1138	}
				1139
				1140	fn byte_classes(&self) -> Vec<u8> {
				1141	// N.B. If you're debugging the DFA, it's useful to simply return
				1142	// `(0..256).collect()`, which effectively removes the byte classes
				1143	// and makes the transitions easier to read.
				1144	// (0usize..256).map(\|x\| x as u8).collect()
				1145	let mut byte_classes = vec![0; 256];
				1146	let mut class = 0u8;
				1147	let mut i = 0;
				1148	loop {
				1149	byte_classes[i] = class as u8;
				1150	if i >= 255 {
				1151	break;
				1152	}
				1153	if self.0[i] {
				1154	class = class.checked_add(1).unwrap();
				1155	}
				1156	i += 1;
				1157	}
				1158	byte_classes
				1159	}
				1160	}
				1161
				1162	fn u32_to_usize(n: u32) -> usize {
				1163	// In case usize is less than 32 bits, we need to guard against overflow.
				1164	// On most platforms this compiles to nothing.
				1165	// TODO Use `std::convert::TryFrom` once it's stable.
				1166	if (n as u64) > (::std::usize::MAX as u64) {
				1167	panic!("BUG: {} is too big to be pointer sized", n)
				1168	}
				1169	n as usize
				1170	}
				1171
				1172	#[cfg(test)]
				1173	mod tests {
				1174	use super::ByteClassSet;
				1175
				1176	#[test]
				1177	fn byte_classes() {
				1178	let mut set = ByteClassSet::new();
				1179	set.set_range(b'a', b'z');
				1180	let classes = set.byte_classes();
				1181	assert_eq!(classes[0], 0);
				1182	assert_eq!(classes[1], 0);
				1183	assert_eq!(classes[2], 0);
				1184	assert_eq!(classes[b'a' as usize - 1], 0);
				1185	assert_eq!(classes[b'a' as usize], 1);
				1186	assert_eq!(classes[b'm' as usize], 1);
				1187	assert_eq!(classes[b'z' as usize], 1);
				1188	assert_eq!(classes[b'z' as usize + 1], 2);
				1189	assert_eq!(classes[254], 2);
				1190	assert_eq!(classes[255], 2);
				1191
				1192	let mut set = ByteClassSet::new();
				1193	set.set_range(0, 2);
				1194	set.set_range(4, 6);
				1195	let classes = set.byte_classes();
				1196	assert_eq!(classes[0], 0);
				1197	assert_eq!(classes[1], 0);
				1198	assert_eq!(classes[2], 0);
				1199	assert_eq!(classes[3], 1);
				1200	assert_eq!(classes[4], 2);
				1201	assert_eq!(classes[5], 2);
				1202	assert_eq!(classes[6], 2);
				1203	assert_eq!(classes[7], 3);
				1204	assert_eq!(classes[255], 3);
				1205	}
				1206
				1207	#[test]
				1208	fn full_byte_classes() {
				1209	let mut set = ByteClassSet::new();
				1210	for i in 0..256u16 {
				1211	set.set_range(i as u8, i as u8);
				1212	}
				1213	assert_eq!(set.byte_classes().len(), 256);
				1214	}
				1215	}