Blame - src/compile.rs - platform/external/rust/crates/regex

blob: 9a2ed5e92a0d6452875b99f46ed02b632cd72385 [file] [log] [blame]

Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1	use std::collections::HashMap;
Haibo Huang	47619dd	2021-01-08 17:05:43 -0800	[diff] [blame]	2	use std::fmt;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	3	use std::iter;
				4	use std::result;
				5	use std::sync::Arc;
				6
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	7	use regex_syntax::hir::{self, Hir};
				8	use regex_syntax::is_word_byte;
				9	use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	10
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	11	use crate::prog::{
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	12	EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
				13	InstSave, InstSplit, Program,
				14	};
				15
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	16	use crate::Error;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	17
				18	type Result = result::Result<Patch, Error>;
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	19	type ResultOrEmpty = result::Result<Option<Patch>, Error>;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	20
				21	#[derive(Debug)]
				22	struct Patch {
				23	hole: Hole,
				24	entry: InstPtr,
				25	}
				26
				27	/// A compiler translates a regular expression AST to a sequence of
				28	/// instructions. The sequence of instructions represents an NFA.
Haibo Huang	47619dd	2021-01-08 17:05:43 -0800	[diff] [blame]	29	// `Compiler` is only public via the `internal` module, so avoid deriving
				30	// `Debug`.
				31	#[allow(missing_debug_implementations)]
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	32	pub struct Compiler {
				33	insts: Vec<MaybeInst>,
				34	compiled: Program,
				35	capture_name_idx: HashMap<String, usize>,
				36	num_exprs: usize,
				37	size_limit: usize,
				38	suffix_cache: SuffixCache,
				39	utf8_seqs: Option<Utf8Sequences>,
				40	byte_classes: ByteClassSet,
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	41	extra_inst_bytes: usize,
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	42	}
				43
				44	impl Compiler {
				45	/// Create a new regular expression compiler.
				46	///
				47	/// Various options can be set before calling `compile` on an expression.
				48	pub fn new() -> Self {
				49	Compiler {
				50	insts: vec![],
				51	compiled: Program::new(),
				52	capture_name_idx: HashMap::new(),
				53	num_exprs: 0,
				54	size_limit: 10 * (1 << 20),
				55	suffix_cache: SuffixCache::new(1000),
				56	utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
				57	byte_classes: ByteClassSet::new(),
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	58	extra_inst_bytes: 0,
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	59	}
				60	}
				61
				62	/// The size of the resulting program is limited by size_limit. If
				63	/// the program approximately exceeds the given size (in bytes), then
				64	/// compilation will stop and return an error.
				65	pub fn size_limit(mut self, size_limit: usize) -> Self {
				66	self.size_limit = size_limit;
				67	self
				68	}
				69
				70	/// If bytes is true, then the program is compiled as a byte based
				71	/// automaton, which incorporates UTF-8 decoding into the machine. If it's
				72	/// false, then the automaton is Unicode scalar value based, e.g., an
				73	/// engine utilizing such an automaton is responsible for UTF-8 decoding.
				74	///
				75	/// The specific invariant is that when returning a byte based machine,
				76	/// the neither the `Char` nor `Ranges` instructions are produced.
				77	/// Conversely, when producing a Unicode scalar value machine, the `Bytes`
				78	/// instruction is never produced.
				79	///
				80	/// Note that `dfa(true)` implies `bytes(true)`.
				81	pub fn bytes(mut self, yes: bool) -> Self {
				82	self.compiled.is_bytes = yes;
				83	self
				84	}
				85
				86	/// When disabled, the program compiled may match arbitrary bytes.
				87	///
				88	/// When enabled (the default), all compiled programs exclusively match
				89	/// valid UTF-8 bytes.
				90	pub fn only_utf8(mut self, yes: bool) -> Self {
				91	self.compiled.only_utf8 = yes;
				92	self
				93	}
				94
				95	/// When set, the machine returned is suitable for use in the DFA matching
				96	/// engine.
				97	///
				98	/// In particular, this ensures that if the regex is not anchored in the
				99	/// beginning, then a preceding `.*?` is included in the program. (The NFA
				100	/// based engines handle the preceding `.*?` explicitly, which is difficult
				101	/// or impossible in the DFA engine.)
				102	pub fn dfa(mut self, yes: bool) -> Self {
				103	self.compiled.is_dfa = yes;
				104	self
				105	}
				106
				107	/// When set, the machine returned is suitable for matching text in
				108	/// reverse. In particular, all concatenations are flipped.
				109	pub fn reverse(mut self, yes: bool) -> Self {
				110	self.compiled.is_reverse = yes;
				111	self
				112	}
				113
				114	/// Compile a regular expression given its AST.
				115	///
				116	/// The compiler is guaranteed to succeed unless the program exceeds the
				117	/// specified size limit. If the size limit is exceeded, then compilation
				118	/// stops and returns an error.
				119	pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
				120	debug_assert!(!exprs.is_empty());
				121	self.num_exprs = exprs.len();
				122	if exprs.len() == 1 {
				123	self.compile_one(&exprs[0])
				124	} else {
				125	self.compile_many(exprs)
				126	}
				127	}
				128
				129	fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
				130	// If we're compiling a forward DFA and we aren't anchored, then
				131	// add a `.*?` before the first capture group.
				132	// Other matching engines handle this by baking the logic into the
				133	// matching engine itself.
				134	let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
				135	self.compiled.is_anchored_start = expr.is_anchored_start();
				136	self.compiled.is_anchored_end = expr.is_anchored_end();
				137	if self.compiled.needs_dotstar() {
				138	dotstar_patch = self.c_dotstar()?;
				139	self.compiled.start = dotstar_patch.entry;
				140	}
				141	self.compiled.captures = vec![None];
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	142	let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	143	if self.compiled.needs_dotstar() {
				144	self.fill(dotstar_patch.hole, patch.entry);
				145	} else {
				146	self.compiled.start = patch.entry;
				147	}
				148	self.fill_to_next(patch.hole);
				149	self.compiled.matches = vec![self.insts.len()];
				150	self.push_compiled(Inst::Match(0));
				151	self.compile_finish()
				152	}
				153
				154	fn compile_many(
				155	mut self,
				156	exprs: &[Hir],
				157	) -> result::Result<Program, Error> {
				158	debug_assert!(exprs.len() > 1);
				159
				160	self.compiled.is_anchored_start =
				161	exprs.iter().all(\|e\| e.is_anchored_start());
				162	self.compiled.is_anchored_end =
				163	exprs.iter().all(\|e\| e.is_anchored_end());
				164	let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
				165	if self.compiled.needs_dotstar() {
				166	dotstar_patch = self.c_dotstar()?;
				167	self.compiled.start = dotstar_patch.entry;
				168	} else {
				169	self.compiled.start = 0; // first instruction is always split
				170	}
				171	self.fill_to_next(dotstar_patch.hole);
				172
				173	let mut prev_hole = Hole::None;
				174	for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
				175	self.fill_to_next(prev_hole);
				176	let split = self.push_split_hole();
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	177	let Patch { hole, entry } =
				178	self.c_capture(0, expr)?.unwrap_or(self.next_inst());
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	179	self.fill_to_next(hole);
				180	self.compiled.matches.push(self.insts.len());
				181	self.push_compiled(Inst::Match(i));
				182	prev_hole = self.fill_split(split, Some(entry), None);
				183	}
				184	let i = exprs.len() - 1;
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	185	let Patch { hole, entry } =
				186	self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	187	self.fill(prev_hole, entry);
				188	self.fill_to_next(hole);
				189	self.compiled.matches.push(self.insts.len());
				190	self.push_compiled(Inst::Match(i));
				191	self.compile_finish()
				192	}
				193
				194	fn compile_finish(mut self) -> result::Result<Program, Error> {
				195	self.compiled.insts =
				196	self.insts.into_iter().map(\|inst\| inst.unwrap()).collect();
				197	self.compiled.byte_classes = self.byte_classes.byte_classes();
				198	self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
				199	Ok(self.compiled)
				200	}
				201
				202	/// Compile expr into self.insts, returning a patch on success,
				203	/// or an error if we run out of memory.
				204	///
				205	/// All of the c_* methods of the compiler share the contract outlined
				206	/// here.
				207	///
				208	/// The main thing that a c_* method does is mutate `self.insts`
				209	/// to add a list of mostly compiled instructions required to execute
				210	/// the given expression. `self.insts` contains MaybeInsts rather than
				211	/// Insts because there is some backpatching required.
				212	///
				213	/// The `Patch` value returned by each c_* method provides metadata
				214	/// about the compiled instructions emitted to `self.insts`. The
				215	/// `entry` member of the patch refers to the first instruction
				216	/// (the entry point), while the `hole` member contains zero or
				217	/// more offsets to partial instructions that need to be backpatched.
				218	/// The c_* routine can't know where its list of instructions are going to
				219	/// jump to after execution, so it is up to the caller to patch
				220	/// these jumps to point to the right place. So compiling some
				221	/// expression, e, we would end up with a situation that looked like:
				222	///
				223	/// ```text
				224	/// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
				225	/// ^ ^ ^
				226	/// \| \ /
				227	/// entry \ /
				228	/// hole
				229	/// ```
				230	///
Chih-Hung Hsieh	849e445	2020-10-26 13:16:47 -0700	[diff] [blame]	231	/// To compile two expressions, e1 and e2, concatenated together we
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	232	/// would do:
				233	///
				234	/// ```ignore
				235	/// let patch1 = self.c(e1);
				236	/// let patch2 = self.c(e2);
				237	/// ```
				238	///
				239	/// while leaves us with a situation that looks like
				240	///
				241	/// ```text
				242	/// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
				243	/// ^ ^ ^ ^
				244	/// \| \| \| \|
				245	/// entry1 hole1 entry2 hole2
				246	/// ```
				247	///
				248	/// Then to merge the two patches together into one we would backpatch
				249	/// hole1 with entry2 and return a new patch that enters at entry1
				250	/// and has hole2 for a hole. In fact, if you look at the c_concat
				251	/// method you will see that it does exactly this, though it handles
				252	/// a list of expressions rather than just the two that we use for
				253	/// an example.
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	254	///
				255	/// Ok(None) is returned when an expression is compiled to no
				256	/// instruction, and so no patch.entry value makes sense.
				257	fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	258	use crate::prog;
				259	use regex_syntax::hir::HirKind::*;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	260
				261	self.check_size()?;
				262	match *expr.kind() {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	263	Empty => Ok(None),
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	264	Literal(hir::Literal::Unicode(c)) => self.c_char(c),
				265	Literal(hir::Literal::Byte(b)) => {
				266	assert!(self.compiled.uses_bytes());
				267	self.c_byte(b)
				268	}
				269	Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
				270	Class(hir::Class::Bytes(ref cls)) => {
				271	if self.compiled.uses_bytes() {
				272	self.c_class_bytes(cls.ranges())
				273	} else {
				274	assert!(cls.is_all_ascii());
				275	let mut char_ranges = vec![];
				276	for r in cls.iter() {
				277	let (s, e) = (r.start() as char, r.end() as char);
				278	char_ranges.push(hir::ClassUnicodeRange::new(s, e));
				279	}
				280	self.c_class(&char_ranges)
				281	}
				282	}
				283	Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
				284	self.byte_classes.set_range(b'\n', b'\n');
				285	self.c_empty_look(prog::EmptyLook::EndLine)
				286	}
				287	Anchor(hir::Anchor::StartLine) => {
				288	self.byte_classes.set_range(b'\n', b'\n');
				289	self.c_empty_look(prog::EmptyLook::StartLine)
				290	}
				291	Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
				292	self.byte_classes.set_range(b'\n', b'\n');
				293	self.c_empty_look(prog::EmptyLook::StartLine)
				294	}
				295	Anchor(hir::Anchor::EndLine) => {
				296	self.byte_classes.set_range(b'\n', b'\n');
				297	self.c_empty_look(prog::EmptyLook::EndLine)
				298	}
				299	Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
				300	self.c_empty_look(prog::EmptyLook::EndText)
				301	}
				302	Anchor(hir::Anchor::StartText) => {
				303	self.c_empty_look(prog::EmptyLook::StartText)
				304	}
				305	Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
				306	self.c_empty_look(prog::EmptyLook::StartText)
				307	}
				308	Anchor(hir::Anchor::EndText) => {
				309	self.c_empty_look(prog::EmptyLook::EndText)
				310	}
				311	WordBoundary(hir::WordBoundary::Unicode) => {
				312	if !cfg!(feature = "unicode-perl") {
				313	return Err(Error::Syntax(
				314	"Unicode word boundaries are unavailable when \
				315	the unicode-perl feature is disabled"
				316	.to_string(),
				317	));
				318	}
				319	self.compiled.has_unicode_word_boundary = true;
				320	self.byte_classes.set_word_boundary();
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	321	// We also make sure that all ASCII bytes are in a different
				322	// class from non-ASCII bytes. Otherwise, it's possible for
				323	// ASCII bytes to get lumped into the same class as non-ASCII
				324	// bytes. This in turn may cause the lazy DFA to falsely start
				325	// when it sees an ASCII byte that maps to a byte class with
				326	// non-ASCII bytes. This ensures that never happens.
				327	self.byte_classes.set_range(0, 0x7F);
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	328	self.c_empty_look(prog::EmptyLook::WordBoundary)
				329	}
				330	WordBoundary(hir::WordBoundary::UnicodeNegate) => {
				331	if !cfg!(feature = "unicode-perl") {
				332	return Err(Error::Syntax(
				333	"Unicode word boundaries are unavailable when \
				334	the unicode-perl feature is disabled"
				335	.to_string(),
				336	));
				337	}
				338	self.compiled.has_unicode_word_boundary = true;
				339	self.byte_classes.set_word_boundary();
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	340	// See comments above for why we set the ASCII range here.
				341	self.byte_classes.set_range(0, 0x7F);
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	342	self.c_empty_look(prog::EmptyLook::NotWordBoundary)
				343	}
				344	WordBoundary(hir::WordBoundary::Ascii) => {
				345	self.byte_classes.set_word_boundary();
				346	self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
				347	}
				348	WordBoundary(hir::WordBoundary::AsciiNegate) => {
				349	self.byte_classes.set_word_boundary();
				350	self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
				351	}
				352	Group(ref g) => match g.kind {
				353	hir::GroupKind::NonCapturing => self.c(&g.hir),
				354	hir::GroupKind::CaptureIndex(index) => {
				355	if index as usize >= self.compiled.captures.len() {
				356	self.compiled.captures.push(None);
				357	}
				358	self.c_capture(2 * index as usize, &g.hir)
				359	}
				360	hir::GroupKind::CaptureName { index, ref name } => {
				361	if index as usize >= self.compiled.captures.len() {
				362	let n = name.to_string();
				363	self.compiled.captures.push(Some(n.clone()));
				364	self.capture_name_idx.insert(n, index as usize);
				365	}
				366	self.c_capture(2 * index as usize, &g.hir)
				367	}
				368	},
				369	Concat(ref es) => {
				370	if self.compiled.is_reverse {
				371	self.c_concat(es.iter().rev())
				372	} else {
				373	self.c_concat(es)
				374	}
				375	}
				376	Alternation(ref es) => self.c_alternate(&**es),
				377	Repetition(ref rep) => self.c_repeat(rep),
				378	}
				379	}
				380
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	381	fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	382	if self.num_exprs > 1 \|\| self.compiled.is_dfa {
				383	// Don't ever compile Save instructions for regex sets because
				384	// they are never used. They are also never used in DFA programs
				385	// because DFAs can't handle captures.
				386	self.c(expr)
				387	} else {
				388	let entry = self.insts.len();
				389	let hole = self.push_hole(InstHole::Save { slot: first_slot });
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	390	let patch = self.c(expr)?.unwrap_or(self.next_inst());
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	391	self.fill(hole, patch.entry);
				392	self.fill_to_next(patch.hole);
				393	let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	394	Ok(Some(Patch { hole: hole, entry: entry }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	395	}
				396	}
				397
				398	fn c_dotstar(&mut self) -> Result {
				399	Ok(if !self.compiled.only_utf8() {
				400	self.c(&Hir::repetition(hir::Repetition {
				401	kind: hir::RepetitionKind::ZeroOrMore,
				402	greedy: false,
				403	hir: Box::new(Hir::any(true)),
				404	}))?
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	405	.unwrap()
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	406	} else {
				407	self.c(&Hir::repetition(hir::Repetition {
				408	kind: hir::RepetitionKind::ZeroOrMore,
				409	greedy: false,
				410	hir: Box::new(Hir::any(false)),
				411	}))?
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	412	.unwrap()
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	413	})
				414	}
				415
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	416	fn c_char(&mut self, c: char) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	417	if self.compiled.uses_bytes() {
				418	if c.is_ascii() {
				419	let b = c as u8;
				420	let hole =
				421	self.push_hole(InstHole::Bytes { start: b, end: b });
				422	self.byte_classes.set_range(b, b);
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	423	Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	424	} else {
				425	self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
				426	}
				427	} else {
				428	let hole = self.push_hole(InstHole::Char { c: c });
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	429	Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	430	}
				431	}
				432
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	433	fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	434	use std::mem::size_of;
				435
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	436	assert!(!ranges.is_empty());
				437	if self.compiled.uses_bytes() {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	438	Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	439	} else {
				440	let ranges: Vec<(char, char)> =
				441	ranges.iter().map(\|r\| (r.start(), r.end())).collect();
				442	let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
				443	self.push_hole(InstHole::Char { c: ranges[0].0 })
				444	} else {
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	445	self.extra_inst_bytes +=
				446	ranges.len() * (size_of::<char>() * 2);
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	447	self.push_hole(InstHole::Ranges { ranges: ranges })
				448	};
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	449	Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	450	}
				451	}
				452
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	453	fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	454	self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
				455	}
				456
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	457	fn c_class_bytes(
				458	&mut self,
				459	ranges: &[hir::ClassBytesRange],
				460	) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	461	debug_assert!(!ranges.is_empty());
				462
				463	let first_split_entry = self.insts.len();
				464	let mut holes = vec![];
				465	let mut prev_hole = Hole::None;
				466	for r in &ranges[0..ranges.len() - 1] {
				467	self.fill_to_next(prev_hole);
				468	let split = self.push_split_hole();
				469	let next = self.insts.len();
				470	self.byte_classes.set_range(r.start(), r.end());
				471	holes.push(self.push_hole(InstHole::Bytes {
				472	start: r.start(),
				473	end: r.end(),
				474	}));
				475	prev_hole = self.fill_split(split, Some(next), None);
				476	}
				477	let next = self.insts.len();
				478	let r = &ranges[ranges.len() - 1];
				479	self.byte_classes.set_range(r.start(), r.end());
				480	holes.push(
				481	self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
				482	);
				483	self.fill(prev_hole, next);
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	484	Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	485	}
				486
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	487	fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	488	let hole = self.push_hole(InstHole::EmptyLook { look: look });
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	489	Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	490	}
				491
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	492	fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	493	where
				494	I: IntoIterator<Item = &'a Hir>,
				495	{
				496	let mut exprs = exprs.into_iter();
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	497	let Patch { mut hole, entry } = loop {
				498	match exprs.next() {
				499	None => return Ok(None),
				500	Some(e) => {
				501	if let Some(p) = self.c(e)? {
				502	break p;
				503	}
				504	}
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	505	}
				506	};
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	507	for e in exprs {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	508	if let Some(p) = self.c(e)? {
				509	self.fill(hole, p.entry);
				510	hole = p.hole;
				511	}
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	512	}
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	513	Ok(Some(Patch { hole: hole, entry: entry }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	514	}
				515
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	516	fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	517	debug_assert!(
				518	exprs.len() >= 2,
				519	"alternates must have at least 2 exprs"
				520	);
				521
				522	// Initial entry point is always the first split.
				523	let first_split_entry = self.insts.len();
				524
				525	// Save up all of the holes from each alternate. They will all get
				526	// patched to point to the same location.
				527	let mut holes = vec![];
				528
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	529	// true indicates that the hole is a split where we want to fill
				530	// the second branch.
				531	let mut prev_hole = (Hole::None, false);
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	532	for e in &exprs[0..exprs.len() - 1] {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	533	if prev_hole.1 {
				534	let next = self.insts.len();
				535	self.fill_split(prev_hole.0, None, Some(next));
				536	} else {
				537	self.fill_to_next(prev_hole.0);
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	538	}
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	539	let split = self.push_split_hole();
				540	if let Some(Patch { hole, entry }) = self.c(e)? {
				541	holes.push(hole);
				542	prev_hole = (self.fill_split(split, Some(entry), None), false);
				543	} else {
				544	let (split1, split2) = split.dup_one();
				545	holes.push(split1);
				546	prev_hole = (split2, true);
				547	}
				548	}
				549	if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	550	holes.push(hole);
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	551	if prev_hole.1 {
				552	self.fill_split(prev_hole.0, None, Some(entry));
				553	} else {
				554	self.fill(prev_hole.0, entry);
				555	}
				556	} else {
				557	// We ignore prev_hole.1. When it's true, it means we have two
				558	// empty branches both pushing prev_hole.0 into holes, so both
				559	// branches will go to the same place anyway.
				560	holes.push(prev_hole.0);
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	561	}
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	562	Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	563	}
				564
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	565	fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	566	use regex_syntax::hir::RepetitionKind::*;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	567	match rep.kind {
				568	ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
				569	ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
				570	OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
				571	Range(hir::RepetitionRange::Exactly(min_max)) => {
				572	self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
				573	}
				574	Range(hir::RepetitionRange::AtLeast(min)) => {
				575	self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
				576	}
				577	Range(hir::RepetitionRange::Bounded(min, max)) => {
				578	self.c_repeat_range(&rep.hir, rep.greedy, min, max)
				579	}
				580	}
				581	}
				582
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	583	fn c_repeat_zero_or_one(
				584	&mut self,
				585	expr: &Hir,
				586	greedy: bool,
				587	) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	588	let split_entry = self.insts.len();
				589	let split = self.push_split_hole();
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	590	let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
				591	Some(p) => p,
				592	None => return self.pop_split_hole(),
				593	};
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	594	let split_hole = if greedy {
				595	self.fill_split(split, Some(entry_rep), None)
				596	} else {
				597	self.fill_split(split, None, Some(entry_rep))
				598	};
				599	let holes = vec![hole_rep, split_hole];
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	600	Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	601	}
				602
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	603	fn c_repeat_zero_or_more(
				604	&mut self,
				605	expr: &Hir,
				606	greedy: bool,
				607	) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	608	let split_entry = self.insts.len();
				609	let split = self.push_split_hole();
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	610	let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
				611	Some(p) => p,
				612	None => return self.pop_split_hole(),
				613	};
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	614
				615	self.fill(hole_rep, split_entry);
				616	let split_hole = if greedy {
				617	self.fill_split(split, Some(entry_rep), None)
				618	} else {
				619	self.fill_split(split, None, Some(entry_rep))
				620	};
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	621	Ok(Some(Patch { hole: split_hole, entry: split_entry }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	622	}
				623
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	624	fn c_repeat_one_or_more(
				625	&mut self,
				626	expr: &Hir,
				627	greedy: bool,
				628	) -> ResultOrEmpty {
				629	let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
				630	Some(p) => p,
				631	None => return Ok(None),
				632	};
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	633	self.fill_to_next(hole_rep);
				634	let split = self.push_split_hole();
				635
				636	let split_hole = if greedy {
				637	self.fill_split(split, Some(entry_rep), None)
				638	} else {
				639	self.fill_split(split, None, Some(entry_rep))
				640	};
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	641	Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	642	}
				643
				644	fn c_repeat_range_min_or_more(
				645	&mut self,
				646	expr: &Hir,
				647	greedy: bool,
				648	min: u32,
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	649	) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	650	let min = u32_to_usize(min);
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	651	// Using next_inst() is ok, because we can't return it (concat would
				652	// have to return Some(_) while c_repeat_range_min_or_more returns
				653	// None).
				654	let patch_concat = self
				655	.c_concat(iter::repeat(expr).take(min))?
				656	.unwrap_or(self.next_inst());
				657	if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
				658	self.fill(patch_concat.hole, patch_rep.entry);
				659	Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
				660	} else {
				661	Ok(None)
				662	}
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	663	}
				664
				665	fn c_repeat_range(
				666	&mut self,
				667	expr: &Hir,
				668	greedy: bool,
				669	min: u32,
				670	max: u32,
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	671	) -> ResultOrEmpty {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	672	let (min, max) = (u32_to_usize(min), u32_to_usize(max));
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	673	debug_assert!(min <= max);
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	674	let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	675	if min == max {
				676	return Ok(patch_concat);
				677	}
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	678	// Same reasoning as in c_repeat_range_min_or_more (we know that min <
				679	// max at this point).
				680	let patch_concat = patch_concat.unwrap_or(self.next_inst());
				681	let initial_entry = patch_concat.entry;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	682	// It is much simpler to compile, e.g., `a{2,5}` as:
				683	//
				684	// aaa?a?a?
				685	//
				686	// But you end up with a sequence of instructions like this:
				687	//
				688	// 0: 'a'
				689	// 1: 'a',
				690	// 2: split(3, 4)
				691	// 3: 'a'
				692	// 4: split(5, 6)
				693	// 5: 'a'
				694	// 6: split(7, 8)
				695	// 7: 'a'
				696	// 8: MATCH
				697	//
				698	// This is incredibly inefficient because the splits end
				699	// up forming a chain, which has to be resolved everything a
				700	// transition is followed.
				701	let mut holes = vec![];
				702	let mut prev_hole = patch_concat.hole;
				703	for _ in min..max {
				704	self.fill_to_next(prev_hole);
				705	let split = self.push_split_hole();
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	706	let Patch { hole, entry } = match self.c(expr)? {
				707	Some(p) => p,
				708	None => return self.pop_split_hole(),
				709	};
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	710	prev_hole = hole;
				711	if greedy {
				712	holes.push(self.fill_split(split, Some(entry), None));
				713	} else {
				714	holes.push(self.fill_split(split, None, Some(entry)));
				715	}
				716	}
				717	holes.push(prev_hole);
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	718	Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
				719	}
				720
				721	/// Can be used as a default value for the c_* functions when the call to
				722	/// c_function is followed by inserting at least one instruction that is
				723	/// always executed after the ones written by the c* function.
				724	fn next_inst(&self) -> Patch {
				725	Patch { hole: Hole::None, entry: self.insts.len() }
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	726	}
				727
				728	fn fill(&mut self, hole: Hole, goto: InstPtr) {
				729	match hole {
				730	Hole::None => {}
				731	Hole::One(pc) => {
				732	self.insts[pc].fill(goto);
				733	}
				734	Hole::Many(holes) => {
				735	for hole in holes {
				736	self.fill(hole, goto);
				737	}
				738	}
				739	}
				740	}
				741
				742	fn fill_to_next(&mut self, hole: Hole) {
				743	let next = self.insts.len();
				744	self.fill(hole, next);
				745	}
				746
				747	fn fill_split(
				748	&mut self,
				749	hole: Hole,
				750	goto1: Option<InstPtr>,
				751	goto2: Option<InstPtr>,
				752	) -> Hole {
				753	match hole {
				754	Hole::None => Hole::None,
				755	Hole::One(pc) => match (goto1, goto2) {
				756	(Some(goto1), Some(goto2)) => {
				757	self.insts[pc].fill_split(goto1, goto2);
				758	Hole::None
				759	}
				760	(Some(goto1), None) => {
				761	self.insts[pc].half_fill_split_goto1(goto1);
				762	Hole::One(pc)
				763	}
				764	(None, Some(goto2)) => {
				765	self.insts[pc].half_fill_split_goto2(goto2);
				766	Hole::One(pc)
				767	}
				768	(None, None) => unreachable!(
				769	"at least one of the split \
				770	holes must be filled"
				771	),
				772	},
				773	Hole::Many(holes) => {
				774	let mut new_holes = vec![];
				775	for hole in holes {
				776	new_holes.push(self.fill_split(hole, goto1, goto2));
				777	}
				778	if new_holes.is_empty() {
				779	Hole::None
				780	} else if new_holes.len() == 1 {
				781	new_holes.pop().unwrap()
				782	} else {
				783	Hole::Many(new_holes)
				784	}
				785	}
				786	}
				787	}
				788
				789	fn push_compiled(&mut self, inst: Inst) {
				790	self.insts.push(MaybeInst::Compiled(inst));
				791	}
				792
				793	fn push_hole(&mut self, inst: InstHole) -> Hole {
				794	let hole = self.insts.len();
				795	self.insts.push(MaybeInst::Uncompiled(inst));
				796	Hole::One(hole)
				797	}
				798
				799	fn push_split_hole(&mut self) -> Hole {
				800	let hole = self.insts.len();
				801	self.insts.push(MaybeInst::Split);
				802	Hole::One(hole)
				803	}
				804
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	805	fn pop_split_hole(&mut self) -> ResultOrEmpty {
				806	self.insts.pop();
				807	Ok(None)
				808	}
				809
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	810	fn check_size(&self) -> result::Result<(), Error> {
				811	use std::mem::size_of;
				812
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	813	let size =
				814	self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
				815	if size > self.size_limit {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	816	Err(Error::CompiledTooBig(self.size_limit))
				817	} else {
				818	Ok(())
				819	}
				820	}
				821	}
				822
				823	#[derive(Debug)]
				824	enum Hole {
				825	None,
				826	One(InstPtr),
				827	Many(Vec<Hole>),
				828	}
				829
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	830	impl Hole {
				831	fn dup_one(self) -> (Self, Self) {
				832	match self {
				833	Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
				834	Hole::None \| Hole::Many(_) => {
				835	unreachable!("must be called on single hole")
				836	}
				837	}
				838	}
				839	}
				840
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	841	#[derive(Clone, Debug)]
				842	enum MaybeInst {
				843	Compiled(Inst),
				844	Uncompiled(InstHole),
				845	Split,
				846	Split1(InstPtr),
				847	Split2(InstPtr),
				848	}
				849
				850	impl MaybeInst {
				851	fn fill(&mut self, goto: InstPtr) {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	852	let maybeinst = match *self {
				853	MaybeInst::Split => MaybeInst::Split1(goto),
				854	MaybeInst::Uncompiled(ref inst) => {
				855	MaybeInst::Compiled(inst.fill(goto))
				856	}
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	857	MaybeInst::Split1(goto1) => {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	858	MaybeInst::Compiled(Inst::Split(InstSplit {
				859	goto1: goto1,
				860	goto2: goto,
				861	}))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	862	}
				863	MaybeInst::Split2(goto2) => {
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	864	MaybeInst::Compiled(Inst::Split(InstSplit {
				865	goto1: goto,
				866	goto2: goto2,
				867	}))
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	868	}
				869	_ => unreachable!(
				870	"not all instructions were compiled! \
				871	found uncompiled instruction: {:?}",
				872	self
				873	),
				874	};
Haibo Huang	49cbe5f	2020-05-28 20:14:24 -0700	[diff] [blame]	875	*self = maybeinst;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	876	}
				877
				878	fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
				879	let filled = match *self {
				880	MaybeInst::Split => {
				881	Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
				882	}
				883	_ => unreachable!(
				884	"must be called on Split instruction, \
				885	instead it was called on: {:?}",
				886	self
				887	),
				888	};
				889	*self = MaybeInst::Compiled(filled);
				890	}
				891
				892	fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
				893	let half_filled = match *self {
				894	MaybeInst::Split => goto1,
				895	_ => unreachable!(
				896	"must be called on Split instruction, \
				897	instead it was called on: {:?}",
				898	self
				899	),
				900	};
				901	*self = MaybeInst::Split1(half_filled);
				902	}
				903
				904	fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
				905	let half_filled = match *self {
				906	MaybeInst::Split => goto2,
				907	_ => unreachable!(
				908	"must be called on Split instruction, \
				909	instead it was called on: {:?}",
				910	self
				911	),
				912	};
				913	*self = MaybeInst::Split2(half_filled);
				914	}
				915
				916	fn unwrap(self) -> Inst {
				917	match self {
				918	MaybeInst::Compiled(inst) => inst,
				919	_ => unreachable!(
				920	"must be called on a compiled instruction, \
				921	instead it was called on: {:?}",
				922	self
				923	),
				924	}
				925	}
				926	}
				927
				928	#[derive(Clone, Debug)]
				929	enum InstHole {
				930	Save { slot: usize },
				931	EmptyLook { look: EmptyLook },
				932	Char { c: char },
				933	Ranges { ranges: Vec<(char, char)> },
				934	Bytes { start: u8, end: u8 },
				935	}
				936
				937	impl InstHole {
				938	fn fill(&self, goto: InstPtr) -> Inst {
				939	match *self {
				940	InstHole::Save { slot } => {
				941	Inst::Save(InstSave { goto: goto, slot: slot })
				942	}
				943	InstHole::EmptyLook { look } => {
				944	Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
				945	}
				946	InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	947	InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
				948	goto: goto,
				949	ranges: ranges.clone().into_boxed_slice(),
				950	}),
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	951	InstHole::Bytes { start, end } => {
				952	Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
				953	}
				954	}
				955	}
				956	}
				957
				958	struct CompileClass<'a, 'b> {
				959	c: &'a mut Compiler,
				960	ranges: &'b [hir::ClassUnicodeRange],
				961	}
				962
				963	impl<'a, 'b> CompileClass<'a, 'b> {
				964	fn compile(mut self) -> Result {
				965	let mut holes = vec![];
				966	let mut initial_entry = None;
				967	let mut last_split = Hole::None;
				968	let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
				969	self.c.suffix_cache.clear();
				970
				971	for (i, range) in self.ranges.iter().enumerate() {
				972	let is_last_range = i + 1 == self.ranges.len();
				973	utf8_seqs.reset(range.start(), range.end());
				974	let mut it = (&mut utf8_seqs).peekable();
				975	loop {
				976	let utf8_seq = match it.next() {
				977	None => break,
				978	Some(utf8_seq) => utf8_seq,
				979	};
				980	if is_last_range && it.peek().is_none() {
				981	let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
				982	holes.push(hole);
				983	self.c.fill(last_split, entry);
				984	last_split = Hole::None;
				985	if initial_entry.is_none() {
				986	initial_entry = Some(entry);
				987	}
				988	} else {
				989	if initial_entry.is_none() {
				990	initial_entry = Some(self.c.insts.len());
				991	}
				992	self.c.fill_to_next(last_split);
				993	last_split = self.c.push_split_hole();
				994	let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
				995	holes.push(hole);
				996	last_split =
				997	self.c.fill_split(last_split, Some(entry), None);
				998	}
				999	}
				1000	}
				1001	self.c.utf8_seqs = Some(utf8_seqs);
				1002	Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
				1003	}
				1004
				1005	fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
				1006	if self.c.compiled.is_reverse {
				1007	self.c_utf8_seq_(seq)
				1008	} else {
				1009	self.c_utf8_seq_(seq.into_iter().rev())
				1010	}
				1011	}
				1012
				1013	fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
				1014	where
				1015	I: IntoIterator<Item = &'r Utf8Range>,
				1016	{
				1017	// The initial instruction for each UTF-8 sequence should be the same.
				1018	let mut from_inst = ::std::usize::MAX;
				1019	let mut last_hole = Hole::None;
				1020	for byte_range in seq {
				1021	let key = SuffixCacheKey {
				1022	from_inst: from_inst,
				1023	start: byte_range.start,
				1024	end: byte_range.end,
				1025	};
				1026	{
				1027	let pc = self.c.insts.len();
				1028	if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
				1029	from_inst = cached_pc;
				1030	continue;
				1031	}
				1032	}
				1033	self.c.byte_classes.set_range(byte_range.start, byte_range.end);
				1034	if from_inst == ::std::usize::MAX {
				1035	last_hole = self.c.push_hole(InstHole::Bytes {
				1036	start: byte_range.start,
				1037	end: byte_range.end,
				1038	});
				1039	} else {
				1040	self.c.push_compiled(Inst::Bytes(InstBytes {
				1041	goto: from_inst,
				1042	start: byte_range.start,
				1043	end: byte_range.end,
				1044	}));
				1045	}
				1046	from_inst = self.c.insts.len().checked_sub(1).unwrap();
				1047	debug_assert!(from_inst < ::std::usize::MAX);
				1048	}
				1049	debug_assert!(from_inst < ::std::usize::MAX);
				1050	Ok(Patch { hole: last_hole, entry: from_inst })
				1051	}
				1052	}
				1053
				1054	/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
				1055	/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
				1056	/// The set of byte ranges looks like this:
				1057	///
				1058	/// [0-7F]
				1059	/// [C2-DF][80-BF]
				1060	/// [E0][A0-BF][80-BF]
				1061	/// [E1-EC][80-BF][80-BF]
				1062	/// [ED][80-9F][80-BF]
				1063	/// [EE-EF][80-BF][80-BF]
				1064	///
				1065	/// Each line above translates to one alternate in the compiled regex program.
				1066	/// However, all but one of the alternates end in the same suffix, which is
				1067	/// a waste of an instruction. The suffix cache facilitates reusing them across
				1068	/// alternates.
				1069	///
				1070	/// Note that a HashMap could be trivially used for this, but we don't need its
				1071	/// overhead. Some small bounded space (LRU style) is more than enough.
				1072	///
				1073	/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
				1074	/// except it uses hashes as original indices and then compares full keys for
				1075	/// validation against `dense` array.
Haibo Huang	47619dd	2021-01-08 17:05:43 -0800	[diff] [blame]	1076	#[derive(Debug)]
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1077	struct SuffixCache {
				1078	sparse: Box<[usize]>,
				1079	dense: Vec<SuffixCacheEntry>,
				1080	}
				1081
				1082	#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
				1083	struct SuffixCacheEntry {
				1084	key: SuffixCacheKey,
				1085	pc: InstPtr,
				1086	}
				1087
				1088	#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
				1089	struct SuffixCacheKey {
				1090	from_inst: InstPtr,
				1091	start: u8,
				1092	end: u8,
				1093	}
				1094
				1095	impl SuffixCache {
				1096	fn new(size: usize) -> Self {
				1097	SuffixCache {
				1098	sparse: vec![0usize; size].into(),
				1099	dense: Vec::with_capacity(size),
				1100	}
				1101	}
				1102
				1103	fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
				1104	let hash = self.hash(&key);
				1105	let pos = &mut self.sparse[hash];
				1106	if let Some(entry) = self.dense.get(*pos) {
				1107	if entry.key == key {
				1108	return Some(entry.pc);
				1109	}
				1110	}
				1111	*pos = self.dense.len();
				1112	self.dense.push(SuffixCacheEntry { key: key, pc: pc });
				1113	None
				1114	}
				1115
				1116	fn clear(&mut self) {
				1117	self.dense.clear();
				1118	}
				1119
				1120	fn hash(&self, suffix: &SuffixCacheKey) -> usize {
				1121	// Basic FNV-1a hash as described:
				1122	// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
				1123	const FNV_PRIME: u64 = 1099511628211;
				1124	let mut h = 14695981039346656037;
				1125	h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
				1126	h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
				1127	h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
				1128	(h as usize) % self.sparse.len()
				1129	}
				1130	}
				1131
				1132	struct ByteClassSet([bool; 256]);
				1133
				1134	impl ByteClassSet {
				1135	fn new() -> Self {
				1136	ByteClassSet([false; 256])
				1137	}
				1138
				1139	fn set_range(&mut self, start: u8, end: u8) {
				1140	debug_assert!(start <= end);
				1141	if start > 0 {
				1142	self.0[start as usize - 1] = true;
				1143	}
				1144	self.0[end as usize] = true;
				1145	}
				1146
				1147	fn set_word_boundary(&mut self) {
				1148	// We need to mark all ranges of bytes whose pairs result in
				1149	// evaluating \b differently.
				1150	let iswb = is_word_byte;
				1151	let mut b1: u16 = 0;
				1152	let mut b2: u16;
				1153	while b1 <= 255 {
				1154	b2 = b1 + 1;
				1155	while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
				1156	b2 += 1;
				1157	}
				1158	self.set_range(b1 as u8, (b2 - 1) as u8);
				1159	b1 = b2;
				1160	}
				1161	}
				1162
				1163	fn byte_classes(&self) -> Vec<u8> {
				1164	// N.B. If you're debugging the DFA, it's useful to simply return
				1165	// `(0..256).collect()`, which effectively removes the byte classes
				1166	// and makes the transitions easier to read.
				1167	// (0usize..256).map(\|x\| x as u8).collect()
				1168	let mut byte_classes = vec![0; 256];
				1169	let mut class = 0u8;
				1170	let mut i = 0;
				1171	loop {
				1172	byte_classes[i] = class as u8;
				1173	if i >= 255 {
				1174	break;
				1175	}
				1176	if self.0[i] {
				1177	class = class.checked_add(1).unwrap();
				1178	}
				1179	i += 1;
				1180	}
				1181	byte_classes
				1182	}
				1183	}
				1184
Haibo Huang	47619dd	2021-01-08 17:05:43 -0800	[diff] [blame]	1185	impl fmt::Debug for ByteClassSet {
				1186	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
				1187	f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
				1188	}
				1189	}
				1190
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1191	fn u32_to_usize(n: u32) -> usize {
				1192	// In case usize is less than 32 bits, we need to guard against overflow.
				1193	// On most platforms this compiles to nothing.
				1194	// TODO Use `std::convert::TryFrom` once it's stable.
				1195	if (n as u64) > (::std::usize::MAX as u64) {
				1196	panic!("BUG: {} is too big to be pointer sized", n)
				1197	}
				1198	n as usize
				1199	}
				1200
				1201	#[cfg(test)]
				1202	mod tests {
				1203	use super::ByteClassSet;
				1204
				1205	#[test]
				1206	fn byte_classes() {
				1207	let mut set = ByteClassSet::new();
				1208	set.set_range(b'a', b'z');
				1209	let classes = set.byte_classes();
				1210	assert_eq!(classes[0], 0);
				1211	assert_eq!(classes[1], 0);
				1212	assert_eq!(classes[2], 0);
				1213	assert_eq!(classes[b'a' as usize - 1], 0);
				1214	assert_eq!(classes[b'a' as usize], 1);
				1215	assert_eq!(classes[b'm' as usize], 1);
				1216	assert_eq!(classes[b'z' as usize], 1);
				1217	assert_eq!(classes[b'z' as usize + 1], 2);
				1218	assert_eq!(classes[254], 2);
				1219	assert_eq!(classes[255], 2);
				1220
				1221	let mut set = ByteClassSet::new();
				1222	set.set_range(0, 2);
				1223	set.set_range(4, 6);
				1224	let classes = set.byte_classes();
				1225	assert_eq!(classes[0], 0);
				1226	assert_eq!(classes[1], 0);
				1227	assert_eq!(classes[2], 0);
				1228	assert_eq!(classes[3], 1);
				1229	assert_eq!(classes[4], 2);
				1230	assert_eq!(classes[5], 2);
				1231	assert_eq!(classes[6], 2);
				1232	assert_eq!(classes[7], 3);
				1233	assert_eq!(classes[255], 3);
				1234	}
				1235
				1236	#[test]
				1237	fn full_byte_classes() {
				1238	let mut set = ByteClassSet::new();
				1239	for i in 0..256u16 {
				1240	set.set_range(i as u8, i as u8);
				1241	}
				1242	assert_eq!(set.byte_classes().len(), 256);
				1243	}
				1244	}