// This is the backtracking matching engine. It has the same exact capability
// as the full NFA simulation, except it is artificially restricted to small
// regexes on small inputs because of its memory requirements.
//
// In particular, this is a *bounded* backtracking engine. It retains worst
// case linear time by keeping track of the states that it has visited (using a
// bitmap). Namely, once a state is visited, it is never visited again. Since a
// state is keyed by `(instruction index, input index)`, we have that its time
// complexity is `O(mn)` (i.e., linear in the size of the search text).
//
// The backtracking engine can beat out the NFA simulation on small
// regexes/inputs because it doesn't have to keep track of multiple copies of
// the capture groups. In benchmarks, the backtracking engine is roughly twice
// as fast as the full NFA simulation. Note though that its performance doesn't
// scale, even if you're willing to live with the memory requirements. Namely,
// the bitset has to be zeroed on each execution, which becomes quite expensive
// on large bitsets.

use crate::exec::ProgramCache;
use crate::input::{Input, InputAt};
use crate::prog::{InstPtr, Program};
use crate::re_trait::Slot;

type Bits = u32;

const BIT_SIZE: usize = 32;
const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB

/// Returns true iff the given regex and input should be executed by this
/// engine with reasonable memory usage.
pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
    // Total memory usage in bytes is determined by:
    //
    //   ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
    //
    // The actual limit picked is pretty much a heuristic.
    // See: https://github.com/rust-lang/regex/issues/215
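    //
    // For example (illustrative numbers, not taken from the source): a
    // program with 100 instructions searching a 1,000 byte input needs
    // ((100 * 1001 + 31) / 32) * 4 = 12,516 bytes of visited bits, which is
    // well under the 256 KB cap.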
    let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4;
    size <= MAX_SIZE_BYTES
}

/// A backtracking matching engine.
#[derive(Debug)]
pub struct Bounded<'a, 'm, 'r, 's, I> {
    prog: &'r Program,
    input: I,
    matches: &'m mut [bool],
    slots: &'s mut [Slot],
    m: &'a mut Cache,
}

/// Shared cached state between multiple invocations of a backtracking engine
/// in the same thread.
#[derive(Clone, Debug)]
pub struct Cache {
    jobs: Vec<Job>,
    visited: Vec<Bits>,
}

impl Cache {
    /// Create new empty cache for the backtracking engine.
    pub fn new(_prog: &Program) -> Self {
        Cache { jobs: vec![], visited: vec![] }
    }
}

/// A job is an explicit unit of stack space in the backtracking engine.
///
/// The "normal" representation is a single state transition, which corresponds
/// to an NFA state and a character in the input. However, the backtracking
/// engine must keep track of old capture group values. We use the explicit
/// stack to do it.
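///
/// For example (an illustrative scenario, not taken from the source): when a
/// `Save` instruction for slot 1 runs at input position 3 while slot 1
/// currently holds `Some(0)`, the engine pushes
/// `Job::SaveRestore { slot: 1, old_pos: Some(0) }` before overwriting the
/// slot with `Some(3)`. If the branch that follows fails, popping that job
/// restores the old value.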
#[derive(Clone, Copy, Debug)]
enum Job {
    Inst { ip: InstPtr, at: InputAt },
    SaveRestore { slot: usize, old_pos: Option<usize> },
}

impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
    /// Execute the backtracking matching engine.
    ///
    /// If there's a match, `exec` returns `true` and populates the given
    /// captures accordingly.
    pub fn exec(
        prog: &'r Program,
        cache: &ProgramCache,
        matches: &'m mut [bool],
        slots: &'s mut [Slot],
        input: I,
        start: usize,
        end: usize,
    ) -> bool {
        let mut cache = cache.borrow_mut();
        let cache = &mut cache.backtrack;
        let start = input.at(start);
        let mut b = Bounded {
            prog: prog,
            input: input,
            matches: matches,
            slots: slots,
            m: cache,
        };
        b.exec_(start, end)
    }

    /// Clears the cache such that the backtracking engine can be executed
    /// on some input of fixed length.
    fn clear(&mut self) {
        // Reset the job memory so that we start fresh.
        self.m.jobs.clear();

        // Now we need to clear the bit state set.
        // We do this by figuring out how much space we need to keep track
        // of the states we've visited.
        // Then we reset all existing allocated space to 0.
        // Finally, we request more space if we need it.
        //
        // This is all a little circuitous, but doing this using unchecked
        // operations doesn't seem to have a measurable impact on performance.
        // (Probably because backtracking is limited to such small
        // inputs/regexes in the first place.)
        let visited_len =
            (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
                / BIT_SIZE;
        self.m.visited.truncate(visited_len);
        for v in &mut self.m.visited {
            *v = 0;
        }
        if visited_len > self.m.visited.len() {
            let len = self.m.visited.len();
            self.m.visited.reserve_exact(visited_len - len);
            for _ in 0..(visited_len - len) {
                self.m.visited.push(0);
            }
        }
    }

    /// Start backtracking at the given position in the input, but also look
    /// for literal prefixes.
    fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
        self.clear();
        // If this is an anchored regex at the beginning of the input, then
        // we're either already done or we only need to try backtracking once.
        if self.prog.is_anchored_start {
            return if !at.is_start() { false } else { self.backtrack(at) };
        }
        let mut matched = false;
        loop {
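            // If the program has literal prefixes, skip ahead to the next
            // position where a prefix could begin a match; if no such
            // position remains, there is nothing left to try.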
            if !self.prog.prefixes.is_empty() {
                at = match self.input.prefix_at(&self.prog.prefixes, at) {
                    None => break,
                    Some(at) => at,
                };
            }
            matched = self.backtrack(at) || matched;
            if matched && self.prog.matches.len() == 1 {
                return true;
            }
            if at.pos() >= end {
                break;
            }
            at = self.input.at(at.next_pos());
        }
        matched
    }

    /// The main backtracking loop starting at the given input position.
    fn backtrack(&mut self, start: InputAt) -> bool {
        // N.B. We use an explicit stack to avoid recursion.
        // To avoid excessive pushing and popping, most transitions are handled
        // in the `step` helper function, which only pushes to the stack when
        // there's a capture or a branch.
        let mut matched = false;
        self.m.jobs.push(Job::Inst { ip: 0, at: start });
        while let Some(job) = self.m.jobs.pop() {
            match job {
                Job::Inst { ip, at } => {
                    if self.step(ip, at) {
                        // Only quit if we're matching one regex.
                        // If we're matching a regex set, then mush on and
                        // try to find other matches (if we want them).
                        if self.prog.matches.len() == 1 {
                            return true;
                        }
                        matched = true;
                    }
                }
                Job::SaveRestore { slot, old_pos } => {
                    if slot < self.slots.len() {
                        self.slots[slot] = old_pos;
                    }
                }
            }
        }
        matched
    }

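    /// Runs the instruction at `ip` against the input position `at`,
    /// following single-successor transitions in place and pushing any
    /// alternatives (the second branch of a `Split`, or a capture-slot
    /// restore job) onto the job stack. Returns true as soon as a `Match`
    /// instruction is reached.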
    fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
        use crate::prog::Inst::*;
        loop {
            // This loop is an optimization to avoid constantly pushing/popping
            // from the stack. Namely, if we're pushing a job only to run it
            // next, avoid the push and just mutate `ip` (and possibly `at`)
            // in place.
            if self.has_visited(ip, at) {
                return false;
            }
            match self.prog[ip] {
                Match(slot) => {
                    if slot < self.matches.len() {
                        self.matches[slot] = true;
                    }
                    return true;
                }
                Save(ref inst) => {
                    if let Some(&old_pos) = self.slots.get(inst.slot) {
                        // If this path doesn't work out, then we save the old
                        // capture index (if one exists) in an alternate
                        // job. If the next path fails, then the alternate
                        // job is popped and the old capture index is restored.
                        self.m.jobs.push(Job::SaveRestore {
                            slot: inst.slot,
                            old_pos: old_pos,
                        });
                        self.slots[inst.slot] = Some(at.pos());
                    }
                    ip = inst.goto;
                }
                Split(ref inst) => {
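                    // Queue the second branch for later and follow the first
                    // branch immediately by mutating `ip` in place.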
                    self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
                    ip = inst.goto1;
                }
                EmptyLook(ref inst) => {
                    if self.input.is_empty_match(at, inst) {
                        ip = inst.goto;
                    } else {
                        return false;
                    }
                }
                Char(ref inst) => {
                    if inst.c == at.char() {
                        ip = inst.goto;
                        at = self.input.at(at.next_pos());
                    } else {
                        return false;
                    }
                }
                Ranges(ref inst) => {
                    if inst.matches(at.char()) {
                        ip = inst.goto;
                        at = self.input.at(at.next_pos());
                    } else {
                        return false;
                    }
                }
                Bytes(ref inst) => {
                    if let Some(b) = at.byte() {
                        if inst.matches(b) {
                            ip = inst.goto;
                            at = self.input.at(at.next_pos());
                            continue;
                        }
                    }
                    return false;
                }
            }
        }
    }

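    /// Returns true if the state `(ip, at)` has been visited before, and
    /// marks it as visited otherwise. Each state maps to a single bit in the
    /// `visited` bitset at index `ip * (input.len() + 1) + at.pos()`; the
    /// `+ 1` accounts for the position just past the end of the input.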
    fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
        let k = ip * (self.input.len() + 1) + at.pos();
        let k1 = k / BIT_SIZE;
        let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
        if self.m.visited[k1] & k2 == 0 {
            self.m.visited[k1] |= k2;
            false
        } else {
            true
        }
    }
}

fn usize_to_u32(n: usize) -> u32 {
    if (n as u64) > (::std::u32::MAX as u64) {
        panic!("BUG: {} is too big to fit into u32", n)
    }
    n as u32
}