Blame - src/pikevm.rs - platform/external/rust/crates/regex

blob: c106c76f31edf97450b43342b41bb92cbecbab6d [file] [log] [blame]

Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame^]	1	// This module implements the Pike VM. That is, it guarantees linear time
				2	// search of a regex on any text with memory use proportional to the size of
				3	// the regex.
				4	//
				5	// It is equal in power to the backtracking engine in this crate, except the
				6	// backtracking engine is typically faster on small regexes/texts at the
				7	// expense of a bigger memory footprint.
				8	//
				9	// It can do more than the DFA can (specifically, record capture locations
				10	// and execute Unicode word boundary assertions), but at a slower speed.
				11	// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
				12	// epsilon transitions. That is, the Pike VM engine can be in multiple states
				13	// at once whereas the DFA is only ever in one state at a time.
				14	//
				15	// Therefore, the Pike VM is generally treated as the fallback when the other
				16	// matching engines either aren't feasible to run or are insufficient.
				17
				18	use std::mem;
				19
				20	use exec::ProgramCache;
				21	use input::{Input, InputAt};
				22	use prog::{InstPtr, Program};
				23	use re_trait::Slot;
				24	use sparse::SparseSet;
				25
				26	/// An NFA simulation matching engine.
				27	#[derive(Debug)]
				28	pub struct Fsm<'r, I> {
				29	/// The sequence of opcodes (among other things) that is actually executed.
				30	///
				31	/// The program may be byte oriented or Unicode codepoint oriented.
				32	prog: &'r Program,
				33	/// An explicit stack used for following epsilon transitions. (This is
				34	/// borrowed from the cache.)
				35	stack: &'r mut Vec<FollowEpsilon>,
				36	/// The input to search.
				37	input: I,
				38	}
				39
				40	/// A cached allocation that can be reused on each execution.
				41	#[derive(Clone, Debug)]
				42	pub struct Cache {
				43	/// A pair of ordered sets for tracking NFA states.
				44	clist: Threads,
				45	nlist: Threads,
				46	/// An explicit stack used for following epsilon transitions.
				47	stack: Vec<FollowEpsilon>,
				48	}
				49
				50	/// An ordered set of NFA states and their captures.
				51	#[derive(Clone, Debug)]
				52	struct Threads {
				53	/// An ordered set of opcodes (each opcode is an NFA state).
				54	set: SparseSet,
				55	/// Captures for every NFA state.
				56	///
				57	/// It is stored in row-major order, where the columns are the capture
				58	/// slots and the rows are the states.
				59	caps: Vec<Slot>,
				60	/// The number of capture slots stored per thread. (Every capture has
				61	/// two slots.)
				62	slots_per_thread: usize,
				63	}
				64
				65	/// A representation of an explicit stack frame when following epsilon
				66	/// transitions. This is used to avoid recursion.
				67	#[derive(Clone, Debug)]
				68	enum FollowEpsilon {
				69	/// Follow transitions at the given instruction pointer.
				70	IP(InstPtr),
				71	/// Restore the capture slot with the given position in the input.
				72	Capture { slot: usize, pos: Slot },
				73	}
				74
				75	impl Cache {
				76	/// Create a new allocation used by the NFA machine to record execution
				77	/// and captures.
				78	pub fn new(_prog: &Program) -> Self {
				79	Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
				80	}
				81	}
				82
				83	impl<'r, I: Input> Fsm<'r, I> {
				84	/// Execute the NFA matching engine.
				85	///
				86	/// If there's a match, `exec` returns `true` and populates the given
				87	/// captures accordingly.
				88	pub fn exec(
				89	prog: &'r Program,
				90	cache: &ProgramCache,
				91	matches: &mut [bool],
				92	slots: &mut [Slot],
				93	quit_after_match: bool,
				94	input: I,
				95	start: usize,
				96	end: usize,
				97	) -> bool {
				98	let mut cache = cache.borrow_mut();
				99	let cache = &mut cache.pikevm;
				100	cache.clist.resize(prog.len(), prog.captures.len());
				101	cache.nlist.resize(prog.len(), prog.captures.len());
				102	let at = input.at(start);
				103	Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_(
				104	&mut cache.clist,
				105	&mut cache.nlist,
				106	matches,
				107	slots,
				108	quit_after_match,
				109	at,
				110	end,
				111	)
				112	}
				113
				114	fn exec_(
				115	&mut self,
				116	mut clist: &mut Threads,
				117	mut nlist: &mut Threads,
				118	matches: &mut [bool],
				119	slots: &mut [Slot],
				120	quit_after_match: bool,
				121	mut at: InputAt,
				122	end: usize,
				123	) -> bool {
				124	let mut matched = false;
				125	let mut all_matched = false;
				126	clist.set.clear();
				127	nlist.set.clear();
				128	'LOOP: loop {
				129	if clist.set.is_empty() {
				130	// Three ways to bail out when our current set of threads is
				131	// empty.
				132	//
				133	// 1. We have a match---so we're done exploring any possible
				134	// alternatives. Time to quit. (We can't do this if we're
				135	// looking for matches for multiple regexes, unless we know
				136	// they all matched.)
				137	//
				138	// 2. If the expression starts with a '^' we can terminate as
				139	// soon as the last thread dies.
				140	if (matched && matches.len() <= 1)
				141	\|\| all_matched
				142	\|\| (!at.is_start() && self.prog.is_anchored_start)
				143	{
				144	break;
				145	}
				146
				147	// 3. If there's a literal prefix for the program, try to
				148	// jump ahead quickly. If it can't be found, then we can
				149	// bail out early.
				150	if !self.prog.prefixes.is_empty() {
				151	at = match self.input.prefix_at(&self.prog.prefixes, at) {
				152	None => break,
				153	Some(at) => at,
				154	};
				155	}
				156	}
				157
				158	// This simulates a preceding '.*?' for every regex by adding
				159	// a state starting at the current position in the input for the
				160	// beginning of the program only if we don't already have a match.
				161	if clist.set.is_empty()
				162	\|\| (!self.prog.is_anchored_start && !all_matched)
				163	{
				164	self.add(&mut clist, slots, 0, at);
				165	}
				166	// The previous call to "add" actually inspects the position just
				167	// before the current character. For stepping through the machine,
				168	// we can to look at the current character, so we advance the
				169	// input.
				170	let at_next = self.input.at(at.next_pos());
				171	for i in 0..clist.set.len() {
				172	let ip = clist.set[i];
				173	if self.step(
				174	&mut nlist,
				175	matches,
				176	slots,
				177	clist.caps(ip),
				178	ip,
				179	at,
				180	at_next,
				181	) {
				182	matched = true;
				183	all_matched = all_matched \|\| matches.iter().all(\|&b\| b);
				184	if quit_after_match {
				185	// If we only care if a match occurs (not its
				186	// position), then we can quit right now.
				187	break 'LOOP;
				188	}
				189	if self.prog.matches.len() == 1 {
				190	// We don't need to check the rest of the threads
				191	// in this set because we've matched something
				192	// ("leftmost-first"). However, we still need to check
				193	// threads in the next set to support things like
				194	// greedy matching.
				195	//
				196	// This is only true on normal regexes. For regex sets,
				197	// we need to mush on to observe other matches.
				198	break;
				199	}
				200	}
				201	}
				202	if at.pos() >= end {
				203	break;
				204	}
				205	at = at_next;
				206	mem::swap(clist, nlist);
				207	nlist.set.clear();
				208	}
				209	matched
				210	}
				211
				212	/// Step through the input, one token (byte or codepoint) at a time.
				213	///
				214	/// nlist is the set of states that will be processed on the next token
				215	/// in the input.
				216	///
				217	/// caps is the set of captures passed by the caller of the NFA. They are
				218	/// written to only when a match state is visited.
				219	///
				220	/// thread_caps is the set of captures set for the current NFA state, ip.
				221	///
				222	/// at and at_next are the current and next positions in the input. at or
				223	/// at_next may be EOF.
				224	fn step(
				225	&mut self,
				226	nlist: &mut Threads,
				227	matches: &mut [bool],
				228	slots: &mut [Slot],
				229	thread_caps: &mut [Option<usize>],
				230	ip: usize,
				231	at: InputAt,
				232	at_next: InputAt,
				233	) -> bool {
				234	use prog::Inst::*;
				235	match self.prog[ip] {
				236	Match(match_slot) => {
				237	if match_slot < matches.len() {
				238	matches[match_slot] = true;
				239	}
				240	for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
				241	slot = val;
				242	}
				243	true
				244	}
				245	Char(ref inst) => {
				246	if inst.c == at.char() {
				247	self.add(nlist, thread_caps, inst.goto, at_next);
				248	}
				249	false
				250	}
				251	Ranges(ref inst) => {
				252	if inst.matches(at.char()) {
				253	self.add(nlist, thread_caps, inst.goto, at_next);
				254	}
				255	false
				256	}
				257	Bytes(ref inst) => {
				258	if let Some(b) = at.byte() {
				259	if inst.matches(b) {
				260	self.add(nlist, thread_caps, inst.goto, at_next);
				261	}
				262	}
				263	false
				264	}
				265	EmptyLook(_) \| Save(_) \| Split(_) => false,
				266	}
				267	}
				268
				269	/// Follows epsilon transitions and adds them for processing to nlist,
				270	/// starting at and including ip.
				271	fn add(
				272	&mut self,
				273	nlist: &mut Threads,
				274	thread_caps: &mut [Option<usize>],
				275	ip: usize,
				276	at: InputAt,
				277	) {
				278	self.stack.push(FollowEpsilon::IP(ip));
				279	while let Some(frame) = self.stack.pop() {
				280	match frame {
				281	FollowEpsilon::IP(ip) => {
				282	self.add_step(nlist, thread_caps, ip, at);
				283	}
				284	FollowEpsilon::Capture { slot, pos } => {
				285	thread_caps[slot] = pos;
				286	}
				287	}
				288	}
				289	}
				290
				291	/// A helper function for add that avoids excessive pushing to the stack.
				292	fn add_step(
				293	&mut self,
				294	nlist: &mut Threads,
				295	thread_caps: &mut [Option<usize>],
				296	mut ip: usize,
				297	at: InputAt,
				298	) {
				299	// Instead of pushing and popping to the stack, we mutate ip as we
				300	// traverse the set of states. We only push to the stack when we
				301	// absolutely need recursion (restoring captures or following a
				302	// branch).
				303	use prog::Inst::*;
				304	loop {
				305	// Don't visit states we've already added.
				306	if nlist.set.contains(ip) {
				307	return;
				308	}
				309	nlist.set.insert(ip);
				310	match self.prog[ip] {
				311	EmptyLook(ref inst) => {
				312	if self.input.is_empty_match(at, inst) {
				313	ip = inst.goto;
				314	}
				315	}
				316	Save(ref inst) => {
				317	if inst.slot < thread_caps.len() {
				318	self.stack.push(FollowEpsilon::Capture {
				319	slot: inst.slot,
				320	pos: thread_caps[inst.slot],
				321	});
				322	thread_caps[inst.slot] = Some(at.pos());
				323	}
				324	ip = inst.goto;
				325	}
				326	Split(ref inst) => {
				327	self.stack.push(FollowEpsilon::IP(inst.goto2));
				328	ip = inst.goto1;
				329	}
				330	Match(_) \| Char(_) \| Ranges(_) \| Bytes(_) => {
				331	let t = &mut nlist.caps(ip);
				332	for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
				333	slot = val;
				334	}
				335	return;
				336	}
				337	}
				338	}
				339	}
				340	}
				341
				342	impl Threads {
				343	fn new() -> Self {
				344	Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
				345	}
				346
				347	fn resize(&mut self, num_insts: usize, ncaps: usize) {
				348	if num_insts == self.set.capacity() {
				349	return;
				350	}
				351	self.slots_per_thread = ncaps * 2;
				352	self.set = SparseSet::new(num_insts);
				353	self.caps = vec![None; self.slots_per_thread * num_insts];
				354	}
				355
				356	fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
				357	let i = pc * self.slots_per_thread;
				358	&mut self.caps[i..i + self.slots_per_thread]
				359	}
				360	}