Blame - src/exec.rs - platform/external/rust/crates/regex

blob: 3d5a52bead1012b1d241577d2e297dbb80bd5e3b [file] [log] [blame]

Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1	use std::cell::RefCell;
				2	use std::collections::HashMap;
Elliott Hughes	ffb6030	2021-04-01 17:11:40 -0700	[diff] [blame^]	3	use std::panic::AssertUnwindSafe;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	4	use std::sync::Arc;
				5
				6	#[cfg(feature = "perf-literal")]
				7	use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
				8	use syntax::hir::literal::Literals;
				9	use syntax::hir::Hir;
				10	use syntax::ParserBuilder;
				11
				12	use backtrack;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	13	use compile::Compiler;
				14	#[cfg(feature = "perf-dfa")]
				15	use dfa;
				16	use error::Error;
				17	use input::{ByteInput, CharInput};
				18	use literal::LiteralSearcher;
				19	use pikevm;
Elliott Hughes	ffb6030	2021-04-01 17:11:40 -0700	[diff] [blame^]	20	use pool::{Pool, PoolGuard};
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	21	use prog::Program;
				22	use re_builder::RegexOptions;
				23	use re_bytes;
				24	use re_set;
				25	use re_trait::{Locations, RegularExpression, Slot};
				26	use re_unicode;
				27	use utf8::next_utf8;
				28
				29	/// `Exec` manages the execution of a regular expression.
				30	///
				31	/// In particular, this manages the various compiled forms of a single regular
				32	/// expression and the choice of which matching engine to use to execute a
				33	/// regular expression.
Haibo Huang	47619dd	2021-01-08 17:05:43 -0800	[diff] [blame]	34	#[derive(Debug)]
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	35	pub struct Exec {
				36	/// All read only state.
				37	ro: Arc<ExecReadOnly>,
Elliott Hughes	ffb6030	2021-04-01 17:11:40 -0700	[diff] [blame^]	38	/// A pool of reusable values for the various matching engines.
				39	///
				40	/// Note that boxing this value is not strictly necessary, but it is an
				41	/// easy way to ensure that T does not bloat the stack sized used by a pool
				42	/// in the case where T is big. And this turns out to be the case at the
				43	/// time of writing for regex's use of this pool. At the time of writing,
				44	/// the size of a Regex on the stack is 856 bytes. Boxing this value
				45	/// reduces that size to 16 bytes.
				46	pool: Box<Pool<ProgramCache>>,
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	47	}
				48
				49	/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
				50	/// means it is no longer Sync, but we can now avoid the overhead of
				51	/// synchronization to fetch the cache.
				52	#[derive(Debug)]
				53	pub struct ExecNoSync<'c> {
				54	/// All read only state.
				55	ro: &'c Arc<ExecReadOnly>,
				56	/// Caches for the various matching engines.
Elliott Hughes	ffb6030	2021-04-01 17:11:40 -0700	[diff] [blame^]	57	cache: PoolGuard<'c, ProgramCache>,
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	58	}
				59
				60	/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
Haibo Huang	47619dd	2021-01-08 17:05:43 -0800	[diff] [blame]	61	#[derive(Debug)]
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	62	pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
				63
				64	/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
				65	/// state is determined at compile time and never changes during search.
				66	#[derive(Debug)]
				67	struct ExecReadOnly {
				68	/// The original regular expressions given by the caller to compile.
				69	res: Vec<String>,
				70	/// A compiled program that is used in the NFA simulation and backtracking.
				71	/// It can be byte-based or Unicode codepoint based.
				72	///
				73	/// N.B. It is not possibly to make this byte-based from the public API.
				74	/// It is only used for testing byte based programs in the NFA simulations.
				75	nfa: Program,
				76	/// A compiled byte based program for DFA execution. This is only used
				77	/// if a DFA can be executed. (Currently, only word boundary assertions are
				78	/// not supported.) Note that this program contains an embedded `.*?`
				79	/// preceding the first capture group, unless the regex is anchored at the
				80	/// beginning.
				81	dfa: Program,
				82	/// The same as above, except the program is reversed (and there is no
				83	/// preceding `.*?`). This is used by the DFA to find the starting location
				84	/// of matches.
				85	dfa_reverse: Program,
				86	/// A set of suffix literals extracted from the regex.
				87	///
				88	/// Prefix literals are stored on the `Program`, since they are used inside
				89	/// the matching engines.
				90	suffixes: LiteralSearcher,
				91	/// An Aho-Corasick automaton with leftmost-first match semantics.
				92	///
				93	/// This is only set when the entire regex is a simple unanchored
				94	/// alternation of literals. We could probably use it more circumstances,
				95	/// but this is already hacky enough in this architecture.
				96	///
				97	/// N.B. We use u32 as a state ID representation under the assumption that
				98	/// if we were to exhaust the ID space, we probably would have long
				99	/// surpassed the compilation size limit.
				100	#[cfg(feature = "perf-literal")]
				101	ac: Option<AhoCorasick<u32>>,
				102	/// match_type encodes as much upfront knowledge about how we're going to
				103	/// execute a search as possible.
				104	match_type: MatchType,
				105	}
				106
				107	/// Facilitates the construction of an executor by exposing various knobs
				108	/// to control how a regex is executed and what kinds of resources it's
				109	/// permitted to use.
Haibo Huang	47619dd	2021-01-08 17:05:43 -0800	[diff] [blame]	110	// `ExecBuilder` is only public via the `internal` module, so avoid deriving
				111	// `Debug`.
				112	#[allow(missing_debug_implementations)]
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	113	pub struct ExecBuilder {
				114	options: RegexOptions,
				115	match_type: Option<MatchType>,
				116	bytes: bool,
				117	only_utf8: bool,
				118	}
				119
				120	/// Parsed represents a set of parsed regular expressions and their detected
				121	/// literals.
				122	struct Parsed {
				123	exprs: Vec<Hir>,
				124	prefixes: Literals,
				125	suffixes: Literals,
				126	bytes: bool,
				127	}
				128
				129	impl ExecBuilder {
				130	/// Create a regex execution builder.
				131	///
				132	/// This uses default settings for everything except the regex itself,
				133	/// which must be provided. Further knobs can be set by calling methods,
				134	/// and then finally, `build` to actually create the executor.
				135	pub fn new(re: &str) -> Self {
				136	Self::new_many(&[re])
				137	}
				138
				139	/// Like new, but compiles the union of the given regular expressions.
				140	///
				141	/// Note that when compiling 2 or more regular expressions, capture groups
				142	/// are completely unsupported. (This means both `find` and `captures`
				143	/// wont work.)
				144	pub fn new_many<I, S>(res: I) -> Self
				145	where
				146	S: AsRef<str>,
				147	I: IntoIterator<Item = S>,
				148	{
				149	let mut opts = RegexOptions::default();
				150	opts.pats = res.into_iter().map(\|s\| s.as_ref().to_owned()).collect();
				151	Self::new_options(opts)
				152	}
				153
				154	/// Create a regex execution builder.
				155	pub fn new_options(opts: RegexOptions) -> Self {
				156	ExecBuilder {
				157	options: opts,
				158	match_type: None,
				159	bytes: false,
				160	only_utf8: true,
				161	}
				162	}
				163
				164	/// Set the matching engine to be automatically determined.
				165	///
				166	/// This is the default state and will apply whatever optimizations are
				167	/// possible, such as running a DFA.
				168	///
				169	/// This overrides whatever was previously set via the `nfa` or
				170	/// `bounded_backtracking` methods.
				171	pub fn automatic(mut self) -> Self {
				172	self.match_type = None;
				173	self
				174	}
				175
				176	/// Sets the matching engine to use the NFA algorithm no matter what
				177	/// optimizations are possible.
				178	///
				179	/// This overrides whatever was previously set via the `automatic` or
				180	/// `bounded_backtracking` methods.
				181	pub fn nfa(mut self) -> Self {
				182	self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
				183	self
				184	}
				185
				186	/// Sets the matching engine to use a bounded backtracking engine no
				187	/// matter what optimizations are possible.
				188	///
				189	/// One must use this with care, since the bounded backtracking engine
				190	/// uses memory proportion to `len(regex) * len(text)`.
				191	///
				192	/// This overrides whatever was previously set via the `automatic` or
				193	/// `nfa` methods.
				194	pub fn bounded_backtracking(mut self) -> Self {
				195	self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
				196	self
				197	}
				198
				199	/// Compiles byte based programs for use with the NFA matching engines.
				200	///
				201	/// By default, the NFA engines match on Unicode scalar values. They can
				202	/// be made to use byte based programs instead. In general, the byte based
				203	/// programs are slower because of a less efficient encoding of character
				204	/// classes.
				205	///
				206	/// Note that this does not impact DFA matching engines, which always
				207	/// execute on bytes.
				208	pub fn bytes(mut self, yes: bool) -> Self {
				209	self.bytes = yes;
				210	self
				211	}
				212
				213	/// When disabled, the program compiled may match arbitrary bytes.
				214	///
				215	/// When enabled (the default), all compiled programs exclusively match
				216	/// valid UTF-8 bytes.
				217	pub fn only_utf8(mut self, yes: bool) -> Self {
				218	self.only_utf8 = yes;
				219	self
				220	}
				221
				222	/// Set the Unicode flag.
				223	pub fn unicode(mut self, yes: bool) -> Self {
				224	self.options.unicode = yes;
				225	self
				226	}
				227
				228	/// Parse the current set of patterns into their AST and extract literals.
				229	fn parse(&self) -> Result<Parsed, Error> {
				230	let mut exprs = Vec::with_capacity(self.options.pats.len());
				231	let mut prefixes = Some(Literals::empty());
				232	let mut suffixes = Some(Literals::empty());
				233	let mut bytes = false;
				234	let is_set = self.options.pats.len() > 1;
				235	// If we're compiling a regex set and that set has any anchored
				236	// expressions, then disable all literal optimizations.
				237	for pat in &self.options.pats {
				238	let mut parser = ParserBuilder::new()
				239	.octal(self.options.octal)
				240	.case_insensitive(self.options.case_insensitive)
				241	.multi_line(self.options.multi_line)
				242	.dot_matches_new_line(self.options.dot_matches_new_line)
				243	.swap_greed(self.options.swap_greed)
				244	.ignore_whitespace(self.options.ignore_whitespace)
				245	.unicode(self.options.unicode)
				246	.allow_invalid_utf8(!self.only_utf8)
				247	.nest_limit(self.options.nest_limit)
				248	.build();
				249	let expr =
				250	parser.parse(pat).map_err(\|e\| Error::Syntax(e.to_string()))?;
				251	bytes = bytes \|\| !expr.is_always_utf8();
				252
				253	if cfg!(feature = "perf-literal") {
				254	if !expr.is_anchored_start() && expr.is_any_anchored_start() {
				255	// Partial anchors unfortunately make it hard to use
				256	// prefixes, so disable them.
				257	prefixes = None;
				258	} else if is_set && expr.is_anchored_start() {
				259	// Regex sets with anchors do not go well with literal
				260	// optimizations.
				261	prefixes = None;
				262	}
				263	prefixes = prefixes.and_then(\|mut prefixes\| {
				264	if !prefixes.union_prefixes(&expr) {
				265	None
				266	} else {
				267	Some(prefixes)
				268	}
				269	});
				270
				271	if !expr.is_anchored_end() && expr.is_any_anchored_end() {
				272	// Partial anchors unfortunately make it hard to use
				273	// suffixes, so disable them.
				274	suffixes = None;
				275	} else if is_set && expr.is_anchored_end() {
				276	// Regex sets with anchors do not go well with literal
				277	// optimizations.
				278	suffixes = None;
				279	}
				280	suffixes = suffixes.and_then(\|mut suffixes\| {
				281	if !suffixes.union_suffixes(&expr) {
				282	None
				283	} else {
				284	Some(suffixes)
				285	}
				286	});
				287	}
				288	exprs.push(expr);
				289	}
				290	Ok(Parsed {
				291	exprs: exprs,
				292	prefixes: prefixes.unwrap_or_else(Literals::empty),
				293	suffixes: suffixes.unwrap_or_else(Literals::empty),
				294	bytes: bytes,
				295	})
				296	}
				297
				298	/// Build an executor that can run a regular expression.
				299	pub fn build(self) -> Result<Exec, Error> {
				300	// Special case when we have no patterns to compile.
				301	// This can happen when compiling a regex set.
				302	if self.options.pats.is_empty() {
				303	let ro = Arc::new(ExecReadOnly {
				304	res: vec![],
				305	nfa: Program::new(),
				306	dfa: Program::new(),
				307	dfa_reverse: Program::new(),
				308	suffixes: LiteralSearcher::empty(),
				309	#[cfg(feature = "perf-literal")]
				310	ac: None,
				311	match_type: MatchType::Nothing,
				312	});
Elliott Hughes	ffb6030	2021-04-01 17:11:40 -0700	[diff] [blame^]	313	let pool = ExecReadOnly::new_pool(&ro);
				314	return Ok(Exec { ro: ro, pool });
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	315	}
				316	let parsed = self.parse()?;
				317	let mut nfa = Compiler::new()
				318	.size_limit(self.options.size_limit)
				319	.bytes(self.bytes \|\| parsed.bytes)
				320	.only_utf8(self.only_utf8)
				321	.compile(&parsed.exprs)?;
				322	let mut dfa = Compiler::new()
				323	.size_limit(self.options.size_limit)
				324	.dfa(true)
				325	.only_utf8(self.only_utf8)
				326	.compile(&parsed.exprs)?;
				327	let mut dfa_reverse = Compiler::new()
				328	.size_limit(self.options.size_limit)
				329	.dfa(true)
				330	.only_utf8(self.only_utf8)
				331	.reverse(true)
				332	.compile(&parsed.exprs)?;
				333
				334	#[cfg(feature = "perf-literal")]
				335	let ac = self.build_aho_corasick(&parsed);
				336	nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
				337	dfa.prefixes = nfa.prefixes.clone();
				338	dfa.dfa_size_limit = self.options.dfa_size_limit;
				339	dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
				340
				341	let mut ro = ExecReadOnly {
				342	res: self.options.pats,
				343	nfa: nfa,
				344	dfa: dfa,
				345	dfa_reverse: dfa_reverse,
				346	suffixes: LiteralSearcher::suffixes(parsed.suffixes),
				347	#[cfg(feature = "perf-literal")]
				348	ac: ac,
				349	match_type: MatchType::Nothing,
				350	};
				351	ro.match_type = ro.choose_match_type(self.match_type);
				352
				353	let ro = Arc::new(ro);
Elliott Hughes	ffb6030	2021-04-01 17:11:40 -0700	[diff] [blame^]	354	let pool = ExecReadOnly::new_pool(&ro);
				355	Ok(Exec { ro, pool })
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	356	}
				357
				358	#[cfg(feature = "perf-literal")]
				359	fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
				360	if parsed.exprs.len() != 1 {
				361	return None;
				362	}
				363	let lits = match alternation_literals(&parsed.exprs[0]) {
				364	None => return None,
				365	Some(lits) => lits,
				366	};
				367	// If we have a small number of literals, then let Teddy handle
				368	// things (see literal/mod.rs).
				369	if lits.len() <= 32 {
				370	return None;
				371	}
				372	Some(
				373	AhoCorasickBuilder::new()
				374	.match_kind(MatchKind::LeftmostFirst)
				375	.auto_configure(&lits)
				376	// We always want this to reduce size, regardless
				377	// of what auto-configure does.
				378	.byte_classes(true)
				379	.build_with_size::<u32, _, _>(&lits)
				380	// This should never happen because we'd long exceed the
				381	// compilation limit for regexes first.
				382	.expect("AC automaton too big"),
				383	)
				384	}
				385	}
				386
				387	impl<'c> RegularExpression for ExecNoSyncStr<'c> {
				388	type Text = str;
				389
				390	fn slots_len(&self) -> usize {
				391	self.0.slots_len()
				392	}
				393
				394	fn next_after_empty(&self, text: &str, i: usize) -> usize {
				395	next_utf8(text.as_bytes(), i)
				396	}
				397
				398	#[cfg_attr(feature = "perf-inline", inline(always))]
				399	fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
				400	self.0.shortest_match_at(text.as_bytes(), start)
				401	}
				402
				403	#[cfg_attr(feature = "perf-inline", inline(always))]
				404	fn is_match_at(&self, text: &str, start: usize) -> bool {
				405	self.0.is_match_at(text.as_bytes(), start)
				406	}
				407
				408	#[cfg_attr(feature = "perf-inline", inline(always))]
				409	fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
				410	self.0.find_at(text.as_bytes(), start)
				411	}
				412
				413	#[cfg_attr(feature = "perf-inline", inline(always))]
				414	fn captures_read_at(
				415	&self,
				416	locs: &mut Locations,
				417	text: &str,
				418	start: usize,
				419	) -> Option<(usize, usize)> {
				420	self.0.captures_read_at(locs, text.as_bytes(), start)
				421	}
				422	}
				423
				424	impl<'c> RegularExpression for ExecNoSync<'c> {
				425	type Text = [u8];
				426
				427	/// Returns the number of capture slots in the regular expression. (There
				428	/// are two slots for every capture group, corresponding to possibly empty
				429	/// start and end locations of the capture.)
				430	fn slots_len(&self) -> usize {
				431	self.ro.nfa.captures.len() * 2
				432	}
				433
				434	fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
				435	i + 1
				436	}
				437
				438	/// Returns the end of a match location, possibly occurring before the
				439	/// end location of the correct leftmost-first match.
				440	#[cfg_attr(feature = "perf-inline", inline(always))]
				441	fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
				442	if !self.is_anchor_end_match(text) {
				443	return None;
				444	}
				445	match self.ro.match_type {
				446	#[cfg(feature = "perf-literal")]
				447	MatchType::Literal(ty) => {
				448	self.find_literals(ty, text, start).map(\|(_, e)\| e)
				449	}
				450	#[cfg(feature = "perf-dfa")]
				451	MatchType::Dfa \| MatchType::DfaMany => {
				452	match self.shortest_dfa(text, start) {
				453	dfa::Result::Match(end) => Some(end),
				454	dfa::Result::NoMatch(_) => None,
				455	dfa::Result::Quit => self.shortest_nfa(text, start),
				456	}
				457	}
				458	#[cfg(feature = "perf-dfa")]
				459	MatchType::DfaAnchoredReverse => {
				460	match dfa::Fsm::reverse(
				461	&self.ro.dfa_reverse,
				462	self.cache.value(),
				463	true,
				464	&text[start..],
				465	text.len(),
				466	) {
				467	dfa::Result::Match(_) => Some(text.len()),
				468	dfa::Result::NoMatch(_) => None,
				469	dfa::Result::Quit => self.shortest_nfa(text, start),
				470	}
				471	}
				472	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
				473	MatchType::DfaSuffix => {
				474	match self.shortest_dfa_reverse_suffix(text, start) {
				475	dfa::Result::Match(e) => Some(e),
				476	dfa::Result::NoMatch(_) => None,
				477	dfa::Result::Quit => self.shortest_nfa(text, start),
				478	}
				479	}
				480	MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
				481	MatchType::Nothing => None,
				482	}
				483	}
				484
				485	/// Returns true if and only if the regex matches text.
				486	///
				487	/// For single regular expressions, this is equivalent to calling
				488	/// shortest_match(...).is_some().
				489	#[cfg_attr(feature = "perf-inline", inline(always))]
				490	fn is_match_at(&self, text: &[u8], start: usize) -> bool {
				491	if !self.is_anchor_end_match(text) {
				492	return false;
				493	}
				494	// We need to do this dance because shortest_match relies on the NFA
				495	// filling in captures[1], but a RegexSet has no captures. In other
				496	// words, a RegexSet can't (currently) use shortest_match. ---AG
				497	match self.ro.match_type {
				498	#[cfg(feature = "perf-literal")]
				499	MatchType::Literal(ty) => {
				500	self.find_literals(ty, text, start).is_some()
				501	}
				502	#[cfg(feature = "perf-dfa")]
				503	MatchType::Dfa \| MatchType::DfaMany => {
				504	match self.shortest_dfa(text, start) {
				505	dfa::Result::Match(_) => true,
				506	dfa::Result::NoMatch(_) => false,
				507	dfa::Result::Quit => self.match_nfa(text, start),
				508	}
				509	}
				510	#[cfg(feature = "perf-dfa")]
				511	MatchType::DfaAnchoredReverse => {
				512	match dfa::Fsm::reverse(
				513	&self.ro.dfa_reverse,
				514	self.cache.value(),
				515	true,
				516	&text[start..],
				517	text.len(),
				518	) {
				519	dfa::Result::Match(_) => true,
				520	dfa::Result::NoMatch(_) => false,
				521	dfa::Result::Quit => self.match_nfa(text, start),
				522	}
				523	}
				524	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
				525	MatchType::DfaSuffix => {
				526	match self.shortest_dfa_reverse_suffix(text, start) {
				527	dfa::Result::Match(_) => true,
				528	dfa::Result::NoMatch(_) => false,
				529	dfa::Result::Quit => self.match_nfa(text, start),
				530	}
				531	}
				532	MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
				533	MatchType::Nothing => false,
				534	}
				535	}
				536
				537	/// Finds the start and end location of the leftmost-first match, starting
				538	/// at the given location.
				539	#[cfg_attr(feature = "perf-inline", inline(always))]
				540	fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
				541	if !self.is_anchor_end_match(text) {
				542	return None;
				543	}
				544	match self.ro.match_type {
				545	#[cfg(feature = "perf-literal")]
				546	MatchType::Literal(ty) => self.find_literals(ty, text, start),
				547	#[cfg(feature = "perf-dfa")]
				548	MatchType::Dfa => match self.find_dfa_forward(text, start) {
				549	dfa::Result::Match((s, e)) => Some((s, e)),
				550	dfa::Result::NoMatch(_) => None,
				551	dfa::Result::Quit => {
				552	self.find_nfa(MatchNfaType::Auto, text, start)
				553	}
				554	},
				555	#[cfg(feature = "perf-dfa")]
				556	MatchType::DfaAnchoredReverse => {
				557	match self.find_dfa_anchored_reverse(text, start) {
				558	dfa::Result::Match((s, e)) => Some((s, e)),
				559	dfa::Result::NoMatch(_) => None,
				560	dfa::Result::Quit => {
				561	self.find_nfa(MatchNfaType::Auto, text, start)
				562	}
				563	}
				564	}
				565	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
				566	MatchType::DfaSuffix => {
				567	match self.find_dfa_reverse_suffix(text, start) {
				568	dfa::Result::Match((s, e)) => Some((s, e)),
				569	dfa::Result::NoMatch(_) => None,
				570	dfa::Result::Quit => {
				571	self.find_nfa(MatchNfaType::Auto, text, start)
				572	}
				573	}
				574	}
				575	MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
				576	MatchType::Nothing => None,
				577	#[cfg(feature = "perf-dfa")]
				578	MatchType::DfaMany => {
				579	unreachable!("BUG: RegexSet cannot be used with find")
				580	}
				581	}
				582	}
				583
				584	/// Finds the start and end location of the leftmost-first match and also
				585	/// fills in all matching capture groups.
				586	///
				587	/// The number of capture slots given should be equal to the total number
				588	/// of capture slots in the compiled program.
				589	///
				590	/// Note that the first two slots always correspond to the start and end
				591	/// locations of the overall match.
				592	fn captures_read_at(
				593	&self,
				594	locs: &mut Locations,
				595	text: &[u8],
				596	start: usize,
				597	) -> Option<(usize, usize)> {
				598	let slots = locs.as_slots();
				599	for slot in slots.iter_mut() {
				600	*slot = None;
				601	}
				602	// If the caller unnecessarily uses this, then we try to save them
				603	// from themselves.
				604	match slots.len() {
				605	0 => return self.find_at(text, start),
				606	2 => {
				607	return self.find_at(text, start).map(\|(s, e)\| {
				608	slots[0] = Some(s);
				609	slots[1] = Some(e);
				610	(s, e)
				611	});
				612	}
				613	_ => {} // fallthrough
				614	}
				615	if !self.is_anchor_end_match(text) {
				616	return None;
				617	}
				618	match self.ro.match_type {
				619	#[cfg(feature = "perf-literal")]
				620	MatchType::Literal(ty) => {
				621	self.find_literals(ty, text, start).and_then(\|(s, e)\| {
				622	self.captures_nfa_type(
				623	MatchNfaType::Auto,
				624	slots,
				625	text,
				626	s,
				627	e,
				628	)
				629	})
				630	}
				631	#[cfg(feature = "perf-dfa")]
				632	MatchType::Dfa => {
				633	if self.ro.nfa.is_anchored_start {
				634	self.captures_nfa(slots, text, start)
				635	} else {
				636	match self.find_dfa_forward(text, start) {
				637	dfa::Result::Match((s, e)) => self.captures_nfa_type(
				638	MatchNfaType::Auto,
				639	slots,
				640	text,
				641	s,
				642	e,
				643	),
				644	dfa::Result::NoMatch(_) => None,
				645	dfa::Result::Quit => {
				646	self.captures_nfa(slots, text, start)
				647	}
				648	}
				649	}
				650	}
				651	#[cfg(feature = "perf-dfa")]
				652	MatchType::DfaAnchoredReverse => {
				653	match self.find_dfa_anchored_reverse(text, start) {
				654	dfa::Result::Match((s, e)) => self.captures_nfa_type(
				655	MatchNfaType::Auto,
				656	slots,
				657	text,
				658	s,
				659	e,
				660	),
				661	dfa::Result::NoMatch(_) => None,
				662	dfa::Result::Quit => self.captures_nfa(slots, text, start),
				663	}
				664	}
				665	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
				666	MatchType::DfaSuffix => {
				667	match self.find_dfa_reverse_suffix(text, start) {
				668	dfa::Result::Match((s, e)) => self.captures_nfa_type(
				669	MatchNfaType::Auto,
				670	slots,
				671	text,
				672	s,
				673	e,
				674	),
				675	dfa::Result::NoMatch(_) => None,
				676	dfa::Result::Quit => self.captures_nfa(slots, text, start),
				677	}
				678	}
				679	MatchType::Nfa(ty) => {
				680	self.captures_nfa_type(ty, slots, text, start, text.len())
				681	}
				682	MatchType::Nothing => None,
				683	#[cfg(feature = "perf-dfa")]
				684	MatchType::DfaMany => {
				685	unreachable!("BUG: RegexSet cannot be used with captures")
				686	}
				687	}
				688	}
				689	}
				690
				691	impl<'c> ExecNoSync<'c> {
				692	/// Finds the leftmost-first match using only literal search.
				693	#[cfg(feature = "perf-literal")]
				694	#[cfg_attr(feature = "perf-inline", inline(always))]
				695	fn find_literals(
				696	&self,
				697	ty: MatchLiteralType,
				698	text: &[u8],
				699	start: usize,
				700	) -> Option<(usize, usize)> {
				701	use self::MatchLiteralType::*;
				702	match ty {
				703	Unanchored => {
				704	let lits = &self.ro.nfa.prefixes;
				705	lits.find(&text[start..]).map(\|(s, e)\| (start + s, start + e))
				706	}
				707	AnchoredStart => {
				708	let lits = &self.ro.nfa.prefixes;
				709	if start == 0 \|\| !self.ro.nfa.is_anchored_start {
				710	lits.find_start(&text[start..])
				711	.map(\|(s, e)\| (start + s, start + e))
				712	} else {
				713	None
				714	}
				715	}
				716	AnchoredEnd => {
				717	let lits = &self.ro.suffixes;
				718	lits.find_end(&text[start..])
				719	.map(\|(s, e)\| (start + s, start + e))
				720	}
				721	AhoCorasick => self
				722	.ro
				723	.ac
				724	.as_ref()
				725	.unwrap()
				726	.find(&text[start..])
				727	.map(\|m\| (start + m.start(), start + m.end())),
				728	}
				729	}
				730
				731	/// Finds the leftmost-first match (start and end) using only the DFA.
				732	///
				733	/// If the result returned indicates that the DFA quit, then another
				734	/// matching engine should be used.
				735	#[cfg(feature = "perf-dfa")]
				736	#[cfg_attr(feature = "perf-inline", inline(always))]
				737	fn find_dfa_forward(
				738	&self,
				739	text: &[u8],
				740	start: usize,
				741	) -> dfa::Result<(usize, usize)> {
				742	use dfa::Result::*;
				743	let end = match dfa::Fsm::forward(
				744	&self.ro.dfa,
				745	self.cache.value(),
				746	false,
				747	text,
				748	start,
				749	) {
				750	NoMatch(i) => return NoMatch(i),
				751	Quit => return Quit,
				752	Match(end) if start == end => return Match((start, start)),
				753	Match(end) => end,
				754	};
				755	// Now run the DFA in reverse to find the start of the match.
				756	match dfa::Fsm::reverse(
				757	&self.ro.dfa_reverse,
				758	self.cache.value(),
				759	false,
				760	&text[start..],
				761	end - start,
				762	) {
				763	Match(s) => Match((start + s, end)),
				764	NoMatch(i) => NoMatch(i),
				765	Quit => Quit,
				766	}
				767	}
				768
				769	/// Finds the leftmost-first match (start and end) using only the DFA,
				770	/// but assumes the regex is anchored at the end and therefore starts at
				771	/// the end of the regex and matches in reverse.
				772	///
				773	/// If the result returned indicates that the DFA quit, then another
				774	/// matching engine should be used.
				775	#[cfg(feature = "perf-dfa")]
				776	#[cfg_attr(feature = "perf-inline", inline(always))]
				777	fn find_dfa_anchored_reverse(
				778	&self,
				779	text: &[u8],
				780	start: usize,
				781	) -> dfa::Result<(usize, usize)> {
				782	use dfa::Result::*;
				783	match dfa::Fsm::reverse(
				784	&self.ro.dfa_reverse,
				785	self.cache.value(),
				786	false,
				787	&text[start..],
				788	text.len() - start,
				789	) {
				790	Match(s) => Match((start + s, text.len())),
				791	NoMatch(i) => NoMatch(i),
				792	Quit => Quit,
				793	}
				794	}
				795
				796	/// Finds the end of the shortest match using only the DFA.
				797	#[cfg(feature = "perf-dfa")]
				798	#[cfg_attr(feature = "perf-inline", inline(always))]
				799	fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
				800	dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
				801	}
				802
				803	/// Finds the end of the shortest match using only the DFA by scanning for
				804	/// suffix literals.
				805	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
				806	#[cfg_attr(feature = "perf-inline", inline(always))]
				807	fn shortest_dfa_reverse_suffix(
				808	&self,
				809	text: &[u8],
				810	start: usize,
				811	) -> dfa::Result<usize> {
				812	match self.exec_dfa_reverse_suffix(text, start) {
				813	None => self.shortest_dfa(text, start),
				814	Some(r) => r.map(\|(_, end)\| end),
				815	}
				816	}
				817
				818	/// Finds the end of the shortest match using only the DFA by scanning for
				819	/// suffix literals. It also reports the start of the match.
				820	///
				821	/// Note that if None is returned, then the optimization gave up to avoid
				822	/// worst case quadratic behavior. A forward scanning DFA should be tried
				823	/// next.
				824	///
				825	/// If a match is returned and the full leftmost-first match is desired,
				826	/// then a forward scan starting from the beginning of the match must be
				827	/// done.
				828	///
				829	/// If the result returned indicates that the DFA quit, then another
				830	/// matching engine should be used.
				831	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
				832	#[cfg_attr(feature = "perf-inline", inline(always))]
				833	fn exec_dfa_reverse_suffix(
				834	&self,
				835	text: &[u8],
				836	original_start: usize,
				837	) -> Option<dfa::Result<(usize, usize)>> {
				838	use dfa::Result::*;
				839
				840	let lcs = self.ro.suffixes.lcs();
				841	debug_assert!(lcs.len() >= 1);
				842	let mut start = original_start;
				843	let mut end = start;
				844	let mut last_literal = start;
				845	while end <= text.len() {
				846	last_literal += match lcs.find(&text[last_literal..]) {
				847	None => return Some(NoMatch(text.len())),
				848	Some(i) => i,
				849	};
				850	end = last_literal + lcs.len();
				851	match dfa::Fsm::reverse(
				852	&self.ro.dfa_reverse,
				853	self.cache.value(),
				854	false,
				855	&text[start..end],
				856	end - start,
				857	) {
				858	Match(0) \| NoMatch(0) => return None,
				859	Match(i) => return Some(Match((start + i, end))),
				860	NoMatch(i) => {
				861	start += i;
				862	last_literal += 1;
				863	continue;
				864	}
				865	Quit => return Some(Quit),
				866	};
				867	}
				868	Some(NoMatch(text.len()))
				869	}
				870
				871	/// Finds the leftmost-first match (start and end) using only the DFA
				872	/// by scanning for suffix literals.
				873	///
				874	/// If the result returned indicates that the DFA quit, then another
				875	/// matching engine should be used.
				876	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
				877	#[cfg_attr(feature = "perf-inline", inline(always))]
				878	fn find_dfa_reverse_suffix(
				879	&self,
				880	text: &[u8],
				881	start: usize,
				882	) -> dfa::Result<(usize, usize)> {
				883	use dfa::Result::*;
				884
				885	let match_start = match self.exec_dfa_reverse_suffix(text, start) {
				886	None => return self.find_dfa_forward(text, start),
				887	Some(Match((start, _))) => start,
				888	Some(r) => return r,
				889	};
				890	// At this point, we've found a match. The only way to quit now
				891	// without a match is if the DFA gives up (seems unlikely).
				892	//
				893	// Now run the DFA forwards to find the proper end of the match.
				894	// (The suffix literal match can only indicate the earliest
				895	// possible end location, which may appear before the end of the
				896	// leftmost-first match.)
				897	match dfa::Fsm::forward(
				898	&self.ro.dfa,
				899	self.cache.value(),
				900	false,
				901	text,
				902	match_start,
				903	) {
				904	NoMatch(_) => panic!("BUG: reverse match implies forward match"),
				905	Quit => Quit,
				906	Match(e) => Match((match_start, e)),
				907	}
				908	}
				909
				910	/// Executes the NFA engine to return whether there is a match or not.
				911	///
				912	/// Ideally, we could use shortest_nfa(...).is_some() and get the same
				913	/// performance characteristics, but regex sets don't have captures, which
				914	/// shortest_nfa depends on.
				915	#[cfg(feature = "perf-dfa")]
				916	fn match_nfa(&self, text: &[u8], start: usize) -> bool {
				917	self.match_nfa_type(MatchNfaType::Auto, text, start)
				918	}
				919
				920	/// Like match_nfa, but allows specification of the type of NFA engine.
				921	fn match_nfa_type(
				922	&self,
				923	ty: MatchNfaType,
				924	text: &[u8],
				925	start: usize,
				926	) -> bool {
				927	self.exec_nfa(
				928	ty,
				929	&mut [false],
				930	&mut [],
				931	true,
				932	false,
				933	text,
				934	start,
				935	text.len(),
				936	)
				937	}
				938
				939	/// Finds the shortest match using an NFA.
				940	#[cfg(feature = "perf-dfa")]
				941	fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
				942	self.shortest_nfa_type(MatchNfaType::Auto, text, start)
				943	}
				944
				945	/// Like shortest_nfa, but allows specification of the type of NFA engine.
				946	fn shortest_nfa_type(
				947	&self,
				948	ty: MatchNfaType,
				949	text: &[u8],
				950	start: usize,
				951	) -> Option<usize> {
				952	let mut slots = [None, None];
				953	if self.exec_nfa(
				954	ty,
				955	&mut [false],
				956	&mut slots,
				957	true,
				958	true,
				959	text,
				960	start,
				961	text.len(),
				962	) {
				963	slots[1]
				964	} else {
				965	None
				966	}
				967	}
				968
				969	/// Like find, but executes an NFA engine.
				970	fn find_nfa(
				971	&self,
				972	ty: MatchNfaType,
				973	text: &[u8],
				974	start: usize,
				975	) -> Option<(usize, usize)> {
				976	let mut slots = [None, None];
				977	if self.exec_nfa(
				978	ty,
				979	&mut [false],
				980	&mut slots,
				981	false,
				982	false,
				983	text,
				984	start,
				985	text.len(),
				986	) {
				987	match (slots[0], slots[1]) {
				988	(Some(s), Some(e)) => Some((s, e)),
				989	_ => None,
				990	}
				991	} else {
				992	None
				993	}
				994	}
				995
				996	/// Like find_nfa, but fills in captures.
				997	///
				998	/// `slots` should have length equal to `2 * nfa.captures.len()`.
				999	#[cfg(feature = "perf-dfa")]
				1000	fn captures_nfa(
				1001	&self,
				1002	slots: &mut [Slot],
				1003	text: &[u8],
				1004	start: usize,
				1005	) -> Option<(usize, usize)> {
				1006	self.captures_nfa_type(
				1007	MatchNfaType::Auto,
				1008	slots,
				1009	text,
				1010	start,
				1011	text.len(),
				1012	)
				1013	}
				1014
				1015	/// Like captures_nfa, but allows specification of type of NFA engine.
				1016	fn captures_nfa_type(
				1017	&self,
				1018	ty: MatchNfaType,
				1019	slots: &mut [Slot],
				1020	text: &[u8],
				1021	start: usize,
				1022	end: usize,
				1023	) -> Option<(usize, usize)> {
				1024	if self.exec_nfa(
				1025	ty,
				1026	&mut [false],
				1027	slots,
				1028	false,
				1029	false,
				1030	text,
				1031	start,
				1032	end,
				1033	) {
				1034	match (slots[0], slots[1]) {
				1035	(Some(s), Some(e)) => Some((s, e)),
				1036	_ => None,
				1037	}
				1038	} else {
				1039	None
				1040	}
				1041	}
				1042
				1043	fn exec_nfa(
				1044	&self,
				1045	mut ty: MatchNfaType,
				1046	matches: &mut [bool],
				1047	slots: &mut [Slot],
				1048	quit_after_match: bool,
				1049	quit_after_match_with_pos: bool,
				1050	text: &[u8],
				1051	start: usize,
				1052	end: usize,
				1053	) -> bool {
				1054	use self::MatchNfaType::*;
				1055	if let Auto = ty {
				1056	if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
				1057	ty = Backtrack;
				1058	} else {
				1059	ty = PikeVM;
				1060	}
				1061	}
				1062	// The backtracker can't return the shortest match position as it is
				1063	// implemented today. So if someone calls `shortest_match` and we need
				1064	// to run an NFA, then use the PikeVM.
				1065	if quit_after_match_with_pos \|\| ty == PikeVM {
				1066	self.exec_pikevm(
				1067	matches,
				1068	slots,
				1069	quit_after_match,
				1070	text,
				1071	start,
				1072	end,
				1073	)
				1074	} else {
				1075	self.exec_backtrack(matches, slots, text, start, end)
				1076	}
				1077	}
				1078
				1079	/// Always run the NFA algorithm.
				1080	fn exec_pikevm(
				1081	&self,
				1082	matches: &mut [bool],
				1083	slots: &mut [Slot],
				1084	quit_after_match: bool,
				1085	text: &[u8],
				1086	start: usize,
				1087	end: usize,
				1088	) -> bool {
				1089	if self.ro.nfa.uses_bytes() {
				1090	pikevm::Fsm::exec(
				1091	&self.ro.nfa,
				1092	self.cache.value(),
				1093	matches,
				1094	slots,
				1095	quit_after_match,
				1096	ByteInput::new(text, self.ro.nfa.only_utf8),
				1097	start,
				1098	end,
				1099	)
				1100	} else {
				1101	pikevm::Fsm::exec(
				1102	&self.ro.nfa,
				1103	self.cache.value(),
				1104	matches,
				1105	slots,
				1106	quit_after_match,
				1107	CharInput::new(text),
				1108	start,
				1109	end,
				1110	)
				1111	}
				1112	}
				1113
				1114	/// Always runs the NFA using bounded backtracking.
				1115	fn exec_backtrack(
				1116	&self,
				1117	matches: &mut [bool],
				1118	slots: &mut [Slot],
				1119	text: &[u8],
				1120	start: usize,
				1121	end: usize,
				1122	) -> bool {
				1123	if self.ro.nfa.uses_bytes() {
				1124	backtrack::Bounded::exec(
				1125	&self.ro.nfa,
				1126	self.cache.value(),
				1127	matches,
				1128	slots,
				1129	ByteInput::new(text, self.ro.nfa.only_utf8),
				1130	start,
				1131	end,
				1132	)
				1133	} else {
				1134	backtrack::Bounded::exec(
				1135	&self.ro.nfa,
				1136	self.cache.value(),
				1137	matches,
				1138	slots,
				1139	CharInput::new(text),
				1140	start,
				1141	end,
				1142	)
				1143	}
				1144	}
				1145
				1146	/// Finds which regular expressions match the given text.
				1147	///
				1148	/// `matches` should have length equal to the number of regexes being
				1149	/// searched.
				1150	///
				1151	/// This is only useful when one wants to know which regexes in a set
				1152	/// match some text.
				1153	pub fn many_matches_at(
				1154	&self,
				1155	matches: &mut [bool],
				1156	text: &[u8],
				1157	start: usize,
				1158	) -> bool {
				1159	use self::MatchType::*;
				1160	if !self.is_anchor_end_match(text) {
				1161	return false;
				1162	}
				1163	match self.ro.match_type {
				1164	#[cfg(feature = "perf-literal")]
				1165	Literal(ty) => {
				1166	debug_assert_eq!(matches.len(), 1);
				1167	matches[0] = self.find_literals(ty, text, start).is_some();
				1168	matches[0]
				1169	}
				1170	#[cfg(feature = "perf-dfa")]
				1171	Dfa \| DfaAnchoredReverse \| DfaMany => {
				1172	match dfa::Fsm::forward_many(
				1173	&self.ro.dfa,
				1174	self.cache.value(),
				1175	matches,
				1176	text,
				1177	start,
				1178	) {
				1179	dfa::Result::Match(_) => true,
				1180	dfa::Result::NoMatch(_) => false,
				1181	dfa::Result::Quit => self.exec_nfa(
				1182	MatchNfaType::Auto,
				1183	matches,
				1184	&mut [],
				1185	false,
				1186	false,
				1187	text,
				1188	start,
				1189	text.len(),
				1190	),
				1191	}
				1192	}
				1193	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
				1194	DfaSuffix => {
				1195	match dfa::Fsm::forward_many(
				1196	&self.ro.dfa,
				1197	self.cache.value(),
				1198	matches,
				1199	text,
				1200	start,
				1201	) {
				1202	dfa::Result::Match(_) => true,
				1203	dfa::Result::NoMatch(_) => false,
				1204	dfa::Result::Quit => self.exec_nfa(
				1205	MatchNfaType::Auto,
				1206	matches,
				1207	&mut [],
				1208	false,
				1209	false,
				1210	text,
				1211	start,
				1212	text.len(),
				1213	),
				1214	}
				1215	}
				1216	Nfa(ty) => self.exec_nfa(
				1217	ty,
				1218	matches,
				1219	&mut [],
				1220	false,
				1221	false,
				1222	text,
				1223	start,
				1224	text.len(),
				1225	),
				1226	Nothing => false,
				1227	}
				1228	}
				1229
				1230	#[cfg_attr(feature = "perf-inline", inline(always))]
				1231	fn is_anchor_end_match(&self, text: &[u8]) -> bool {
				1232	#[cfg(not(feature = "perf-literal"))]
				1233	fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
				1234	true
				1235	}
				1236
				1237	#[cfg(feature = "perf-literal")]
				1238	fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
				1239	// Only do this check if the haystack is big (>1MB).
				1240	if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
				1241	let lcs = ro.suffixes.lcs();
				1242	if lcs.len() >= 1 && !lcs.is_suffix(text) {
				1243	return false;
				1244	}
				1245	}
				1246	true
				1247	}
				1248
				1249	imp(&self.ro, text)
				1250	}
				1251
				1252	pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
				1253	&self.ro.nfa.capture_name_idx
				1254	}
				1255	}
				1256
				1257	impl<'c> ExecNoSyncStr<'c> {
				1258	pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
				1259	self.0.capture_name_idx()
				1260	}
				1261	}
				1262
				1263	impl Exec {
				1264	/// Get a searcher that isn't Sync.
				1265	#[cfg_attr(feature = "perf-inline", inline(always))]
				1266	pub fn searcher(&self) -> ExecNoSync {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1267	ExecNoSync {
				1268	ro: &self.ro, // a clone is too expensive here! (and not needed)
Elliott Hughes	ffb6030	2021-04-01 17:11:40 -0700	[diff] [blame^]	1269	cache: self.pool.get(),
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1270	}
				1271	}
				1272
				1273	/// Get a searcher that isn't Sync and can match on &str.
				1274	#[cfg_attr(feature = "perf-inline", inline(always))]
				1275	pub fn searcher_str(&self) -> ExecNoSyncStr {
				1276	ExecNoSyncStr(self.searcher())
				1277	}
				1278
				1279	/// Build a Regex from this executor.
				1280	pub fn into_regex(self) -> re_unicode::Regex {
				1281	re_unicode::Regex::from(self)
				1282	}
				1283
				1284	/// Build a RegexSet from this executor.
				1285	pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
				1286	re_set::unicode::RegexSet::from(self)
				1287	}
				1288
				1289	/// Build a Regex from this executor that can match arbitrary bytes.
				1290	pub fn into_byte_regex(self) -> re_bytes::Regex {
				1291	re_bytes::Regex::from(self)
				1292	}
				1293
				1294	/// Build a RegexSet from this executor that can match arbitrary bytes.
				1295	pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
				1296	re_set::bytes::RegexSet::from(self)
				1297	}
				1298
				1299	/// The original regular expressions given by the caller that were
				1300	/// compiled.
				1301	pub fn regex_strings(&self) -> &[String] {
				1302	&self.ro.res
				1303	}
				1304
				1305	/// Return a slice of capture names.
				1306	///
				1307	/// Any capture that isn't named is None.
				1308	pub fn capture_names(&self) -> &[Option<String>] {
				1309	&self.ro.nfa.captures
				1310	}
				1311
				1312	/// Return a reference to named groups mapping (from group name to
				1313	/// group position).
				1314	pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
				1315	&self.ro.nfa.capture_name_idx
				1316	}
				1317	}
				1318
				1319	impl Clone for Exec {
				1320	fn clone(&self) -> Exec {
Elliott Hughes	ffb6030	2021-04-01 17:11:40 -0700	[diff] [blame^]	1321	let pool = ExecReadOnly::new_pool(&self.ro);
				1322	Exec { ro: self.ro.clone(), pool }
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1323	}
				1324	}
				1325
				1326	impl ExecReadOnly {
				1327	fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
				1328	if let Some(MatchType::Nfa(_)) = hint {
				1329	return hint.unwrap();
				1330	}
				1331	// If the NFA is empty, then we'll never match anything.
				1332	if self.nfa.insts.is_empty() {
				1333	return MatchType::Nothing;
				1334	}
				1335	if let Some(literalty) = self.choose_literal_match_type() {
				1336	return literalty;
				1337	}
				1338	if let Some(dfaty) = self.choose_dfa_match_type() {
				1339	return dfaty;
				1340	}
				1341	// We're so totally hosed.
				1342	MatchType::Nfa(MatchNfaType::Auto)
				1343	}
				1344
				1345	/// If a plain literal scan can be used, then a corresponding literal
				1346	/// search type is returned.
				1347	fn choose_literal_match_type(&self) -> Option<MatchType> {
				1348	#[cfg(not(feature = "perf-literal"))]
				1349	fn imp(_: &ExecReadOnly) -> Option<MatchType> {
				1350	None
				1351	}
				1352
				1353	#[cfg(feature = "perf-literal")]
				1354	fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
				1355	// If our set of prefixes is complete, then we can use it to find
				1356	// a match in lieu of a regex engine. This doesn't quite work well
				1357	// in the presence of multiple regexes, so only do it when there's
				1358	// one.
				1359	//
				1360	// TODO(burntsushi): Also, don't try to match literals if the regex
				1361	// is partially anchored. We could technically do it, but we'd need
				1362	// to create two sets of literals: all of them and then the subset
				1363	// that aren't anchored. We would then only search for all of them
				1364	// when at the beginning of the input and use the subset in all
				1365	// other cases.
				1366	if ro.res.len() != 1 {
				1367	return None;
				1368	}
				1369	if ro.ac.is_some() {
				1370	return Some(MatchType::Literal(
				1371	MatchLiteralType::AhoCorasick,
				1372	));
				1373	}
				1374	if ro.nfa.prefixes.complete() {
				1375	return if ro.nfa.is_anchored_start {
				1376	Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
				1377	} else {
				1378	Some(MatchType::Literal(MatchLiteralType::Unanchored))
				1379	};
				1380	}
				1381	if ro.suffixes.complete() {
				1382	return if ro.nfa.is_anchored_end {
				1383	Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
				1384	} else {
				1385	// This case shouldn't happen. When the regex isn't
				1386	// anchored, then complete prefixes should imply complete
				1387	// suffixes.
				1388	Some(MatchType::Literal(MatchLiteralType::Unanchored))
				1389	};
				1390	}
				1391	None
				1392	}
				1393
				1394	imp(self)
				1395	}
				1396
				1397	/// If a DFA scan can be used, then choose the appropriate DFA strategy.
				1398	fn choose_dfa_match_type(&self) -> Option<MatchType> {
				1399	#[cfg(not(feature = "perf-dfa"))]
				1400	fn imp(_: &ExecReadOnly) -> Option<MatchType> {
				1401	None
				1402	}
				1403
				1404	#[cfg(feature = "perf-dfa")]
				1405	fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
				1406	if !dfa::can_exec(&ro.dfa) {
				1407	return None;
				1408	}
				1409	// Regex sets require a slightly specialized path.
				1410	if ro.res.len() >= 2 {
				1411	return Some(MatchType::DfaMany);
				1412	}
				1413	// If the regex is anchored at the end but not the start, then
				1414	// just match in reverse from the end of the haystack.
				1415	if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
				1416	return Some(MatchType::DfaAnchoredReverse);
				1417	}
				1418	#[cfg(feature = "perf-literal")]
				1419	{
				1420	// If there's a longish suffix literal, then it might be faster
				1421	// to look for that first.
				1422	if ro.should_suffix_scan() {
				1423	return Some(MatchType::DfaSuffix);
				1424	}
				1425	}
				1426	// Fall back to your garden variety forward searching lazy DFA.
				1427	Some(MatchType::Dfa)
				1428	}
				1429
				1430	imp(self)
				1431	}
				1432
				1433	/// Returns true if the program is amenable to suffix scanning.
				1434	///
				1435	/// When this is true, as a heuristic, we assume it is OK to quickly scan
				1436	/// for suffix literals and then do a reverse DFA match from any matches
				1437	/// produced by the literal scan. (And then followed by a forward DFA
				1438	/// search, since the previously found suffix literal maybe not actually be
				1439	/// the end of a match.)
				1440	///
				1441	/// This is a bit of a specialized optimization, but can result in pretty
				1442	/// big performance wins if 1) there are no prefix literals and 2) the
				1443	/// suffix literals are pretty rare in the text. (1) is obviously easy to
				1444	/// account for but (2) is harder. As a proxy, we assume that longer
				1445	/// strings are generally rarer, so we only enable this optimization when
				1446	/// we have a meaty suffix.
				1447	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
				1448	fn should_suffix_scan(&self) -> bool {
				1449	if self.suffixes.is_empty() {
				1450	return false;
				1451	}
				1452	let lcs_len = self.suffixes.lcs().char_len();
				1453	lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
				1454	}
Elliott Hughes	ffb6030	2021-04-01 17:11:40 -0700	[diff] [blame^]	1455
				1456	fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
				1457	let ro = ro.clone();
				1458	Box::new(Pool::new(Box::new(move \|\| {
				1459	AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
				1460	})))
				1461	}
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1462	}
				1463
				1464	#[derive(Clone, Copy, Debug)]
				1465	enum MatchType {
				1466	/// A single or multiple literal search. This is only used when the regex
				1467	/// can be decomposed into a literal search.
				1468	#[cfg(feature = "perf-literal")]
				1469	Literal(MatchLiteralType),
				1470	/// A normal DFA search.
				1471	#[cfg(feature = "perf-dfa")]
				1472	Dfa,
				1473	/// A reverse DFA search starting from the end of a haystack.
				1474	#[cfg(feature = "perf-dfa")]
				1475	DfaAnchoredReverse,
				1476	/// A reverse DFA search with suffix literal scanning.
				1477	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
				1478	DfaSuffix,
				1479	/// Use the DFA on two or more regular expressions.
				1480	#[cfg(feature = "perf-dfa")]
				1481	DfaMany,
				1482	/// An NFA variant.
				1483	Nfa(MatchNfaType),
				1484	/// No match is ever possible, so don't ever try to search.
				1485	Nothing,
				1486	}
				1487
				1488	#[derive(Clone, Copy, Debug)]
				1489	#[cfg(feature = "perf-literal")]
				1490	enum MatchLiteralType {
				1491	/// Match literals anywhere in text.
				1492	Unanchored,
				1493	/// Match literals only at the start of text.
				1494	AnchoredStart,
				1495	/// Match literals only at the end of text.
				1496	AnchoredEnd,
				1497	/// Use an Aho-Corasick automaton. This requires `ac` to be Some on
				1498	/// ExecReadOnly.
				1499	AhoCorasick,
				1500	}
				1501
				1502	#[derive(Clone, Copy, Debug, Eq, PartialEq)]
				1503	enum MatchNfaType {
				1504	/// Choose between Backtrack and PikeVM.
				1505	Auto,
				1506	/// NFA bounded backtracking.
				1507	///
				1508	/// (This is only set by tests, since it never makes sense to always want
				1509	/// backtracking.)
				1510	Backtrack,
				1511	/// The Pike VM.
				1512	///
				1513	/// (This is only set by tests, since it never makes sense to always want
				1514	/// the Pike VM.)
				1515	PikeVM,
				1516	}
				1517
				1518	/// `ProgramCache` maintains reusable allocations for each matching engine
				1519	/// available to a particular program.
Elliott Hughes	ffb6030	2021-04-01 17:11:40 -0700	[diff] [blame^]	1520	///
				1521	/// We declare this as unwind safe since it's a cache that's only used for
				1522	/// performance purposes. If a panic occurs, it is (or should be) always safe
				1523	/// to continue using the same regex object.
				1524	pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1525
				1526	#[derive(Debug)]
				1527	pub struct ProgramCacheInner {
				1528	pub pikevm: pikevm::Cache,
				1529	pub backtrack: backtrack::Cache,
				1530	#[cfg(feature = "perf-dfa")]
				1531	pub dfa: dfa::Cache,
				1532	#[cfg(feature = "perf-dfa")]
				1533	pub dfa_reverse: dfa::Cache,
				1534	}
				1535
				1536	impl ProgramCacheInner {
				1537	fn new(ro: &ExecReadOnly) -> Self {
				1538	ProgramCacheInner {
				1539	pikevm: pikevm::Cache::new(&ro.nfa),
				1540	backtrack: backtrack::Cache::new(&ro.nfa),
				1541	#[cfg(feature = "perf-dfa")]
				1542	dfa: dfa::Cache::new(&ro.dfa),
				1543	#[cfg(feature = "perf-dfa")]
				1544	dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
				1545	}
				1546	}
				1547	}
				1548
				1549	/// Alternation literals checks if the given HIR is a simple alternation of
				1550	/// literals, and if so, returns them. Otherwise, this returns None.
				1551	#[cfg(feature = "perf-literal")]
				1552	fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
				1553	use syntax::hir::{HirKind, Literal};
				1554
				1555	// This is pretty hacky, but basically, if `is_alternation_literal` is
				1556	// true, then we can make several assumptions about the structure of our
				1557	// HIR. This is what justifies the `unreachable!` statements below.
				1558	//
				1559	// This code should be refactored once we overhaul this crate's
				1560	// optimization pipeline, because this is a terribly inflexible way to go
				1561	// about things.
				1562
				1563	if !expr.is_alternation_literal() {
				1564	return None;
				1565	}
				1566	let alts = match *expr.kind() {
				1567	HirKind::Alternation(ref alts) => alts,
				1568	_ => return None, // one literal isn't worth it
				1569	};
				1570
				1571	let extendlit = \|lit: &Literal, dst: &mut Vec<u8>\| match *lit {
				1572	Literal::Unicode(c) => {
				1573	let mut buf = [0; 4];
				1574	dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
				1575	}
				1576	Literal::Byte(b) => {
				1577	dst.push(b);
				1578	}
				1579	};
				1580
				1581	let mut lits = vec![];
				1582	for alt in alts {
				1583	let mut lit = vec![];
				1584	match *alt.kind() {
				1585	HirKind::Literal(ref x) => extendlit(x, &mut lit),
				1586	HirKind::Concat(ref exprs) => {
				1587	for e in exprs {
				1588	match *e.kind() {
				1589	HirKind::Literal(ref x) => extendlit(x, &mut lit),
				1590	_ => unreachable!("expected literal, got {:?}", e),
				1591	}
				1592	}
				1593	}
				1594	_ => unreachable!("expected literal or concat, got {:?}", alt),
				1595	}
				1596	lits.push(lit);
				1597	}
				1598	Some(lits)
				1599	}
				1600
				1601	#[cfg(test)]
				1602	mod test {
				1603	#[test]
				1604	fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
				1605	use internal::ExecBuilder;
				1606
				1607	let backtrack_bytes_re = ExecBuilder::new("^S")
				1608	.bounded_backtracking()
				1609	.only_utf8(false)
				1610	.build()
				1611	.map(\|exec\| exec.into_byte_regex())
				1612	.map_err(\|err\| format!("{}", err))
				1613	.unwrap();
				1614
				1615	let default_bytes_re = ExecBuilder::new("^S")
				1616	.only_utf8(false)
				1617	.build()
				1618	.map(\|exec\| exec.into_byte_regex())
				1619	.map_err(\|err\| format!("{}", err))
				1620	.unwrap();
				1621
				1622	let input = vec![83, 83];
				1623
				1624	let s1 = backtrack_bytes_re.split(&input);
				1625	let s2 = default_bytes_re.split(&input);
				1626	for (chunk1, chunk2) in s1.zip(s2) {
				1627	assert_eq!(chunk1, chunk2);
				1628	}
				1629	}
				1630
				1631	#[test]
				1632	fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
				1633	use internal::ExecBuilder;
				1634
				1635	let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
				1636	.bounded_backtracking()
				1637	.bytes(true)
				1638	.build()
				1639	.map(\|exec\| exec.into_regex())
				1640	.map_err(\|err\| format!("{}", err))
				1641	.unwrap();
				1642
				1643	let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
				1644	.bytes(true)
				1645	.build()
				1646	.map(\|exec\| exec.into_regex())
				1647	.map_err(\|err\| format!("{}", err))
				1648	.unwrap();
				1649
				1650	let input = "**";
				1651
				1652	let s1 = backtrack_bytes_re.split(input);
				1653	let s2 = default_bytes_re.split(input);
				1654	for (chunk1, chunk2) in s1.zip(s2) {
				1655	assert_eq!(chunk1, chunk2);
				1656	}
				1657	}
				1658	}