Blame - src/literal/imp.rs - platform/external/rust/crates/regex

blob: e4d04ed0a9c000814f2f00c4e8472e81cda4ca0f [file] [log] [blame]

Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1	use std::cmp;
				2	use std::mem;
				3
				4	use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
				5	use memchr::{memchr, memchr2, memchr3};
				6	use syntax::hir::literal::{Literal, Literals};
				7
				8	use freqs::BYTE_FREQUENCIES;
				9
				10	/// A prefix extracted from a compiled regular expression.
				11	///
				12	/// A regex prefix is a set of literal strings that must be matched at the
				13	/// beginning of a regex in order for the entire regex to match. Similarly
				14	/// for a regex suffix.
				15	#[derive(Clone, Debug)]
				16	pub struct LiteralSearcher {
				17	complete: bool,
				18	lcp: FreqyPacked,
				19	lcs: FreqyPacked,
				20	matcher: Matcher,
				21	}
				22
				23	#[derive(Clone, Debug)]
				24	enum Matcher {
				25	/// No literals. (Never advances through the input.)
				26	Empty,
				27	/// A set of four or more single byte literals.
				28	Bytes(SingleByteSet),
				29	/// A single substring, find using memchr and frequency analysis.
				30	FreqyPacked(FreqyPacked),
				31	/// A single substring, find using Boyer-Moore.
				32	BoyerMoore(BoyerMooreSearch),
				33	/// An Aho-Corasick automaton.
				34	AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
				35	/// A packed multiple substring searcher, using SIMD.
				36	///
				37	/// Note that Aho-Corasick will actually use this packed searcher
				38	/// internally automatically, however, there is some overhead associated
				39	/// with going through the Aho-Corasick machinery. So using the packed
				40	/// searcher directly results in some gains.
				41	Packed { s: packed::Searcher, lits: Vec<Literal> },
				42	}
				43
				44	impl LiteralSearcher {
				45	/// Returns a matcher that never matches and never advances the input.
				46	pub fn empty() -> Self {
				47	Self::new(Literals::empty(), Matcher::Empty)
				48	}
				49
				50	/// Returns a matcher for literal prefixes from the given set.
				51	pub fn prefixes(lits: Literals) -> Self {
				52	let matcher = Matcher::prefixes(&lits);
				53	Self::new(lits, matcher)
				54	}
				55
				56	/// Returns a matcher for literal suffixes from the given set.
				57	pub fn suffixes(lits: Literals) -> Self {
				58	let matcher = Matcher::suffixes(&lits);
				59	Self::new(lits, matcher)
				60	}
				61
				62	fn new(lits: Literals, matcher: Matcher) -> Self {
				63	let complete = lits.all_complete();
				64	LiteralSearcher {
				65	complete: complete,
				66	lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()),
				67	lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()),
				68	matcher: matcher,
				69	}
				70	}
				71
				72	/// Returns true if all matches comprise the entire regular expression.
				73	///
				74	/// This does not necessarily mean that a literal match implies a match
Haibo Huang	47619dd	2021-01-08 17:05:43 -0800	[diff] [blame^]	75	/// of the regular expression. For example, the regular expression `^a`
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	76	/// is comprised of a single complete literal `a`, but the regular
				77	/// expression demands that it only match at the beginning of a string.
				78	pub fn complete(&self) -> bool {
				79	self.complete && !self.is_empty()
				80	}
				81
				82	/// Find the position of a literal in `haystack` if it exists.
				83	#[cfg_attr(feature = "perf-inline", inline(always))]
				84	pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
				85	use self::Matcher::*;
				86	match self.matcher {
				87	Empty => Some((0, 0)),
				88	Bytes(ref sset) => sset.find(haystack).map(\|i\| (i, i + 1)),
				89	FreqyPacked(ref s) => s.find(haystack).map(\|i\| (i, i + s.len())),
				90	BoyerMoore(ref s) => s.find(haystack).map(\|i\| (i, i + s.len())),
				91	AC { ref ac, .. } => {
				92	ac.find(haystack).map(\|m\| (m.start(), m.end()))
				93	}
				94	Packed { ref s, .. } => {
				95	s.find(haystack).map(\|m\| (m.start(), m.end()))
				96	}
				97	}
				98	}
				99
				100	/// Like find, except matches must start at index `0`.
				101	pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
				102	for lit in self.iter() {
				103	if lit.len() > haystack.len() {
				104	continue;
				105	}
				106	if lit == &haystack[0..lit.len()] {
				107	return Some((0, lit.len()));
				108	}
				109	}
				110	None
				111	}
				112
				113	/// Like find, except matches must end at index `haystack.len()`.
				114	pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
				115	for lit in self.iter() {
				116	if lit.len() > haystack.len() {
				117	continue;
				118	}
				119	if lit == &haystack[haystack.len() - lit.len()..] {
				120	return Some((haystack.len() - lit.len(), haystack.len()));
				121	}
				122	}
				123	None
				124	}
				125
				126	/// Returns an iterator over all literals to be matched.
				127	pub fn iter(&self) -> LiteralIter {
				128	match self.matcher {
				129	Matcher::Empty => LiteralIter::Empty,
				130	Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
				131	Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat),
				132	Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
				133	Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
				134	Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
				135	}
				136	}
				137
				138	/// Returns a matcher for the longest common prefix of this matcher.
				139	pub fn lcp(&self) -> &FreqyPacked {
				140	&self.lcp
				141	}
				142
				143	/// Returns a matcher for the longest common suffix of this matcher.
				144	pub fn lcs(&self) -> &FreqyPacked {
				145	&self.lcs
				146	}
				147
				148	/// Returns true iff this prefix is empty.
				149	pub fn is_empty(&self) -> bool {
				150	self.len() == 0
				151	}
				152
				153	/// Returns the number of prefixes in this machine.
				154	pub fn len(&self) -> usize {
				155	use self::Matcher::*;
				156	match self.matcher {
				157	Empty => 0,
				158	Bytes(ref sset) => sset.dense.len(),
				159	FreqyPacked(_) => 1,
				160	BoyerMoore(_) => 1,
				161	AC { ref ac, .. } => ac.pattern_count(),
				162	Packed { ref lits, .. } => lits.len(),
				163	}
				164	}
				165
				166	/// Return the approximate heap usage of literals in bytes.
				167	pub fn approximate_size(&self) -> usize {
				168	use self::Matcher::*;
				169	match self.matcher {
				170	Empty => 0,
				171	Bytes(ref sset) => sset.approximate_size(),
				172	FreqyPacked(ref single) => single.approximate_size(),
				173	BoyerMoore(ref single) => single.approximate_size(),
				174	AC { ref ac, .. } => ac.heap_bytes(),
				175	Packed { ref s, .. } => s.heap_bytes(),
				176	}
				177	}
				178	}
				179
				180	impl Matcher {
				181	fn prefixes(lits: &Literals) -> Self {
				182	let sset = SingleByteSet::prefixes(lits);
				183	Matcher::new(lits, sset)
				184	}
				185
				186	fn suffixes(lits: &Literals) -> Self {
				187	let sset = SingleByteSet::suffixes(lits);
				188	Matcher::new(lits, sset)
				189	}
				190
				191	fn new(lits: &Literals, sset: SingleByteSet) -> Self {
				192	if lits.literals().is_empty() {
				193	return Matcher::Empty;
				194	}
				195	if sset.dense.len() >= 26 {
				196	// Avoid trying to match a large number of single bytes.
				197	// This is very sensitive to a frequency analysis comparison
				198	// between the bytes in sset and the composition of the haystack.
				199	// No matter the size of sset, if its members all are rare in the
				200	// haystack, then it'd be worth using it. How to tune this... IDK.
				201	// ---AG
				202	return Matcher::Empty;
				203	}
				204	if sset.complete {
				205	return Matcher::Bytes(sset);
				206	}
				207	if lits.literals().len() == 1 {
				208	let lit = lits.literals()[0].to_vec();
				209	if BoyerMooreSearch::should_use(lit.as_slice()) {
				210	return Matcher::BoyerMoore(BoyerMooreSearch::new(lit));
				211	} else {
				212	return Matcher::FreqyPacked(FreqyPacked::new(lit));
				213	}
				214	}
				215
				216	let pats = lits.literals().to_owned();
				217	let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
				218	if lits.literals().len() <= 100 && !is_aho_corasick_fast {
				219	let mut builder = packed::Config::new()
				220	.match_kind(packed::MatchKind::LeftmostFirst)
				221	.builder();
				222	if let Some(s) = builder.extend(&pats).build() {
				223	return Matcher::Packed { s, lits: pats };
				224	}
				225	}
				226	let ac = AhoCorasickBuilder::new()
				227	.match_kind(aho_corasick::MatchKind::LeftmostFirst)
				228	.dfa(true)
				229	.build_with_size::<u32, _, _>(&pats)
				230	.unwrap();
				231	Matcher::AC { ac, lits: pats }
				232	}
				233	}
				234
Haibo Huang	47619dd	2021-01-08 17:05:43 -0800	[diff] [blame^]	235	#[derive(Debug)]
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	236	pub enum LiteralIter<'a> {
				237	Empty,
				238	Bytes(&'a [u8]),
				239	Single(&'a [u8]),
				240	AC(&'a [Literal]),
				241	Packed(&'a [Literal]),
				242	}
				243
				244	impl<'a> Iterator for LiteralIter<'a> {
				245	type Item = &'a [u8];
				246
				247	fn next(&mut self) -> Option<Self::Item> {
				248	match *self {
				249	LiteralIter::Empty => None,
				250	LiteralIter::Bytes(ref mut many) => {
				251	if many.is_empty() {
				252	None
				253	} else {
				254	let next = &many[0..1];
				255	*many = &many[1..];
				256	Some(next)
				257	}
				258	}
				259	LiteralIter::Single(ref mut one) => {
				260	if one.is_empty() {
				261	None
				262	} else {
				263	let next = &one[..];
				264	*one = &[];
				265	Some(next)
				266	}
				267	}
				268	LiteralIter::AC(ref mut lits) => {
				269	if lits.is_empty() {
				270	None
				271	} else {
				272	let next = &lits[0];
				273	*lits = &lits[1..];
				274	Some(&**next)
				275	}
				276	}
				277	LiteralIter::Packed(ref mut lits) => {
				278	if lits.is_empty() {
				279	None
				280	} else {
				281	let next = &lits[0];
				282	*lits = &lits[1..];
				283	Some(&**next)
				284	}
				285	}
				286	}
				287	}
				288	}
				289
				290	#[derive(Clone, Debug)]
				291	struct SingleByteSet {
				292	sparse: Vec<bool>,
				293	dense: Vec<u8>,
				294	complete: bool,
				295	all_ascii: bool,
				296	}
				297
				298	impl SingleByteSet {
				299	fn new() -> SingleByteSet {
				300	SingleByteSet {
				301	sparse: vec![false; 256],
				302	dense: vec![],
				303	complete: true,
				304	all_ascii: true,
				305	}
				306	}
				307
				308	fn prefixes(lits: &Literals) -> SingleByteSet {
				309	let mut sset = SingleByteSet::new();
				310	for lit in lits.literals() {
				311	sset.complete = sset.complete && lit.len() == 1;
				312	if let Some(&b) = lit.get(0) {
				313	if !sset.sparse[b as usize] {
				314	if b > 0x7F {
				315	sset.all_ascii = false;
				316	}
				317	sset.dense.push(b);
				318	sset.sparse[b as usize] = true;
				319	}
				320	}
				321	}
				322	sset
				323	}
				324
				325	fn suffixes(lits: &Literals) -> SingleByteSet {
				326	let mut sset = SingleByteSet::new();
				327	for lit in lits.literals() {
				328	sset.complete = sset.complete && lit.len() == 1;
				329	if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
				330	if !sset.sparse[b as usize] {
				331	if b > 0x7F {
				332	sset.all_ascii = false;
				333	}
				334	sset.dense.push(b);
				335	sset.sparse[b as usize] = true;
				336	}
				337	}
				338	}
				339	sset
				340	}
				341
				342	/// Faster find that special cases certain sizes to use memchr.
				343	#[cfg_attr(feature = "perf-inline", inline(always))]
				344	fn find(&self, text: &[u8]) -> Option<usize> {
				345	match self.dense.len() {
				346	0 => None,
				347	1 => memchr(self.dense[0], text),
				348	2 => memchr2(self.dense[0], self.dense[1], text),
				349	3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
				350	_ => self._find(text),
				351	}
				352	}
				353
				354	/// Generic find that works on any sized set.
				355	fn _find(&self, haystack: &[u8]) -> Option<usize> {
				356	for (i, &b) in haystack.iter().enumerate() {
				357	if self.sparse[b as usize] {
				358	return Some(i);
				359	}
				360	}
				361	None
				362	}
				363
				364	fn approximate_size(&self) -> usize {
				365	(self.dense.len() * mem::size_of::<u8>())
				366	+ (self.sparse.len() * mem::size_of::<bool>())
				367	}
				368	}
				369
				370	/// Provides an implementation of fast subtring search using frequency
				371	/// analysis.
				372	///
				373	/// memchr is so fast that we do everything we can to keep the loop in memchr
				374	/// for as long as possible. The easiest way to do this is to intelligently
				375	/// pick the byte to send to memchr. The best byte is the byte that occurs
				376	/// least frequently in the haystack. Since doing frequency analysis on the
				377	/// haystack is far too expensive, we compute a set of fixed frequencies up
				378	/// front and hard code them in src/freqs.rs. Frequency analysis is done via
				379	/// scripts/frequencies.py.
				380	#[derive(Clone, Debug)]
				381	pub struct FreqyPacked {
				382	/// The pattern.
				383	pat: Vec<u8>,
				384	/// The number of Unicode characters in the pattern. This is useful for
				385	/// determining the effective length of a pattern when deciding which
				386	/// optimizations to perform. A trailing incomplete UTF-8 sequence counts
				387	/// as one character.
				388	char_len: usize,
				389	/// The rarest byte in the pattern, according to pre-computed frequency
				390	/// analysis.
				391	rare1: u8,
				392	/// The offset of the rarest byte in `pat`.
				393	rare1i: usize,
				394	/// The second rarest byte in the pattern, according to pre-computed
				395	/// frequency analysis. (This may be equivalent to the rarest byte.)
				396	///
				397	/// The second rarest byte is used as a type of guard for quickly detecting
				398	/// a mismatch after memchr locates an instance of the rarest byte. This
				399	/// is a hedge against pathological cases where the pre-computed frequency
				400	/// analysis may be off. (But of course, does not prevent all
				401	/// pathological cases.)
				402	rare2: u8,
				403	/// The offset of the second rarest byte in `pat`.
				404	rare2i: usize,
				405	}
				406
				407	impl FreqyPacked {
				408	fn new(pat: Vec<u8>) -> FreqyPacked {
				409	if pat.is_empty() {
				410	return FreqyPacked::empty();
				411	}
				412
				413	// Find the rarest two bytes. Try to make them distinct (but it's not
				414	// required).
				415	let mut rare1 = pat[0];
				416	let mut rare2 = pat[0];
				417	for b in pat[1..].iter().cloned() {
				418	if freq_rank(b) < freq_rank(rare1) {
				419	rare1 = b;
				420	}
				421	}
				422	for &b in &pat {
				423	if rare1 == rare2 {
				424	rare2 = b
				425	} else if b != rare1 && freq_rank(b) < freq_rank(rare2) {
				426	rare2 = b;
				427	}
				428	}
				429
				430	// And find the offsets of their last occurrences.
				431	let rare1i = pat.iter().rposition(\|&b\| b == rare1).unwrap();
				432	let rare2i = pat.iter().rposition(\|&b\| b == rare2).unwrap();
				433
				434	let char_len = char_len_lossy(&pat);
				435	FreqyPacked {
				436	pat: pat,
				437	char_len: char_len,
				438	rare1: rare1,
				439	rare1i: rare1i,
				440	rare2: rare2,
				441	rare2i: rare2i,
				442	}
				443	}
				444
				445	fn empty() -> FreqyPacked {
				446	FreqyPacked {
				447	pat: vec![],
				448	char_len: 0,
				449	rare1: 0,
				450	rare1i: 0,
				451	rare2: 0,
				452	rare2i: 0,
				453	}
				454	}
				455
				456	#[cfg_attr(feature = "perf-inline", inline(always))]
				457	pub fn find(&self, haystack: &[u8]) -> Option<usize> {
				458	let pat = &*self.pat;
				459	if haystack.len() < pat.len() \|\| pat.is_empty() {
				460	return None;
				461	}
				462	let mut i = self.rare1i;
				463	while i < haystack.len() {
				464	i += match memchr(self.rare1, &haystack[i..]) {
				465	None => return None,
				466	Some(i) => i,
				467	};
				468	let start = i - self.rare1i;
				469	let end = start + pat.len();
				470	if end > haystack.len() {
				471	return None;
				472	}
				473	let aligned = &haystack[start..end];
				474	if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat {
				475	return Some(start);
				476	}
				477	i += 1;
				478	}
				479	None
				480	}
				481
				482	#[cfg_attr(feature = "perf-inline", inline(always))]
				483	pub fn is_suffix(&self, text: &[u8]) -> bool {
				484	if text.len() < self.len() {
				485	return false;
				486	}
				487	text[text.len() - self.len()..] == *self.pat
				488	}
				489
				490	pub fn len(&self) -> usize {
				491	self.pat.len()
				492	}
				493
				494	pub fn char_len(&self) -> usize {
				495	self.char_len
				496	}
				497
				498	fn approximate_size(&self) -> usize {
				499	self.pat.len() * mem::size_of::<u8>()
				500	}
				501	}
				502
				503	fn char_len_lossy(bytes: &[u8]) -> usize {
				504	String::from_utf8_lossy(bytes).chars().count()
				505	}
				506
				507	/// An implementation of Tuned Boyer-Moore as laid out by
				508	/// Andrew Hume and Daniel Sunday in "Fast String Searching".
				509	/// O(n) in the size of the input.
				510	///
				511	/// Fast string searching algorithms come in many variations,
				512	/// but they can generally be described in terms of three main
				513	/// components.
				514	///
				515	/// The skip loop is where the string searcher wants to spend
				516	/// as much time as possible. Exactly which character in the
				517	/// pattern the skip loop examines varies from algorithm to
				518	/// algorithm, but in the simplest case this loop repeated
				519	/// looks at the last character in the pattern and jumps
				520	/// forward in the input if it is not in the pattern.
				521	/// Robert Boyer and J Moore called this the "fast" loop in
				522	/// their original paper.
				523	///
				524	/// The match loop is responsible for actually examining the
				525	/// whole potentially matching substring. In order to fail
				526	/// faster, the match loop sometimes has a guard test attached.
				527	/// The guard test uses frequency analysis of the different
				528	/// characters in the pattern to choose the least frequency
				529	/// occurring character and use it to find match failures
				530	/// as quickly as possible.
				531	///
				532	/// The shift rule governs how the algorithm will shuffle its
				533	/// test window in the event of a failure during the match loop.
				534	/// Certain shift rules allow the worst-case run time of the
				535	/// algorithm to be shown to be O(n) in the size of the input
				536	/// rather than O(nm) in the size of the input and the size
				537	/// of the pattern (as naive Boyer-Moore is).
				538	///
				539	/// "Fast String Searching", in addition to presenting a tuned
				540	/// algorithm, provides a comprehensive taxonomy of the many
				541	/// different flavors of string searchers. Under that taxonomy
				542	/// TBM, the algorithm implemented here, uses an unrolled fast
				543	/// skip loop with memchr fallback, a forward match loop with guard,
				544	/// and the mini Sunday's delta shift rule. To unpack that you'll have to
				545	/// read the paper.
				546	#[derive(Clone, Debug)]
				547	pub struct BoyerMooreSearch {
				548	/// The pattern we are going to look for in the haystack.
				549	pattern: Vec<u8>,
				550
				551	/// The skip table for the skip loop.
				552	///
				553	/// Maps the character at the end of the input
				554	/// to a shift.
				555	skip_table: Vec<usize>,
				556
				557	/// The guard character (least frequently occurring char).
				558	guard: u8,
				559	/// The reverse-index of the guard character in the pattern.
				560	guard_reverse_idx: usize,
				561
				562	/// Daniel Sunday's mini generalized delta2 shift table.
				563	///
				564	/// We use a skip loop, so we only have to provide a shift
				565	/// for the skip char (last char). This is why it is a mini
				566	/// shift rule.
				567	md2_shift: usize,
				568	}
				569
				570	impl BoyerMooreSearch {
				571	/// Create a new string searcher, performing whatever
				572	/// compilation steps are required.
				573	fn new(pattern: Vec<u8>) -> Self {
				574	debug_assert!(!pattern.is_empty());
				575
				576	let (g, gi) = Self::select_guard(pattern.as_slice());
				577	let skip_table = Self::compile_skip_table(pattern.as_slice());
				578	let md2_shift = Self::compile_md2_shift(pattern.as_slice());
				579	BoyerMooreSearch {
				580	pattern: pattern,
				581	skip_table: skip_table,
				582	guard: g,
				583	guard_reverse_idx: gi,
				584	md2_shift: md2_shift,
				585	}
				586	}
				587
				588	/// Find the pattern in `haystack`, returning the offset
				589	/// of the start of the first occurrence of the pattern
				590	/// in `haystack`.
				591	#[inline]
				592	fn find(&self, haystack: &[u8]) -> Option<usize> {
				593	if haystack.len() < self.pattern.len() {
				594	return None;
				595	}
				596
				597	let mut window_end = self.pattern.len() - 1;
				598
				599	// Inspired by the grep source. It is a way
				600	// to do correct loop unrolling without having to place
				601	// a crashpad of terminating charicters at the end in
				602	// the way described in the Fast String Searching paper.
				603	const NUM_UNROLL: usize = 10;
				604	// 1 for the initial position, and 1 for the md2 shift
				605	let short_circut = (NUM_UNROLL + 2) * self.pattern.len();
				606
				607	if haystack.len() > short_circut {
				608	// just 1 for the md2 shift
				609	let backstop =
				610	haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len());
				611	loop {
				612	window_end =
				613	match self.skip_loop(haystack, window_end, backstop) {
				614	Some(i) => i,
				615	None => return None,
				616	};
				617	if window_end >= backstop {
				618	break;
				619	}
				620
				621	if self.check_match(haystack, window_end) {
				622	return Some(window_end - (self.pattern.len() - 1));
				623	} else {
				624	let skip = self.skip_table[haystack[window_end] as usize];
				625	window_end +=
				626	if skip == 0 { self.md2_shift } else { skip };
				627	continue;
				628	}
				629	}
				630	}
				631
				632	// now process the input after the backstop
				633	while window_end < haystack.len() {
				634	let mut skip = self.skip_table[haystack[window_end] as usize];
				635	if skip == 0 {
				636	if self.check_match(haystack, window_end) {
				637	return Some(window_end - (self.pattern.len() - 1));
				638	} else {
				639	skip = self.md2_shift;
				640	}
				641	}
				642	window_end += skip;
				643	}
				644
				645	None
				646	}
				647
				648	fn len(&self) -> usize {
				649	return self.pattern.len();
				650	}
				651
				652	/// The key heuristic behind which the BoyerMooreSearch lives.
				653	///
				654	/// See `rust-lang/regex/issues/408`.
				655	///
				656	/// Tuned Boyer-Moore is actually pretty slow! It turns out a handrolled
				657	/// platform-specific memchr routine with a bit of frequency
				658	/// analysis sprinkled on top actually wins most of the time.
				659	/// However, there are a few cases where Tuned Boyer-Moore still
				660	/// wins.
				661	///
				662	/// If the haystack is random, frequency analysis doesn't help us,
				663	/// so Boyer-Moore will win for sufficiently large needles.
				664	/// Unfortunately, there is no obvious way to determine this
				665	/// ahead of time.
				666	///
				667	/// If the pattern itself consists of very common characters,
				668	/// frequency analysis won't get us anywhere. The most extreme
				669	/// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately,
				670	/// this case is wholly determined by the pattern, so we can actually
				671	/// implement the heuristic.
				672	///
				673	/// A third case is if the pattern is sufficiently long. The idea
				674	/// here is that once the pattern gets long enough the Tuned
				675	/// Boyer-Moore skip loop will start making strides long enough
				676	/// to beat the asm deep magic that is memchr.
				677	fn should_use(pattern: &[u8]) -> bool {
				678	// The minimum pattern length required to use TBM.
				679	const MIN_LEN: usize = 9;
				680	// The minimum frequency rank (lower is rarer) that every byte in the
				681	// pattern must have in order to use TBM. That is, if the pattern
				682	// contains _any_ byte with a lower rank, then TBM won't be used.
				683	const MIN_CUTOFF: usize = 150;
				684	// The maximum frequency rank for any byte.
				685	const MAX_CUTOFF: usize = 255;
				686	// The scaling factor used to determine the actual cutoff frequency
				687	// to use (keeping in mind that the minimum frequency rank is bounded
				688	// by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
				689	// likely to be used as the pattern grows longer. That is, longer
				690	// patterns permit somewhat less frequent bytes than shorter patterns,
				691	// under the assumption that TBM gets better as the pattern gets
				692	// longer.
				693	const LEN_CUTOFF_PROPORTION: usize = 4;
				694
				695	let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION);
				696	let cutoff = cmp::max(
				697	MIN_CUTOFF,
				698	MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank),
				699	);
				700	// The pattern must be long enough to be worthwhile. e.g., memchr will
				701	// be faster on `e` because it is short even though e is quite common.
				702	pattern.len() > MIN_LEN
				703	// all the bytes must be more common than the cutoff.
				704	&& pattern.iter().all(\|c\| freq_rank(*c) >= cutoff)
				705	}
				706
				707	/// Check to see if there is a match at the given position
				708	#[inline]
				709	fn check_match(&self, haystack: &[u8], window_end: usize) -> bool {
				710	// guard test
				711	if haystack[window_end - self.guard_reverse_idx] != self.guard {
				712	return false;
				713	}
				714
				715	// match loop
				716	let window_start = window_end - (self.pattern.len() - 1);
				717	for i in 0..self.pattern.len() {
				718	if self.pattern[i] != haystack[window_start + i] {
				719	return false;
				720	}
				721	}
				722
				723	true
				724	}
				725
				726	/// Skip forward according to the shift table.
				727	///
				728	/// Returns the offset of the next occurrence
				729	/// of the last char in the pattern, or the none
				730	/// if it never reappears. If `skip_loop` hits the backstop
				731	/// it will leave early.
				732	#[inline]
				733	fn skip_loop(
				734	&self,
				735	haystack: &[u8],
				736	mut window_end: usize,
				737	backstop: usize,
				738	) -> Option<usize> {
				739	let window_end_snapshot = window_end;
				740	let skip_of = \|we: usize\| -> usize {
				741	// Unsafe might make this faster, but the benchmarks
				742	// were hard to interpret.
				743	self.skip_table[haystack[we] as usize]
				744	};
				745
				746	loop {
				747	let mut skip = skip_of(window_end);
				748	window_end += skip;
				749	skip = skip_of(window_end);
				750	window_end += skip;
				751	if skip != 0 {
				752	skip = skip_of(window_end);
				753	window_end += skip;
				754	skip = skip_of(window_end);
				755	window_end += skip;
				756	skip = skip_of(window_end);
				757	window_end += skip;
				758	if skip != 0 {
				759	skip = skip_of(window_end);
				760	window_end += skip;
				761	skip = skip_of(window_end);
				762	window_end += skip;
				763	skip = skip_of(window_end);
				764	window_end += skip;
				765	if skip != 0 {
				766	skip = skip_of(window_end);
				767	window_end += skip;
				768	skip = skip_of(window_end);
				769	window_end += skip;
				770
				771	// If ten iterations did not make at least 16 words
				772	// worth of progress, we just fall back on memchr.
				773	if window_end - window_end_snapshot
				774	> 16 * mem::size_of::<usize>()
				775	{
				776	// Returning a window_end >= backstop will
				777	// immediatly break us out of the inner loop in
				778	// `find`.
				779	if window_end >= backstop {
				780	return Some(window_end);
				781	}
				782
				783	continue; // we made enough progress
				784	} else {
				785	// In case we are already there, and so that
				786	// we will catch the guard char.
				787	window_end = window_end
				788	.checked_sub(1 + self.guard_reverse_idx)
				789	.unwrap_or(0);
				790
				791	match memchr(self.guard, &haystack[window_end..]) {
				792	None => return None,
				793	Some(g_idx) => {
				794	return Some(
				795	window_end
				796	+ g_idx
				797	+ self.guard_reverse_idx,
				798	);
				799	}
				800	}
				801	}
				802	}
				803	}
				804	}
				805
				806	return Some(window_end);
				807	}
				808	}
				809
				810	/// Compute the ufast skip table.
				811	fn compile_skip_table(pattern: &[u8]) -> Vec<usize> {
				812	let mut tab = vec![pattern.len(); 256];
				813
				814	// For every char in the pattern, we write a skip
				815	// that will line us up with the rightmost occurrence.
				816	//
				817	// N.B. the sentinel (0) is written by the last
				818	// loop iteration.
				819	for (i, c) in pattern.iter().enumerate() {
				820	tab[*c as usize] = (pattern.len() - 1) - i;
				821	}
				822
				823	tab
				824	}
				825
				826	/// Select the guard character based off of the precomputed
				827	/// frequency table.
				828	fn select_guard(pattern: &[u8]) -> (u8, usize) {
				829	let mut rarest = pattern[0];
				830	let mut rarest_rev_idx = pattern.len() - 1;
				831	for (i, c) in pattern.iter().enumerate() {
				832	if freq_rank(*c) < freq_rank(rarest) {
				833	rarest = *c;
				834	rarest_rev_idx = (pattern.len() - 1) - i;
				835	}
				836	}
				837
				838	(rarest, rarest_rev_idx)
				839	}
				840
				841	/// If there is another occurrence of the skip
				842	/// char, shift to it, otherwise just shift to
				843	/// the next window.
				844	fn compile_md2_shift(pattern: &[u8]) -> usize {
				845	let shiftc = *pattern.last().unwrap();
				846
				847	// For a pattern of length 1 we will never apply the
				848	// shift rule, so we use a poison value on the principle
				849	// that failing fast is a good thing.
				850	if pattern.len() == 1 {
				851	return 0xDEADBEAF;
				852	}
				853
				854	let mut i = pattern.len() - 2;
				855	while i > 0 {
				856	if pattern[i] == shiftc {
				857	return (pattern.len() - 1) - i;
				858	}
				859	i -= 1;
				860	}
				861
				862	// The skip char never re-occurs in the pattern, so
				863	// we can just shift the whole window length.
				864	pattern.len() - 1
				865	}
				866
				867	fn approximate_size(&self) -> usize {
				868	(self.pattern.len() * mem::size_of::<u8>())
				869	+ (256 * mem::size_of::<usize>()) // skip table
				870	}
				871	}
				872
				873	fn freq_rank(b: u8) -> usize {
				874	BYTE_FREQUENCIES[b as usize] as usize
				875	}
				876
				877	#[cfg(test)]
				878	mod tests {
				879	use super::{BoyerMooreSearch, FreqyPacked};
				880
				881	//
				882	// Unit Tests
				883	//
				884
				885	// The "hello, world" of string searching
				886	#[test]
				887	fn bm_find_subs() {
				888	let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
				889	let haystack = b"I keep seeing patterns in this text";
				890	assert_eq!(14, searcher.find(haystack).unwrap());
				891	}
				892
				893	#[test]
				894	fn bm_find_no_subs() {
				895	let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
				896	let haystack = b"I keep seeing needles in this text";
				897	assert_eq!(None, searcher.find(haystack));
				898	}
				899
				900	//
				901	// Regression Tests
				902	//
				903
				904	#[test]
				905	fn bm_skip_reset_bug() {
				906	let haystack = vec![0, 0, 0, 0, 0, 1, 1, 0];
				907	let needle = vec![0, 1, 1, 0];
				908
				909	let searcher = BoyerMooreSearch::new(needle);
				910	let offset = searcher.find(haystack.as_slice()).unwrap();
				911	assert_eq!(4, offset);
				912	}
				913
				914	#[test]
				915	fn bm_backstop_underflow_bug() {
				916	let haystack = vec![0, 0];
				917	let needle = vec![0, 0];
				918
				919	let searcher = BoyerMooreSearch::new(needle);
				920	let offset = searcher.find(haystack.as_slice()).unwrap();
				921	assert_eq!(0, offset);
				922	}
				923
				924	#[test]
				925	fn bm_naive_off_by_one_bug() {
				926	let haystack = vec![91];
				927	let needle = vec![91];
				928
				929	let naive_offset = naive_find(&needle, &haystack).unwrap();
				930	assert_eq!(0, naive_offset);
				931	}
				932
				933	#[test]
				934	fn bm_memchr_fallback_indexing_bug() {
				935	let mut haystack = vec![
				936	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				937	0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				938	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				939	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				940	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				941	];
				942	let needle = vec![1, 1, 1, 1, 32, 32, 87];
				943	let needle_start = haystack.len();
				944	haystack.extend(needle.clone());
				945
				946	let searcher = BoyerMooreSearch::new(needle);
				947	assert_eq!(needle_start, searcher.find(haystack.as_slice()).unwrap());
				948	}
				949
				950	#[test]
				951	fn bm_backstop_boundary() {
				952	let haystack = b"\
				953	// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
				954	e_data.clone_created(entity_id, entity_to_add.entity_id);
				955	aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
				956	aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
				957	"
				958	.to_vec();
				959	let needle = b"clone_created".to_vec();
				960
				961	let searcher = BoyerMooreSearch::new(needle);
				962	let result = searcher.find(&haystack);
				963	assert_eq!(Some(43), result);
				964	}
				965
				966	#[test]
				967	fn bm_win_gnu_indexing_bug() {
				968	let haystack_raw = vec![
				969	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				970	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				971	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				972	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				973	];
				974	let needle = vec![1, 1, 1, 1, 1, 1, 1];
				975	let haystack = haystack_raw.as_slice();
				976
				977	BoyerMooreSearch::new(needle.clone()).find(haystack);
				978	}
				979
				980	//
				981	// QuickCheck Properties
				982	//
				983
				984	use quickcheck::TestResult;
				985
				986	fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
				987	assert!(needle.len() <= haystack.len());
				988
				989	for i in 0..(haystack.len() - (needle.len() - 1)) {
				990	if haystack[i] == needle[0]
				991	&& &haystack[i..(i + needle.len())] == needle
				992	{
				993	return Some(i);
				994	}
				995	}
				996
				997	None
				998	}
				999
				1000	quickcheck! {
				1001	fn qc_bm_equals_nieve_find(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
				1002	if pile1.len() == 0 \|\| pile2.len() == 0 {
				1003	return TestResult::discard();
				1004	}
				1005
				1006	let (needle, haystack) = if pile1.len() < pile2.len() {
				1007	(pile1, pile2.as_slice())
				1008	} else {
				1009	(pile2, pile1.as_slice())
				1010	};
				1011
				1012	let searcher = BoyerMooreSearch::new(needle.clone());
				1013	TestResult::from_bool(
				1014	searcher.find(haystack) == naive_find(&needle, haystack))
				1015	}
				1016
				1017	fn qc_bm_equals_single(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
				1018	if pile1.len() == 0 \|\| pile2.len() == 0 {
				1019	return TestResult::discard();
				1020	}
				1021
				1022	let (needle, haystack) = if pile1.len() < pile2.len() {
				1023	(pile1, pile2.as_slice())
				1024	} else {
				1025	(pile2, pile1.as_slice())
				1026	};
				1027
				1028	let bm_searcher = BoyerMooreSearch::new(needle.clone());
				1029	let freqy_memchr = FreqyPacked::new(needle);
				1030	TestResult::from_bool(
				1031	bm_searcher.find(haystack) == freqy_memchr.find(haystack))
				1032	}
				1033
				1034	fn qc_bm_finds_trailing_needle(
				1035	haystack_pre: Vec<u8>,
				1036	needle: Vec<u8>
				1037	) -> TestResult {
				1038	if needle.len() == 0 {
				1039	return TestResult::discard();
				1040	}
				1041
				1042	let mut haystack = haystack_pre.clone();
				1043	let searcher = BoyerMooreSearch::new(needle.clone());
				1044
				1045	if haystack.len() >= needle.len() &&
				1046	searcher.find(haystack.as_slice()).is_some() {
				1047	return TestResult::discard();
				1048	}
				1049
				1050	haystack.extend(needle.clone());
				1051
				1052	// What if the the tail of the haystack can start the
				1053	// needle?
				1054	let start = haystack_pre.len()
				1055	.checked_sub(needle.len())
				1056	.unwrap_or(0);
				1057	for i in 0..(needle.len() - 1) {
				1058	if searcher.find(&haystack[(i + start)..]).is_some() {
				1059	return TestResult::discard();
				1060	}
				1061	}
				1062
				1063	TestResult::from_bool(
				1064	searcher.find(haystack.as_slice())
				1065	.map(\|x\| x == haystack_pre.len())
				1066	.unwrap_or(false))
				1067	}
				1068
				1069	// qc_equals_* is only testing the negative case as @burntsushi
				1070	// pointed out in https://github.com/rust-lang/regex/issues/446.
				1071	// This quickcheck prop represents an effort to force testing of
				1072	// the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle
				1073	// already check some of the positive cases, but they don't cover
				1074	// cases where the needle is in the middle of haystack. This prop
				1075	// fills that hole.
				1076	fn qc_bm_finds_subslice(
				1077	haystack: Vec<u8>,
				1078	needle_start: usize,
				1079	needle_length: usize
				1080	) -> TestResult {
				1081	if haystack.len() == 0 {
				1082	return TestResult::discard();
				1083	}
				1084
				1085	let needle_start = needle_start % haystack.len();
				1086	let needle_length = needle_length % (haystack.len() - needle_start);
				1087
				1088	if needle_length == 0 {
				1089	return TestResult::discard();
				1090	}
				1091
				1092	let needle = &haystack[needle_start..(needle_start + needle_length)];
				1093
				1094	let bm_searcher = BoyerMooreSearch::new(needle.to_vec());
				1095
				1096	let start = naive_find(&needle, &haystack);
				1097	match start {
				1098	None => TestResult::from_bool(false),
				1099	Some(nf_start) =>
				1100	TestResult::from_bool(
				1101	nf_start <= needle_start
				1102	&& bm_searcher.find(&haystack) == start
				1103	)
				1104	}
				1105	}
				1106
				1107	fn qc_bm_finds_first(needle: Vec<u8>) -> TestResult {
				1108	if needle.len() == 0 {
				1109	return TestResult::discard();
				1110	}
				1111
				1112	let mut haystack = needle.clone();
				1113	let searcher = BoyerMooreSearch::new(needle.clone());
				1114	haystack.extend(needle);
				1115
				1116	TestResult::from_bool(
				1117	searcher.find(haystack.as_slice())
				1118	.map(\|x\| x == 0)
				1119	.unwrap_or(false))
				1120	}
				1121	}
				1122	}