Blame - src/prog.rs - platform/external/rust/crates/regex

blob: 74e5f2f6f80b9186fba17ae4e38b4ee38c6972ac [file] [log] [blame]

Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame^]	1	use std::cmp::Ordering;
				2	use std::collections::HashMap;
				3	use std::fmt;
				4	use std::mem;
				5	use std::ops::Deref;
				6	use std::slice;
				7	use std::sync::Arc;
				8
				9	use input::Char;
				10	use literal::LiteralSearcher;
				11
				12	/// `InstPtr` represents the index of an instruction in a regex program.
				13	pub type InstPtr = usize;
				14
				15	/// Program is a sequence of instructions and various facts about thos
				16	/// instructions.
				17	#[derive(Clone)]
				18	pub struct Program {
				19	/// A sequence of instructions that represents an NFA.
				20	pub insts: Vec<Inst>,
				21	/// Pointers to each Match instruction in the sequence.
				22	///
				23	/// This is always length 1 unless this program represents a regex set.
				24	pub matches: Vec<InstPtr>,
				25	/// The ordered sequence of all capture groups extracted from the AST.
				26	/// Unnamed groups are `None`.
				27	pub captures: Vec<Option<String>>,
				28	/// Pointers to all named capture groups into `captures`.
				29	pub capture_name_idx: Arc<HashMap<String, usize>>,
				30	/// A pointer to the start instruction. This can vary depending on how
				31	/// the program was compiled. For example, programs for use with the DFA
				32	/// engine have a `.*?` inserted at the beginning of unanchored regular
				33	/// expressions. The actual starting point of the program is after the
				34	/// `.*?`.
				35	pub start: InstPtr,
				36	/// A set of equivalence classes for discriminating bytes in the compiled
				37	/// program.
				38	pub byte_classes: Vec<u8>,
				39	/// When true, this program can only match valid UTF-8.
				40	pub only_utf8: bool,
				41	/// When true, this program uses byte range instructions instead of Unicode
				42	/// range instructions.
				43	pub is_bytes: bool,
				44	/// When true, the program is compiled for DFA matching. For example, this
				45	/// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
				46	/// regexes.
				47	pub is_dfa: bool,
				48	/// When true, the program matches text in reverse (for use only in the
				49	/// DFA).
				50	pub is_reverse: bool,
				51	/// Whether the regex must match from the start of the input.
				52	pub is_anchored_start: bool,
				53	/// Whether the regex must match at the end of the input.
				54	pub is_anchored_end: bool,
				55	/// Whether this program contains a Unicode word boundary instruction.
				56	pub has_unicode_word_boundary: bool,
				57	/// A possibly empty machine for very quickly matching prefix literals.
				58	pub prefixes: LiteralSearcher,
				59	/// A limit on the size of the cache that the DFA is allowed to use while
				60	/// matching.
				61	///
				62	/// The cache limit specifies approximately how much space we're willing to
				63	/// give to the state cache. Once the state cache exceeds the size, it is
				64	/// wiped and all states must be re-computed.
				65	///
				66	/// Note that this value does not impact correctness. It can be set to 0
				67	/// and the DFA will run just fine. (It will only ever store exactly one
				68	/// state in the cache, and will likely run very slowly, but it will work.)
				69	///
				70	/// Also note that this limit is per thread of execution. That is,
				71	/// if the same regex is used to search text across multiple threads
				72	/// simultaneously, then the DFA cache is not shared. Instead, copies are
				73	/// made.
				74	pub dfa_size_limit: usize,
				75	}
				76
				77	impl Program {
				78	/// Creates an empty instruction sequence. Fields are given default
				79	/// values.
				80	pub fn new() -> Self {
				81	Program {
				82	insts: vec![],
				83	matches: vec![],
				84	captures: vec![],
				85	capture_name_idx: Arc::new(HashMap::new()),
				86	start: 0,
				87	byte_classes: vec![0; 256],
				88	only_utf8: true,
				89	is_bytes: false,
				90	is_dfa: false,
				91	is_reverse: false,
				92	is_anchored_start: false,
				93	is_anchored_end: false,
				94	has_unicode_word_boundary: false,
				95	prefixes: LiteralSearcher::empty(),
				96	dfa_size_limit: 2 * (1 << 20),
				97	}
				98	}
				99
				100	/// If pc is an index to a no-op instruction (like Save), then return the
				101	/// next pc that is not a no-op instruction.
				102	pub fn skip(&self, mut pc: usize) -> usize {
				103	loop {
				104	match self[pc] {
				105	Inst::Save(ref i) => pc = i.goto,
				106	_ => return pc,
				107	}
				108	}
				109	}
				110
				111	/// Return true if and only if an execution engine at instruction `pc` will
				112	/// always lead to a match.
				113	pub fn leads_to_match(&self, pc: usize) -> bool {
				114	if self.matches.len() > 1 {
				115	// If we have a regex set, then we have more than one ending
				116	// state, so leading to one of those states is generally
				117	// meaningless.
				118	return false;
				119	}
				120	match self[self.skip(pc)] {
				121	Inst::Match(_) => true,
				122	_ => false,
				123	}
				124	}
				125
				126	/// Returns true if the current configuration demands that an implicit
				127	/// `.*?` be prepended to the instruction sequence.
				128	pub fn needs_dotstar(&self) -> bool {
				129	self.is_dfa && !self.is_reverse && !self.is_anchored_start
				130	}
				131
				132	/// Returns true if this program uses Byte instructions instead of
				133	/// Char/Range instructions.
				134	pub fn uses_bytes(&self) -> bool {
				135	self.is_bytes \|\| self.is_dfa
				136	}
				137
				138	/// Returns true if this program exclusively matches valid UTF-8 bytes.
				139	///
				140	/// That is, if an invalid UTF-8 byte is seen, then no match is possible.
				141	pub fn only_utf8(&self) -> bool {
				142	self.only_utf8
				143	}
				144
				145	/// Return the approximate heap usage of this instruction sequence in
				146	/// bytes.
				147	pub fn approximate_size(&self) -> usize {
				148	// The only instruction that uses heap space is Ranges (for
				149	// Unicode codepoint programs) to store non-overlapping codepoint
				150	// ranges. To keep this operation constant time, we ignore them.
				151	(self.len() * mem::size_of::<Inst>())
				152	+ (self.matches.len() * mem::size_of::<InstPtr>())
				153	+ (self.captures.len() * mem::size_of::<Option<String>>())
				154	+ (self.capture_name_idx.len()
				155	* (mem::size_of::<String>() + mem::size_of::<usize>()))
				156	+ (self.byte_classes.len() * mem::size_of::<u8>())
				157	+ self.prefixes.approximate_size()
				158	}
				159	}
				160
				161	impl Deref for Program {
				162	type Target = [Inst];
				163
				164	#[cfg_attr(feature = "perf-inline", inline(always))]
				165	fn deref(&self) -> &Self::Target {
				166	&*self.insts
				167	}
				168	}
				169
				170	impl fmt::Debug for Program {
				171	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
				172	use self::Inst::*;
				173
				174	fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
				175	if goto == cur + 1 {
				176	fmtd
				177	} else {
				178	format!("{} (goto: {})", fmtd, goto)
				179	}
				180	}
				181
				182	fn visible_byte(b: u8) -> String {
				183	use std::ascii::escape_default;
				184	let escaped = escape_default(b).collect::<Vec<u8>>();
				185	String::from_utf8_lossy(&escaped).into_owned()
				186	}
				187
				188	for (pc, inst) in self.iter().enumerate() {
				189	match *inst {
				190	Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
				191	Save(ref inst) => {
				192	let s = format!("{:04} Save({})", pc, inst.slot);
				193	write!(f, "{}", with_goto(pc, inst.goto, s))?;
				194	}
				195	Split(ref inst) => {
				196	write!(
				197	f,
				198	"{:04} Split({}, {})",
				199	pc, inst.goto1, inst.goto2
				200	)?;
				201	}
				202	EmptyLook(ref inst) => {
				203	let s = format!("{:?}", inst.look);
				204	write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
				205	}
				206	Char(ref inst) => {
				207	let s = format!("{:?}", inst.c);
				208	write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
				209	}
				210	Ranges(ref inst) => {
				211	let ranges = inst
				212	.ranges
				213	.iter()
				214	.map(\|r\| format!("{:?}-{:?}", r.0, r.1))
				215	.collect::<Vec<String>>()
				216	.join(", ");
				217	write!(
				218	f,
				219	"{:04} {}",
				220	pc,
				221	with_goto(pc, inst.goto, ranges)
				222	)?;
				223	}
				224	Bytes(ref inst) => {
				225	let s = format!(
				226	"Bytes({}, {})",
				227	visible_byte(inst.start),
				228	visible_byte(inst.end)
				229	);
				230	write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
				231	}
				232	}
				233	if pc == self.start {
				234	write!(f, " (start)")?;
				235	}
				236	write!(f, "\n")?;
				237	}
				238	Ok(())
				239	}
				240	}
				241
				242	impl<'a> IntoIterator for &'a Program {
				243	type Item = &'a Inst;
				244	type IntoIter = slice::Iter<'a, Inst>;
				245	fn into_iter(self) -> Self::IntoIter {
				246	self.iter()
				247	}
				248	}
				249
				250	/// Inst is an instruction code in a Regex program.
				251	///
				252	/// Regrettably, a regex program either contains Unicode codepoint
				253	/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
				254	/// A regex program can never contain both.
				255	///
				256	/// It would be worth investigating splitting this into two distinct types and
				257	/// then figuring out how to make the matching engines polymorphic over those
				258	/// types without sacrificing performance.
				259	///
				260	/// Other than the benefit of moving invariants into the type system, another
				261	/// benefit is the decreased size. If we remove the `Char` and `Ranges`
				262	/// instructions from the `Inst` enum, then its size shrinks from 40 bytes to
				263	/// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges`
				264	/// variant.) Given that byte based machines are typically much bigger than
				265	/// their Unicode analogues (because they can decode UTF-8 directly), this ends
				266	/// up being a pretty significant savings.
				267	#[derive(Clone, Debug)]
				268	pub enum Inst {
				269	/// Match indicates that the program has reached a match state.
				270	///
				271	/// The number in the match corresponds to the Nth logical regular
				272	/// expression in this program. This index is always 0 for normal regex
				273	/// programs. Values greater than 0 appear when compiling regex sets, and
				274	/// each match instruction gets its own unique value. The value corresponds
				275	/// to the Nth regex in the set.
				276	Match(usize),
				277	/// Save causes the program to save the current location of the input in
				278	/// the slot indicated by InstSave.
				279	Save(InstSave),
				280	/// Split causes the program to diverge to one of two paths in the
				281	/// program, preferring goto1 in InstSplit.
				282	Split(InstSplit),
				283	/// EmptyLook represents a zero-width assertion in a regex program. A
				284	/// zero-width assertion does not consume any of the input text.
				285	EmptyLook(InstEmptyLook),
				286	/// Char requires the regex program to match the character in InstChar at
				287	/// the current position in the input.
				288	Char(InstChar),
				289	/// Ranges requires the regex program to match the character at the current
				290	/// position in the input with one of the ranges specified in InstRanges.
				291	Ranges(InstRanges),
				292	/// Bytes is like Ranges, except it expresses a single byte range. It is
				293	/// used in conjunction with Split instructions to implement multi-byte
				294	/// character classes.
				295	Bytes(InstBytes),
				296	}
				297
				298	impl Inst {
				299	/// Returns true if and only if this is a match instruction.
				300	pub fn is_match(&self) -> bool {
				301	match *self {
				302	Inst::Match(_) => true,
				303	_ => false,
				304	}
				305	}
				306	}
				307
				308	/// Representation of the Save instruction.
				309	#[derive(Clone, Debug)]
				310	pub struct InstSave {
				311	/// The next location to execute in the program.
				312	pub goto: InstPtr,
				313	/// The capture slot (there are two slots for every capture in a regex,
				314	/// including the zeroth capture for the entire match).
				315	pub slot: usize,
				316	}
				317
				318	/// Representation of the Split instruction.
				319	#[derive(Clone, Debug)]
				320	pub struct InstSplit {
				321	/// The first instruction to try. A match resulting from following goto1
				322	/// has precedence over a match resulting from following goto2.
				323	pub goto1: InstPtr,
				324	/// The second instruction to try. A match resulting from following goto1
				325	/// has precedence over a match resulting from following goto2.
				326	pub goto2: InstPtr,
				327	}
				328
				329	/// Representation of the `EmptyLook` instruction.
				330	#[derive(Clone, Debug)]
				331	pub struct InstEmptyLook {
				332	/// The next location to execute in the program if this instruction
				333	/// succeeds.
				334	pub goto: InstPtr,
				335	/// The type of zero-width assertion to check.
				336	pub look: EmptyLook,
				337	}
				338
				339	/// The set of zero-width match instructions.
				340	#[derive(Clone, Copy, Debug, PartialEq, Eq)]
				341	pub enum EmptyLook {
				342	/// Start of line or input.
				343	StartLine,
				344	/// End of line or input.
				345	EndLine,
				346	/// Start of input.
				347	StartText,
				348	/// End of input.
				349	EndText,
				350	/// Word character on one side and non-word character on other.
				351	WordBoundary,
				352	/// Word character on both sides or non-word character on both sides.
				353	NotWordBoundary,
				354	/// ASCII word boundary.
				355	WordBoundaryAscii,
				356	/// Not ASCII word boundary.
				357	NotWordBoundaryAscii,
				358	}
				359
				360	/// Representation of the Char instruction.
				361	#[derive(Clone, Debug)]
				362	pub struct InstChar {
				363	/// The next location to execute in the program if this instruction
				364	/// succeeds.
				365	pub goto: InstPtr,
				366	/// The character to test.
				367	pub c: char,
				368	}
				369
				370	/// Representation of the Ranges instruction.
				371	#[derive(Clone, Debug)]
				372	pub struct InstRanges {
				373	/// The next location to execute in the program if this instruction
				374	/// succeeds.
				375	pub goto: InstPtr,
				376	/// The set of Unicode scalar value ranges to test.
				377	pub ranges: Vec<(char, char)>,
				378	}
				379
				380	impl InstRanges {
				381	/// Tests whether the given input character matches this instruction.
				382	pub fn matches(&self, c: Char) -> bool {
				383	// This speeds up the `match_class_unicode` benchmark by checking
				384	// some common cases quickly without binary search. e.g., Matching
				385	// a Unicode class on predominantly ASCII text.
				386	for r in self.ranges.iter().take(4) {
				387	if c < r.0 {
				388	return false;
				389	}
				390	if c <= r.1 {
				391	return true;
				392	}
				393	}
				394	self.ranges
				395	.binary_search_by(\|r\| {
				396	if r.1 < c {
				397	Ordering::Less
				398	} else if r.0 > c {
				399	Ordering::Greater
				400	} else {
				401	Ordering::Equal
				402	}
				403	})
				404	.is_ok()
				405	}
				406
				407	/// Return the number of distinct characters represented by all of the
				408	/// ranges.
				409	pub fn num_chars(&self) -> usize {
				410	self.ranges
				411	.iter()
				412	.map(\|&(s, e)\| 1 + (e as u32) - (s as u32))
				413	.sum::<u32>() as usize
				414	}
				415	}
				416
				417	/// Representation of the Bytes instruction.
				418	#[derive(Clone, Debug)]
				419	pub struct InstBytes {
				420	/// The next location to execute in the program if this instruction
				421	/// succeeds.
				422	pub goto: InstPtr,
				423	/// The start (inclusive) of this byte range.
				424	pub start: u8,
				425	/// The end (inclusive) of this byte range.
				426	pub end: u8,
				427	}
				428
				429	impl InstBytes {
				430	/// Returns true if and only if the given byte is in this range.
				431	pub fn matches(&self, byte: u8) -> bool {
				432	self.start <= byte && byte <= self.end
				433	}
				434	}