Blame - src/re_set.rs - platform/external/rust/crates/regex

blob: b8954be7f7f5b4ce36a148a07baca06918a3dd2b [file] [log] [blame]

Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1	macro_rules! define_set {
				2	($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
				3	$(#[$doc_regexset_example:meta])* ) => {
				4	pub mod $name {
				5	use std::fmt;
				6	use std::iter;
				7	use std::slice;
				8	use std::vec;
				9
				10	use error::Error;
				11	use exec::Exec;
				12	use re_builder::$builder_mod::RegexSetBuilder;
				13	use re_trait::RegularExpression;
				14
				15	/// Match multiple (possibly overlapping) regular expressions in a single scan.
				16	///
				17	/// A regex set corresponds to the union of two or more regular expressions.
				18	/// That is, a regex set will match text where at least one of its
				19	/// constituent regular expressions matches. A regex set as its formulated here
				20	/// provides a touch more power: it will also report which regular
				21	/// expressions in the set match. Indeed, this is the key difference between
				22	/// regex sets and a single `Regex` with many alternates, since only one
				23	/// alternate can match at a time.
				24	///
				25	/// For example, consider regular expressions to match email addresses and
				26	/// domains: `[a-z]+@[a-z]+\.(com\|org\|net)` and `[a-z]+\.(com\|org\|net)`. If a
				27	/// regex set is constructed from those regexes, then searching the text
				28	/// `foo@example.com` will report both regexes as matching. Of course, one
				29	/// could accomplish this by compiling each regex on its own and doing two
				30	/// searches over the text. The key advantage of using a regex set is that it
				31	/// will report the matching regexes using a single pass through the text.
				32	/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
				33	/// router for a complex web application or a user agent matcher), then a regex
				34	/// set can realize huge performance gains.
				35	///
				36	/// # Example
				37	///
				38	/// This shows how the above two regexes (for matching email addresses and
				39	/// domains) might work:
				40	///
				41	$(#[$doc_regexset_example])*
				42	///
				43	/// Note that it would be possible to adapt the above example to using `Regex`
				44	/// with an expression like:
				45	///
				46	/// ```ignore
				47	/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com\|org\|net)))\|(?P<domain>[a-z]+[.](com\|org\|net))
				48	/// ```
				49	///
				50	/// After a match, one could then inspect the capture groups to figure out
				51	/// which alternates matched. The problem is that it is hard to make this
				52	/// approach scale when there are many regexes since the overlap between each
				53	/// alternate isn't always obvious to reason about.
				54	///
				55	/// # Limitations
				56	///
				57	/// Regex sets are limited to answering the following two questions:
				58	///
				59	/// 1. Does any regex in the set match?
				60	/// 2. If so, which regexes in the set match?
				61	///
				62	/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
				63	/// since the matching engines can stop after the first match is found.
				64	///
				65	/// Other features like finding the location of successive matches or their
				66	/// sub-captures aren't supported. If you need this functionality, the
				67	/// recommended approach is to compile each regex in the set independently and
				68	/// selectively match them based on which regexes in the set matched.
				69	///
				70	/// # Performance
				71	///
				72	/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
				73	/// search takes `O(mn)` time, where `m` is proportional to the size of the
				74	/// regex set and `n` is proportional to the length of the search text.
				75	#[derive(Clone)]
				76	pub struct RegexSet(Exec);
				77
				78	impl RegexSet {
				79	/// Create a new regex set with the given regular expressions.
				80	///
				81	/// This takes an iterator of `S`, where `S` is something that can produce
				82	/// a `&str`. If any of the strings in the iterator are not valid regular
				83	/// expressions, then an error is returned.
				84	///
				85	/// # Example
				86	///
				87	/// Create a new regex set from an iterator of strings:
				88	///
				89	/// ```rust
				90	/// # use regex::RegexSet;
				91	/// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
				92	/// assert!(set.is_match("foo"));
				93	/// ```
				94	pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
				95	where S: AsRef<str>, I: IntoIterator<Item=S> {
				96	RegexSetBuilder::new(exprs).build()
				97	}
				98
Chih-Hung Hsieh	849e445	2020-10-26 13:16:47 -0700	[diff] [blame^]	99	/// Create a new empty regex set.
				100	///
				101	/// # Example
				102	///
				103	/// ```rust
				104	/// # use regex::RegexSet;
				105	/// let set = RegexSet::empty();
				106	/// assert!(set.is_empty());
				107	/// ```
				108	pub fn empty() -> RegexSet {
				109	RegexSetBuilder::new(&[""; 0]).build().unwrap()
				110	}
				111
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	112	/// Returns true if and only if one of the regexes in this set matches
				113	/// the text given.
				114	///
				115	/// This method should be preferred if you only need to test whether any
				116	/// of the regexes in the set should match, but don't care about which
				117	/// regexes matched. This is because the underlying matching engine will
				118	/// quit immediately after seeing the first match instead of continuing to
				119	/// find all matches.
				120	///
				121	/// Note that as with searches using `Regex`, the expression is unanchored
				122	/// by default. That is, if the regex does not start with `^` or `\A`, or
				123	/// end with `$` or `\z`, then it is permitted to match anywhere in the
				124	/// text.
				125	///
				126	/// # Example
				127	///
				128	/// Tests whether a set matches some text:
				129	///
				130	/// ```rust
				131	/// # use regex::RegexSet;
				132	/// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
				133	/// assert!(set.is_match("foo"));
				134	/// assert!(!set.is_match("☃"));
				135	/// ```
				136	pub fn is_match(&self, text: $text_ty) -> bool {
				137	self.is_match_at(text, 0)
				138	}
				139
				140	/// Returns the same as is_match, but starts the search at the given
				141	/// offset.
				142	///
				143	/// The significance of the starting point is that it takes the surrounding
				144	/// context into consideration. For example, the `\A` anchor can only
				145	/// match when `start == 0`.
				146	#[doc(hidden)]
				147	pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
				148	self.0.searcher().is_match_at($as_bytes(text), start)
				149	}
				150
				151	/// Returns the set of regular expressions that match in the given text.
				152	///
				153	/// The set returned contains the index of each regular expression that
				154	/// matches in the given text. The index is in correspondence with the
				155	/// order of regular expressions given to `RegexSet`'s constructor.
				156	///
				157	/// The set can also be used to iterate over the matched indices.
				158	///
				159	/// Note that as with searches using `Regex`, the expression is unanchored
				160	/// by default. That is, if the regex does not start with `^` or `\A`, or
				161	/// end with `$` or `\z`, then it is permitted to match anywhere in the
				162	/// text.
				163	///
				164	/// # Example
				165	///
				166	/// Tests which regular expressions match the given text:
				167	///
				168	/// ```rust
				169	/// # use regex::RegexSet;
				170	/// let set = RegexSet::new(&[
				171	/// r"\w+",
				172	/// r"\d+",
				173	/// r"\pL+",
				174	/// r"foo",
				175	/// r"bar",
				176	/// r"barfoo",
				177	/// r"foobar",
				178	/// ]).unwrap();
				179	/// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
				180	/// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
				181	///
				182	/// // You can also test whether a particular regex matched:
				183	/// let matches = set.matches("foobar");
				184	/// assert!(!matches.matched(5));
				185	/// assert!(matches.matched(6));
				186	/// ```
				187	pub fn matches(&self, text: $text_ty) -> SetMatches {
				188	let mut matches = vec![false; self.0.regex_strings().len()];
				189	let any = self.read_matches_at(&mut matches, text, 0);
				190	SetMatches {
				191	matched_any: any,
				192	matches: matches,
				193	}
				194	}
				195
				196	/// Returns the same as matches, but starts the search at the given
				197	/// offset and stores the matches into the slice given.
				198	///
				199	/// The significance of the starting point is that it takes the surrounding
				200	/// context into consideration. For example, the `\A` anchor can only
				201	/// match when `start == 0`.
				202	///
				203	/// `matches` must have a length that is at least the number of regexes
				204	/// in this set.
				205	///
				206	/// This method returns true if and only if at least one member of
				207	/// `matches` is true after executing the set against `text`.
				208	#[doc(hidden)]
				209	pub fn read_matches_at(
				210	&self,
				211	matches: &mut [bool],
				212	text: $text_ty,
				213	start: usize,
				214	) -> bool {
				215	self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
				216	}
				217
				218	/// Returns the total number of regular expressions in this set.
				219	pub fn len(&self) -> usize {
				220	self.0.regex_strings().len()
				221	}
				222
Chih-Hung Hsieh	849e445	2020-10-26 13:16:47 -0700	[diff] [blame^]	223	/// Returns `true` if this set contains no regular expressions.
				224	pub fn is_empty(&self) -> bool {
				225	self.0.regex_strings().is_empty()
				226	}
				227
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	228	/// Returns the patterns that this set will match on.
				229	///
				230	/// This function can be used to determine the pattern for a match. The
				231	/// slice returned has exactly as many patterns givens to this regex set,
				232	/// and the order of the slice is the same as the order of the patterns
				233	/// provided to the set.
				234	///
				235	/// # Example
				236	///
				237	/// ```rust
				238	/// # use regex::RegexSet;
				239	/// let set = RegexSet::new(&[
				240	/// r"\w+",
				241	/// r"\d+",
				242	/// r"\pL+",
				243	/// r"foo",
				244	/// r"bar",
				245	/// r"barfoo",
				246	/// r"foobar",
				247	/// ]).unwrap();
				248	/// let matches: Vec<_> = set
				249	/// .matches("foobar")
				250	/// .into_iter()
				251	/// .map(\|match_idx\| &set.patterns()[match_idx])
				252	/// .collect();
				253	/// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
				254	/// ```
				255	pub fn patterns(&self) -> &[String] {
				256	self.0.regex_strings()
				257	}
				258	}
				259
				260	/// A set of matches returned by a regex set.
				261	#[derive(Clone, Debug)]
				262	pub struct SetMatches {
				263	matched_any: bool,
				264	matches: Vec<bool>,
				265	}
				266
				267	impl SetMatches {
				268	/// Whether this set contains any matches.
				269	pub fn matched_any(&self) -> bool {
				270	self.matched_any
				271	}
				272
				273	/// Whether the regex at the given index matched.
				274	///
				275	/// The index for a regex is determined by its insertion order upon the
				276	/// initial construction of a `RegexSet`, starting at `0`.
				277	///
				278	/// # Panics
				279	///
				280	/// If `regex_index` is greater than or equal to `self.len()`.
				281	pub fn matched(&self, regex_index: usize) -> bool {
				282	self.matches[regex_index]
				283	}
				284
				285	/// The total number of regexes in the set that created these matches.
				286	pub fn len(&self) -> usize {
				287	self.matches.len()
				288	}
				289
				290	/// Returns an iterator over indexes in the regex that matched.
				291	///
				292	/// This will always produces matches in ascending order of index, where
				293	/// the index corresponds to the index of the regex that matched with
				294	/// respect to its position when initially building the set.
				295	pub fn iter(&self) -> SetMatchesIter {
				296	SetMatchesIter((&*self.matches).into_iter().enumerate())
				297	}
				298	}
				299
				300	impl IntoIterator for SetMatches {
				301	type IntoIter = SetMatchesIntoIter;
				302	type Item = usize;
				303
				304	fn into_iter(self) -> Self::IntoIter {
				305	SetMatchesIntoIter(self.matches.into_iter().enumerate())
				306	}
				307	}
				308
				309	impl<'a> IntoIterator for &'a SetMatches {
				310	type IntoIter = SetMatchesIter<'a>;
				311	type Item = usize;
				312
				313	fn into_iter(self) -> Self::IntoIter {
				314	self.iter()
				315	}
				316	}
				317
				318	/// An owned iterator over the set of matches from a regex set.
				319	///
				320	/// This will always produces matches in ascending order of index, where the
				321	/// index corresponds to the index of the regex that matched with respect to
				322	/// its position when initially building the set.
				323	pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);
				324
				325	impl Iterator for SetMatchesIntoIter {
				326	type Item = usize;
				327
				328	fn next(&mut self) -> Option<usize> {
				329	loop {
				330	match self.0.next() {
				331	None => return None,
				332	Some((_, false)) => {}
				333	Some((i, true)) => return Some(i),
				334	}
				335	}
				336	}
				337
				338	fn size_hint(&self) -> (usize, Option<usize>) {
				339	self.0.size_hint()
				340	}
				341	}
				342
				343	impl DoubleEndedIterator for SetMatchesIntoIter {
				344	fn next_back(&mut self) -> Option<usize> {
				345	loop {
				346	match self.0.next_back() {
				347	None => return None,
				348	Some((_, false)) => {}
				349	Some((i, true)) => return Some(i),
				350	}
				351	}
				352	}
				353	}
				354
				355	/// A borrowed iterator over the set of matches from a regex set.
				356	///
				357	/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
				358	///
				359	/// This will always produces matches in ascending order of index, where the
				360	/// index corresponds to the index of the regex that matched with respect to
				361	/// its position when initially building the set.
				362	#[derive(Clone)]
				363	pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);
				364
				365	impl<'a> Iterator for SetMatchesIter<'a> {
				366	type Item = usize;
				367
				368	fn next(&mut self) -> Option<usize> {
				369	loop {
				370	match self.0.next() {
				371	None => return None,
				372	Some((_, &false)) => {}
				373	Some((i, &true)) => return Some(i),
				374	}
				375	}
				376	}
				377
				378	fn size_hint(&self) -> (usize, Option<usize>) {
				379	self.0.size_hint()
				380	}
				381	}
				382
				383	impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
				384	fn next_back(&mut self) -> Option<usize> {
				385	loop {
				386	match self.0.next_back() {
				387	None => return None,
				388	Some((_, &false)) => {}
				389	Some((i, &true)) => return Some(i),
				390	}
				391	}
				392	}
				393	}
				394
				395	#[doc(hidden)]
				396	impl From<Exec> for RegexSet {
				397	fn from(exec: Exec) -> Self {
				398	RegexSet(exec)
				399	}
				400	}
				401
				402	impl fmt::Debug for RegexSet {
				403	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
				404	write!(f, "RegexSet({:?})", self.0.regex_strings())
				405	}
				406	}
				407
				408	#[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() }
				409	#[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text }
				410	}
				411	}
				412	}
				413
				414	define_set! {
				415	unicode,
				416	set_unicode,
				417	&str,
				418	as_bytes_str,
				419	/// ```rust
				420	/// # use regex::RegexSet;
				421	/// let set = RegexSet::new(&[
				422	/// r"[a-z]+@[a-z]+\.(com\|org\|net)",
				423	/// r"[a-z]+\.(com\|org\|net)",
				424	/// ]).unwrap();
				425	///
				426	/// // Ask whether any regexes in the set match.
				427	/// assert!(set.is_match("foo@example.com"));
				428	///
				429	/// // Identify which regexes in the set match.
				430	/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
				431	/// assert_eq!(vec![0, 1], matches);
				432	///
				433	/// // Try again, but with text that only matches one of the regexes.
				434	/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
				435	/// assert_eq!(vec![1], matches);
				436	///
				437	/// // Try again, but with text that doesn't match any regex in the set.
				438	/// let matches: Vec<_> = set.matches("example").into_iter().collect();
				439	/// assert!(matches.is_empty());
				440	/// ```
				441	}
				442
				443	define_set! {
				444	bytes,
				445	set_bytes,
				446	&[u8],
				447	as_bytes_bytes,
				448	/// ```rust
				449	/// # use regex::bytes::RegexSet;
				450	/// let set = RegexSet::new(&[
				451	/// r"[a-z]+@[a-z]+\.(com\|org\|net)",
				452	/// r"[a-z]+\.(com\|org\|net)",
				453	/// ]).unwrap();
				454	///
				455	/// // Ask whether any regexes in the set match.
				456	/// assert!(set.is_match(b"foo@example.com"));
				457	///
				458	/// // Identify which regexes in the set match.
				459	/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
				460	/// assert_eq!(vec![0, 1], matches);
				461	///
				462	/// // Try again, but with text that only matches one of the regexes.
				463	/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
				464	/// assert_eq!(vec![1], matches);
				465	///
				466	/// // Try again, but with text that doesn't match any regex in the set.
				467	/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
				468	/// assert!(matches.is_empty());
				469	/// ```
				470	}