blob: fc2b61a8ba606046a322b9b0e9dad838c1f317c1 [file] [log] [blame]
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -07001macro_rules! define_set {
2 ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
3 $(#[$doc_regexset_example:meta])* ) => {
4 pub mod $name {
5 use std::fmt;
6 use std::iter;
7 use std::slice;
8 use std::vec;
9
10 use error::Error;
11 use exec::Exec;
12 use re_builder::$builder_mod::RegexSetBuilder;
13 use re_trait::RegularExpression;
14
/// Match multiple (possibly overlapping) regular expressions in a single scan.
///
/// A regex set corresponds to the union of two or more regular expressions.
/// That is, a regex set will match text where at least one of its
/// constituent regular expressions matches. A regex set as it is formulated
/// here provides a touch more power: it will also report *which* regular
/// expressions in the set match. Indeed, this is the key difference between
/// regex sets and a single `Regex` with many alternates, since only one
/// alternate can match at a time.
///
/// For example, consider regular expressions to match email addresses and
/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
/// regex set is constructed from those regexes, then searching the text
/// `foo@example.com` will report both regexes as matching. Of course, one
/// could accomplish this by compiling each regex on its own and doing two
/// searches over the text. The key advantage of using a regex set is that it
/// will report the matching regexes using a *single pass through the text*.
/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
/// router for a complex web application or a user agent matcher), then a regex
/// set can realize huge performance gains.
///
/// # Example
///
/// This shows how the above two regexes (for matching email addresses and
/// domains) might work:
///
$(#[$doc_regexset_example])*
///
/// Note that it would be possible to adapt the above example to using `Regex`
/// with an expression like:
///
/// ```ignore
/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
/// ```
///
/// After a match, one could then inspect the capture groups to figure out
/// which alternates matched. The problem is that it is hard to make this
/// approach scale when there are many regexes since the overlap between each
/// alternate isn't always obvious to reason about.
///
/// # Limitations
///
/// Regex sets are limited to answering the following two questions:
///
/// 1. Does any regex in the set match?
/// 2. If so, which regexes in the set match?
///
/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
/// since the matching engines can stop after the first match is found.
///
/// Other features like finding the location of successive matches or their
/// sub-captures aren't supported. If you need this functionality, the
/// recommended approach is to compile each regex in the set independently and
/// selectively match them based on which regexes in the set matched.
///
/// # Performance
///
/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
/// search takes `O(mn)` time, where `m` is proportional to the size of the
/// regex set and `n` is proportional to the length of the search text.
#[derive(Clone)]
pub struct RegexSet(Exec);
77
78impl RegexSet {
79 /// Create a new regex set with the given regular expressions.
80 ///
81 /// This takes an iterator of `S`, where `S` is something that can produce
82 /// a `&str`. If any of the strings in the iterator are not valid regular
83 /// expressions, then an error is returned.
84 ///
85 /// # Example
86 ///
87 /// Create a new regex set from an iterator of strings:
88 ///
89 /// ```rust
90 /// # use regex::RegexSet;
91 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
92 /// assert!(set.is_match("foo"));
93 /// ```
94 pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
95 where S: AsRef<str>, I: IntoIterator<Item=S> {
96 RegexSetBuilder::new(exprs).build()
97 }
98
99 /// Returns true if and only if one of the regexes in this set matches
100 /// the text given.
101 ///
102 /// This method should be preferred if you only need to test whether any
103 /// of the regexes in the set should match, but don't care about *which*
104 /// regexes matched. This is because the underlying matching engine will
105 /// quit immediately after seeing the first match instead of continuing to
106 /// find all matches.
107 ///
108 /// Note that as with searches using `Regex`, the expression is unanchored
109 /// by default. That is, if the regex does not start with `^` or `\A`, or
110 /// end with `$` or `\z`, then it is permitted to match anywhere in the
111 /// text.
112 ///
113 /// # Example
114 ///
115 /// Tests whether a set matches some text:
116 ///
117 /// ```rust
118 /// # use regex::RegexSet;
119 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
120 /// assert!(set.is_match("foo"));
121 /// assert!(!set.is_match("☃"));
122 /// ```
123 pub fn is_match(&self, text: $text_ty) -> bool {
124 self.is_match_at(text, 0)
125 }
126
127 /// Returns the same as is_match, but starts the search at the given
128 /// offset.
129 ///
130 /// The significance of the starting point is that it takes the surrounding
131 /// context into consideration. For example, the `\A` anchor can only
132 /// match when `start == 0`.
133 #[doc(hidden)]
134 pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
135 self.0.searcher().is_match_at($as_bytes(text), start)
136 }
137
138 /// Returns the set of regular expressions that match in the given text.
139 ///
140 /// The set returned contains the index of each regular expression that
141 /// matches in the given text. The index is in correspondence with the
142 /// order of regular expressions given to `RegexSet`'s constructor.
143 ///
144 /// The set can also be used to iterate over the matched indices.
145 ///
146 /// Note that as with searches using `Regex`, the expression is unanchored
147 /// by default. That is, if the regex does not start with `^` or `\A`, or
148 /// end with `$` or `\z`, then it is permitted to match anywhere in the
149 /// text.
150 ///
151 /// # Example
152 ///
153 /// Tests which regular expressions match the given text:
154 ///
155 /// ```rust
156 /// # use regex::RegexSet;
157 /// let set = RegexSet::new(&[
158 /// r"\w+",
159 /// r"\d+",
160 /// r"\pL+",
161 /// r"foo",
162 /// r"bar",
163 /// r"barfoo",
164 /// r"foobar",
165 /// ]).unwrap();
166 /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
167 /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
168 ///
169 /// // You can also test whether a particular regex matched:
170 /// let matches = set.matches("foobar");
171 /// assert!(!matches.matched(5));
172 /// assert!(matches.matched(6));
173 /// ```
174 pub fn matches(&self, text: $text_ty) -> SetMatches {
175 let mut matches = vec![false; self.0.regex_strings().len()];
176 let any = self.read_matches_at(&mut matches, text, 0);
177 SetMatches {
178 matched_any: any,
179 matches: matches,
180 }
181 }
182
183 /// Returns the same as matches, but starts the search at the given
184 /// offset and stores the matches into the slice given.
185 ///
186 /// The significance of the starting point is that it takes the surrounding
187 /// context into consideration. For example, the `\A` anchor can only
188 /// match when `start == 0`.
189 ///
190 /// `matches` must have a length that is at least the number of regexes
191 /// in this set.
192 ///
193 /// This method returns true if and only if at least one member of
194 /// `matches` is true after executing the set against `text`.
195 #[doc(hidden)]
196 pub fn read_matches_at(
197 &self,
198 matches: &mut [bool],
199 text: $text_ty,
200 start: usize,
201 ) -> bool {
202 self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
203 }
204
205 /// Returns the total number of regular expressions in this set.
206 pub fn len(&self) -> usize {
207 self.0.regex_strings().len()
208 }
209
210 /// Returns the patterns that this set will match on.
211 ///
212 /// This function can be used to determine the pattern for a match. The
213 /// slice returned has exactly as many patterns givens to this regex set,
214 /// and the order of the slice is the same as the order of the patterns
215 /// provided to the set.
216 ///
217 /// # Example
218 ///
219 /// ```rust
220 /// # use regex::RegexSet;
221 /// let set = RegexSet::new(&[
222 /// r"\w+",
223 /// r"\d+",
224 /// r"\pL+",
225 /// r"foo",
226 /// r"bar",
227 /// r"barfoo",
228 /// r"foobar",
229 /// ]).unwrap();
230 /// let matches: Vec<_> = set
231 /// .matches("foobar")
232 /// .into_iter()
233 /// .map(|match_idx| &set.patterns()[match_idx])
234 /// .collect();
235 /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
236 /// ```
237 pub fn patterns(&self) -> &[String] {
238 self.0.regex_strings()
239 }
240}
241
242/// A set of matches returned by a regex set.
243#[derive(Clone, Debug)]
244pub struct SetMatches {
245 matched_any: bool,
246 matches: Vec<bool>,
247}
248
249impl SetMatches {
250 /// Whether this set contains any matches.
251 pub fn matched_any(&self) -> bool {
252 self.matched_any
253 }
254
255 /// Whether the regex at the given index matched.
256 ///
257 /// The index for a regex is determined by its insertion order upon the
258 /// initial construction of a `RegexSet`, starting at `0`.
259 ///
260 /// # Panics
261 ///
262 /// If `regex_index` is greater than or equal to `self.len()`.
263 pub fn matched(&self, regex_index: usize) -> bool {
264 self.matches[regex_index]
265 }
266
267 /// The total number of regexes in the set that created these matches.
268 pub fn len(&self) -> usize {
269 self.matches.len()
270 }
271
272 /// Returns an iterator over indexes in the regex that matched.
273 ///
274 /// This will always produces matches in ascending order of index, where
275 /// the index corresponds to the index of the regex that matched with
276 /// respect to its position when initially building the set.
277 pub fn iter(&self) -> SetMatchesIter {
278 SetMatchesIter((&*self.matches).into_iter().enumerate())
279 }
280}
281
282impl IntoIterator for SetMatches {
283 type IntoIter = SetMatchesIntoIter;
284 type Item = usize;
285
286 fn into_iter(self) -> Self::IntoIter {
287 SetMatchesIntoIter(self.matches.into_iter().enumerate())
288 }
289}
290
291impl<'a> IntoIterator for &'a SetMatches {
292 type IntoIter = SetMatchesIter<'a>;
293 type Item = usize;
294
295 fn into_iter(self) -> Self::IntoIter {
296 self.iter()
297 }
298}
299
/// An owned iterator over the set of matches from a regex set.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Debug)]
pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

impl Iterator for SetMatchesIntoIter {
    type Item = usize;

    /// Advances to the next flag that is `true` and returns its index, or
    /// `None` once the remaining flags are exhausted.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the upper bound of the
    /// underlying flag iterator.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // Flags that are `false` are skipped, so nothing guarantees that even
        // one more index will be yielded. Only the upper bound of the
        // underlying iterator carries over; its lower bound would overstate
        // how many items this iterator produces.
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl DoubleEndedIterator for SetMatchesIntoIter {
    /// Walks backwards to the last remaining flag that is `true` and returns
    /// its index, or `None` once the remaining flags are exhausted.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, matched)| matched).map(|(index, _)| index)
    }
}
336
/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Clone, Debug)]
pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

impl<'a> Iterator for SetMatchesIter<'a> {
    type Item = usize;

    /// Advances to the next flag that is `true` and returns its index, or
    /// `None` once the remaining flags are exhausted.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, &matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the upper bound of the
    /// underlying flag iterator.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // Flags that are `false` are skipped, so nothing guarantees that even
        // one more index will be yielded. Only the upper bound of the
        // underlying iterator carries over; its lower bound would overstate
        // how many items this iterator produces.
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
    /// Walks backwards to the last remaining flag that is `true` and returns
    /// its index, or `None` once the remaining flags are exhausted.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, &matched)| matched).map(|(index, _)| index)
    }
}
376
377#[doc(hidden)]
378impl From<Exec> for RegexSet {
379 fn from(exec: Exec) -> Self {
380 RegexSet(exec)
381 }
382}
383
384impl fmt::Debug for RegexSet {
385 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
386 write!(f, "RegexSet({:?})", self.0.regex_strings())
387 }
388}
389
/// Views a string slice as its underlying UTF-8 bytes.
// Each expansion of `define_set!` defines both `as_bytes_*` helpers but is
// handed only one of them, so the other would be flagged as unused.
#[allow(dead_code)]
fn as_bytes_str(text: &str) -> &[u8] {
    let bytes: &[u8] = text.as_bytes();
    bytes
}
/// Identity conversion: a byte slice is already in the form the matcher
/// consumes, so it is returned unchanged.
// Each expansion of `define_set!` defines both `as_bytes_*` helpers but is
// handed only one of them, so the other would be flagged as unused.
#[allow(dead_code)]
fn as_bytes_bytes(text: &[u8]) -> &[u8] {
    &text[..]
}
392 }
393 }
394}
395
// Expands `define_set!` into `pub mod unicode`. The arguments are, in order:
// the module name, the `re_builder` submodule that supplies
// `RegexSetBuilder` (`set_unicode`), the searched text type (`&str`), and the
// helper that converts that text to bytes (`as_bytes_str`). The trailing doc
// lines are spliced into the documentation of the generated `RegexSet`.
define_set! {
    unicode,
    set_unicode,
    &str,
    as_bytes_str,
/// ```rust
/// # use regex::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match("foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches("example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}
424
// Expands `define_set!` into `pub mod bytes`. The arguments are, in order:
// the module name, the `re_builder` submodule that supplies
// `RegexSetBuilder` (`set_bytes`), the searched text type (`&[u8]`), and the
// helper that converts that text to bytes (`as_bytes_bytes`). The trailing doc
// lines are spliced into the documentation of the generated `RegexSet`.
define_set! {
    bytes,
    set_bytes,
    &[u8],
    as_bytes_bytes,
/// ```rust
/// # use regex::bytes::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match(b"foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}