Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 1 | macro_rules! define_set { |
| 2 | ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr, |
| 3 | $(#[$doc_regexset_example:meta])* ) => { |
| 4 | pub mod $name { |
| 5 | use std::fmt; |
| 6 | use std::iter; |
| 7 | use std::slice; |
| 8 | use std::vec; |
| 9 | |
| 10 | use error::Error; |
| 11 | use exec::Exec; |
| 12 | use re_builder::$builder_mod::RegexSetBuilder; |
| 13 | use re_trait::RegularExpression; |
| 14 | |
| 15 | /// Match multiple (possibly overlapping) regular expressions in a single scan. |
| 16 | /// |
| 17 | /// A regex set corresponds to the union of two or more regular expressions. |
| 18 | /// That is, a regex set will match text where at least one of its |
| 19 | /// constituent regular expressions matches. A regex set as its formulated here |
| 20 | /// provides a touch more power: it will also report *which* regular |
| 21 | /// expressions in the set match. Indeed, this is the key difference between |
| 22 | /// regex sets and a single `Regex` with many alternates, since only one |
| 23 | /// alternate can match at a time. |
| 24 | /// |
| 25 | /// For example, consider regular expressions to match email addresses and |
| 26 | /// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a |
| 27 | /// regex set is constructed from those regexes, then searching the text |
| 28 | /// `foo@example.com` will report both regexes as matching. Of course, one |
| 29 | /// could accomplish this by compiling each regex on its own and doing two |
| 30 | /// searches over the text. The key advantage of using a regex set is that it |
| 31 | /// will report the matching regexes using a *single pass through the text*. |
| 32 | /// If one has hundreds or thousands of regexes to match repeatedly (like a URL |
| 33 | /// router for a complex web application or a user agent matcher), then a regex |
| 34 | /// set can realize huge performance gains. |
| 35 | /// |
| 36 | /// # Example |
| 37 | /// |
| 38 | /// This shows how the above two regexes (for matching email addresses and |
| 39 | /// domains) might work: |
| 40 | /// |
| 41 | $(#[$doc_regexset_example])* |
| 42 | /// |
| 43 | /// Note that it would be possible to adapt the above example to using `Regex` |
| 44 | /// with an expression like: |
| 45 | /// |
| 46 | /// ```ignore |
| 47 | /// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net)) |
| 48 | /// ``` |
| 49 | /// |
| 50 | /// After a match, one could then inspect the capture groups to figure out |
| 51 | /// which alternates matched. The problem is that it is hard to make this |
| 52 | /// approach scale when there are many regexes since the overlap between each |
| 53 | /// alternate isn't always obvious to reason about. |
| 54 | /// |
| 55 | /// # Limitations |
| 56 | /// |
| 57 | /// Regex sets are limited to answering the following two questions: |
| 58 | /// |
| 59 | /// 1. Does any regex in the set match? |
| 60 | /// 2. If so, which regexes in the set match? |
| 61 | /// |
| 62 | /// As with the main `Regex` type, it is cheaper to ask (1) instead of (2) |
| 63 | /// since the matching engines can stop after the first match is found. |
| 64 | /// |
| 65 | /// Other features like finding the location of successive matches or their |
| 66 | /// sub-captures aren't supported. If you need this functionality, the |
| 67 | /// recommended approach is to compile each regex in the set independently and |
| 68 | /// selectively match them based on which regexes in the set matched. |
| 69 | /// |
| 70 | /// # Performance |
| 71 | /// |
| 72 | /// A `RegexSet` has the same performance characteristics as `Regex`. Namely, |
| 73 | /// search takes `O(mn)` time, where `m` is proportional to the size of the |
| 74 | /// regex set and `n` is proportional to the length of the search text. |
| 75 | #[derive(Clone)] |
| 76 | pub struct RegexSet(Exec); |
| 77 | |
| 78 | impl RegexSet { |
| 79 | /// Create a new regex set with the given regular expressions. |
| 80 | /// |
| 81 | /// This takes an iterator of `S`, where `S` is something that can produce |
| 82 | /// a `&str`. If any of the strings in the iterator are not valid regular |
| 83 | /// expressions, then an error is returned. |
| 84 | /// |
| 85 | /// # Example |
| 86 | /// |
| 87 | /// Create a new regex set from an iterator of strings: |
| 88 | /// |
| 89 | /// ```rust |
| 90 | /// # use regex::RegexSet; |
| 91 | /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); |
| 92 | /// assert!(set.is_match("foo")); |
| 93 | /// ``` |
| 94 | pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> |
| 95 | where S: AsRef<str>, I: IntoIterator<Item=S> { |
| 96 | RegexSetBuilder::new(exprs).build() |
| 97 | } |
| 98 | |
Chih-Hung Hsieh | 849e445 | 2020-10-26 13:16:47 -0700 | [diff] [blame^] | 99 | /// Create a new empty regex set. |
| 100 | /// |
| 101 | /// # Example |
| 102 | /// |
| 103 | /// ```rust |
| 104 | /// # use regex::RegexSet; |
| 105 | /// let set = RegexSet::empty(); |
| 106 | /// assert!(set.is_empty()); |
| 107 | /// ``` |
| 108 | pub fn empty() -> RegexSet { |
| 109 | RegexSetBuilder::new(&[""; 0]).build().unwrap() |
| 110 | } |
| 111 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 112 | /// Returns true if and only if one of the regexes in this set matches |
| 113 | /// the text given. |
| 114 | /// |
| 115 | /// This method should be preferred if you only need to test whether any |
| 116 | /// of the regexes in the set should match, but don't care about *which* |
| 117 | /// regexes matched. This is because the underlying matching engine will |
| 118 | /// quit immediately after seeing the first match instead of continuing to |
| 119 | /// find all matches. |
| 120 | /// |
| 121 | /// Note that as with searches using `Regex`, the expression is unanchored |
| 122 | /// by default. That is, if the regex does not start with `^` or `\A`, or |
| 123 | /// end with `$` or `\z`, then it is permitted to match anywhere in the |
| 124 | /// text. |
| 125 | /// |
| 126 | /// # Example |
| 127 | /// |
| 128 | /// Tests whether a set matches some text: |
| 129 | /// |
| 130 | /// ```rust |
| 131 | /// # use regex::RegexSet; |
| 132 | /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); |
| 133 | /// assert!(set.is_match("foo")); |
| 134 | /// assert!(!set.is_match("☃")); |
| 135 | /// ``` |
| 136 | pub fn is_match(&self, text: $text_ty) -> bool { |
| 137 | self.is_match_at(text, 0) |
| 138 | } |
| 139 | |
| 140 | /// Returns the same as is_match, but starts the search at the given |
| 141 | /// offset. |
| 142 | /// |
| 143 | /// The significance of the starting point is that it takes the surrounding |
| 144 | /// context into consideration. For example, the `\A` anchor can only |
| 145 | /// match when `start == 0`. |
| 146 | #[doc(hidden)] |
| 147 | pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool { |
| 148 | self.0.searcher().is_match_at($as_bytes(text), start) |
| 149 | } |
| 150 | |
| 151 | /// Returns the set of regular expressions that match in the given text. |
| 152 | /// |
| 153 | /// The set returned contains the index of each regular expression that |
| 154 | /// matches in the given text. The index is in correspondence with the |
| 155 | /// order of regular expressions given to `RegexSet`'s constructor. |
| 156 | /// |
| 157 | /// The set can also be used to iterate over the matched indices. |
| 158 | /// |
| 159 | /// Note that as with searches using `Regex`, the expression is unanchored |
| 160 | /// by default. That is, if the regex does not start with `^` or `\A`, or |
| 161 | /// end with `$` or `\z`, then it is permitted to match anywhere in the |
| 162 | /// text. |
| 163 | /// |
| 164 | /// # Example |
| 165 | /// |
| 166 | /// Tests which regular expressions match the given text: |
| 167 | /// |
| 168 | /// ```rust |
| 169 | /// # use regex::RegexSet; |
| 170 | /// let set = RegexSet::new(&[ |
| 171 | /// r"\w+", |
| 172 | /// r"\d+", |
| 173 | /// r"\pL+", |
| 174 | /// r"foo", |
| 175 | /// r"bar", |
| 176 | /// r"barfoo", |
| 177 | /// r"foobar", |
| 178 | /// ]).unwrap(); |
| 179 | /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); |
| 180 | /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); |
| 181 | /// |
| 182 | /// // You can also test whether a particular regex matched: |
| 183 | /// let matches = set.matches("foobar"); |
| 184 | /// assert!(!matches.matched(5)); |
| 185 | /// assert!(matches.matched(6)); |
| 186 | /// ``` |
| 187 | pub fn matches(&self, text: $text_ty) -> SetMatches { |
| 188 | let mut matches = vec![false; self.0.regex_strings().len()]; |
| 189 | let any = self.read_matches_at(&mut matches, text, 0); |
| 190 | SetMatches { |
| 191 | matched_any: any, |
| 192 | matches: matches, |
| 193 | } |
| 194 | } |
| 195 | |
| 196 | /// Returns the same as matches, but starts the search at the given |
| 197 | /// offset and stores the matches into the slice given. |
| 198 | /// |
| 199 | /// The significance of the starting point is that it takes the surrounding |
| 200 | /// context into consideration. For example, the `\A` anchor can only |
| 201 | /// match when `start == 0`. |
| 202 | /// |
| 203 | /// `matches` must have a length that is at least the number of regexes |
| 204 | /// in this set. |
| 205 | /// |
| 206 | /// This method returns true if and only if at least one member of |
| 207 | /// `matches` is true after executing the set against `text`. |
| 208 | #[doc(hidden)] |
| 209 | pub fn read_matches_at( |
| 210 | &self, |
| 211 | matches: &mut [bool], |
| 212 | text: $text_ty, |
| 213 | start: usize, |
| 214 | ) -> bool { |
| 215 | self.0.searcher().many_matches_at(matches, $as_bytes(text), start) |
| 216 | } |
| 217 | |
| 218 | /// Returns the total number of regular expressions in this set. |
| 219 | pub fn len(&self) -> usize { |
| 220 | self.0.regex_strings().len() |
| 221 | } |
| 222 | |
Chih-Hung Hsieh | 849e445 | 2020-10-26 13:16:47 -0700 | [diff] [blame^] | 223 | /// Returns `true` if this set contains no regular expressions. |
| 224 | pub fn is_empty(&self) -> bool { |
| 225 | self.0.regex_strings().is_empty() |
| 226 | } |
| 227 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 228 | /// Returns the patterns that this set will match on. |
| 229 | /// |
| 230 | /// This function can be used to determine the pattern for a match. The |
| 231 | /// slice returned has exactly as many patterns givens to this regex set, |
| 232 | /// and the order of the slice is the same as the order of the patterns |
| 233 | /// provided to the set. |
| 234 | /// |
| 235 | /// # Example |
| 236 | /// |
| 237 | /// ```rust |
| 238 | /// # use regex::RegexSet; |
| 239 | /// let set = RegexSet::new(&[ |
| 240 | /// r"\w+", |
| 241 | /// r"\d+", |
| 242 | /// r"\pL+", |
| 243 | /// r"foo", |
| 244 | /// r"bar", |
| 245 | /// r"barfoo", |
| 246 | /// r"foobar", |
| 247 | /// ]).unwrap(); |
| 248 | /// let matches: Vec<_> = set |
| 249 | /// .matches("foobar") |
| 250 | /// .into_iter() |
| 251 | /// .map(|match_idx| &set.patterns()[match_idx]) |
| 252 | /// .collect(); |
| 253 | /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); |
| 254 | /// ``` |
| 255 | pub fn patterns(&self) -> &[String] { |
| 256 | self.0.regex_strings() |
| 257 | } |
| 258 | } |
| 259 | |
| 260 | /// A set of matches returned by a regex set. |
| 261 | #[derive(Clone, Debug)] |
| 262 | pub struct SetMatches { |
| 263 | matched_any: bool, |
| 264 | matches: Vec<bool>, |
| 265 | } |
| 266 | |
| 267 | impl SetMatches { |
| 268 | /// Whether this set contains any matches. |
| 269 | pub fn matched_any(&self) -> bool { |
| 270 | self.matched_any |
| 271 | } |
| 272 | |
| 273 | /// Whether the regex at the given index matched. |
| 274 | /// |
| 275 | /// The index for a regex is determined by its insertion order upon the |
| 276 | /// initial construction of a `RegexSet`, starting at `0`. |
| 277 | /// |
| 278 | /// # Panics |
| 279 | /// |
| 280 | /// If `regex_index` is greater than or equal to `self.len()`. |
| 281 | pub fn matched(&self, regex_index: usize) -> bool { |
| 282 | self.matches[regex_index] |
| 283 | } |
| 284 | |
| 285 | /// The total number of regexes in the set that created these matches. |
| 286 | pub fn len(&self) -> usize { |
| 287 | self.matches.len() |
| 288 | } |
| 289 | |
| 290 | /// Returns an iterator over indexes in the regex that matched. |
| 291 | /// |
| 292 | /// This will always produces matches in ascending order of index, where |
| 293 | /// the index corresponds to the index of the regex that matched with |
| 294 | /// respect to its position when initially building the set. |
| 295 | pub fn iter(&self) -> SetMatchesIter { |
| 296 | SetMatchesIter((&*self.matches).into_iter().enumerate()) |
| 297 | } |
| 298 | } |
| 299 | |
| 300 | impl IntoIterator for SetMatches { |
| 301 | type IntoIter = SetMatchesIntoIter; |
| 302 | type Item = usize; |
| 303 | |
| 304 | fn into_iter(self) -> Self::IntoIter { |
| 305 | SetMatchesIntoIter(self.matches.into_iter().enumerate()) |
| 306 | } |
| 307 | } |
| 308 | |
| 309 | impl<'a> IntoIterator for &'a SetMatches { |
| 310 | type IntoIter = SetMatchesIter<'a>; |
| 311 | type Item = usize; |
| 312 | |
| 313 | fn into_iter(self) -> Self::IntoIter { |
| 314 | self.iter() |
| 315 | } |
| 316 | } |
| 317 | |
| 318 | /// An owned iterator over the set of matches from a regex set. |
| 319 | /// |
| 320 | /// This will always produces matches in ascending order of index, where the |
| 321 | /// index corresponds to the index of the regex that matched with respect to |
| 322 | /// its position when initially building the set. |
| 323 | pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>); |
| 324 | |
| 325 | impl Iterator for SetMatchesIntoIter { |
| 326 | type Item = usize; |
| 327 | |
| 328 | fn next(&mut self) -> Option<usize> { |
| 329 | loop { |
| 330 | match self.0.next() { |
| 331 | None => return None, |
| 332 | Some((_, false)) => {} |
| 333 | Some((i, true)) => return Some(i), |
| 334 | } |
| 335 | } |
| 336 | } |
| 337 | |
| 338 | fn size_hint(&self) -> (usize, Option<usize>) { |
| 339 | self.0.size_hint() |
| 340 | } |
| 341 | } |
| 342 | |
| 343 | impl DoubleEndedIterator for SetMatchesIntoIter { |
| 344 | fn next_back(&mut self) -> Option<usize> { |
| 345 | loop { |
| 346 | match self.0.next_back() { |
| 347 | None => return None, |
| 348 | Some((_, false)) => {} |
| 349 | Some((i, true)) => return Some(i), |
| 350 | } |
| 351 | } |
| 352 | } |
| 353 | } |
| 354 | |
| 355 | /// A borrowed iterator over the set of matches from a regex set. |
| 356 | /// |
| 357 | /// The lifetime `'a` refers to the lifetime of a `SetMatches` value. |
| 358 | /// |
| 359 | /// This will always produces matches in ascending order of index, where the |
| 360 | /// index corresponds to the index of the regex that matched with respect to |
| 361 | /// its position when initially building the set. |
| 362 | #[derive(Clone)] |
| 363 | pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>); |
| 364 | |
| 365 | impl<'a> Iterator for SetMatchesIter<'a> { |
| 366 | type Item = usize; |
| 367 | |
| 368 | fn next(&mut self) -> Option<usize> { |
| 369 | loop { |
| 370 | match self.0.next() { |
| 371 | None => return None, |
| 372 | Some((_, &false)) => {} |
| 373 | Some((i, &true)) => return Some(i), |
| 374 | } |
| 375 | } |
| 376 | } |
| 377 | |
| 378 | fn size_hint(&self) -> (usize, Option<usize>) { |
| 379 | self.0.size_hint() |
| 380 | } |
| 381 | } |
| 382 | |
| 383 | impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { |
| 384 | fn next_back(&mut self) -> Option<usize> { |
| 385 | loop { |
| 386 | match self.0.next_back() { |
| 387 | None => return None, |
| 388 | Some((_, &false)) => {} |
| 389 | Some((i, &true)) => return Some(i), |
| 390 | } |
| 391 | } |
| 392 | } |
| 393 | } |
| 394 | |
| 395 | #[doc(hidden)] |
| 396 | impl From<Exec> for RegexSet { |
| 397 | fn from(exec: Exec) -> Self { |
| 398 | RegexSet(exec) |
| 399 | } |
| 400 | } |
| 401 | |
| 402 | impl fmt::Debug for RegexSet { |
| 403 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| 404 | write!(f, "RegexSet({:?})", self.0.regex_strings()) |
| 405 | } |
| 406 | } |
| 407 | |
| 408 | #[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() } |
| 409 | #[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text } |
| 410 | } |
| 411 | } |
| 412 | } |
| 413 | |
| 414 | define_set! { |
| 415 | unicode, |
| 416 | set_unicode, |
| 417 | &str, |
| 418 | as_bytes_str, |
| 419 | /// ```rust |
| 420 | /// # use regex::RegexSet; |
| 421 | /// let set = RegexSet::new(&[ |
| 422 | /// r"[a-z]+@[a-z]+\.(com|org|net)", |
| 423 | /// r"[a-z]+\.(com|org|net)", |
| 424 | /// ]).unwrap(); |
| 425 | /// |
| 426 | /// // Ask whether any regexes in the set match. |
| 427 | /// assert!(set.is_match("foo@example.com")); |
| 428 | /// |
| 429 | /// // Identify which regexes in the set match. |
| 430 | /// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect(); |
| 431 | /// assert_eq!(vec![0, 1], matches); |
| 432 | /// |
| 433 | /// // Try again, but with text that only matches one of the regexes. |
| 434 | /// let matches: Vec<_> = set.matches("example.com").into_iter().collect(); |
| 435 | /// assert_eq!(vec![1], matches); |
| 436 | /// |
| 437 | /// // Try again, but with text that doesn't match any regex in the set. |
| 438 | /// let matches: Vec<_> = set.matches("example").into_iter().collect(); |
| 439 | /// assert!(matches.is_empty()); |
| 440 | /// ``` |
| 441 | } |
| 442 | |
| 443 | define_set! { |
| 444 | bytes, |
| 445 | set_bytes, |
| 446 | &[u8], |
| 447 | as_bytes_bytes, |
| 448 | /// ```rust |
| 449 | /// # use regex::bytes::RegexSet; |
| 450 | /// let set = RegexSet::new(&[ |
| 451 | /// r"[a-z]+@[a-z]+\.(com|org|net)", |
| 452 | /// r"[a-z]+\.(com|org|net)", |
| 453 | /// ]).unwrap(); |
| 454 | /// |
| 455 | /// // Ask whether any regexes in the set match. |
| 456 | /// assert!(set.is_match(b"foo@example.com")); |
| 457 | /// |
| 458 | /// // Identify which regexes in the set match. |
| 459 | /// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect(); |
| 460 | /// assert_eq!(vec![0, 1], matches); |
| 461 | /// |
| 462 | /// // Try again, but with text that only matches one of the regexes. |
| 463 | /// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect(); |
| 464 | /// assert_eq!(vec![1], matches); |
| 465 | /// |
| 466 | /// // Try again, but with text that doesn't match any regex in the set. |
| 467 | /// let matches: Vec<_> = set.matches(b"example").into_iter().collect(); |
| 468 | /// assert!(matches.is_empty()); |
| 469 | /// ``` |
| 470 | } |