blob: 2b1aa5c4c408f4342bf0d51d8240ac1958663584 [file] [log] [blame]
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -07001use std::io;
2
Joel Galenson82068452021-05-19 14:19:20 -07003use crate::automaton::Automaton;
4use crate::buffer::Buffer;
5use crate::dfa::{self, DFA};
6use crate::error::Result;
7use crate::nfa::{self, NFA};
8use crate::packed;
9use crate::prefilter::{Prefilter, PrefilterState};
10use crate::state_id::StateID;
11use crate::Match;
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -070012
13/// An automaton for searching multiple strings in linear time.
14///
15/// The `AhoCorasick` type supports a few basic ways of constructing an
16/// automaton, including
17/// [`AhoCorasick::new`](struct.AhoCorasick.html#method.new)
18/// and
19/// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
20/// However, there are a fair number of configurable options that can be set
21/// by using
22/// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
23/// instead. Such options include, but are not limited to, how matches are
24/// determined, simple case insensitivity, whether to use a DFA or not and
25/// various knobs for controlling the space-vs-time trade offs taken when
26/// building the automaton.
27///
28/// If you aren't sure where to start, try beginning with
29/// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
30///
31/// # Resource usage
32///
33/// Aho-Corasick automatons are always constructed in `O(p)` time, where `p`
34/// is the combined length of all patterns being searched. With that said,
35/// building an automaton can be fairly costly because of high constant
36/// factors, particularly when enabling the
37/// [DFA](struct.AhoCorasickBuilder.html#method.dfa)
38/// option (which is disabled by default). For this reason, it's generally a
39/// good idea to build an automaton once and reuse it as much as possible.
40///
41/// Aho-Corasick automatons can also use a fair bit of memory. To get a
42/// concrete idea of how much memory is being used, try using the
43/// [`AhoCorasick::heap_bytes`](struct.AhoCorasick.html#method.heap_bytes)
44/// method.
45///
46/// # Examples
47///
48/// This example shows how to search for occurrences of multiple patterns
49/// simultaneously in a case insensitive fashion. Each match includes the
50/// pattern that matched along with the byte offsets of the match.
51///
52/// ```
53/// use aho_corasick::AhoCorasickBuilder;
54///
55/// let patterns = &["apple", "maple", "snapple"];
56/// let haystack = "Nobody likes maple in their apple flavored Snapple.";
57///
58/// let ac = AhoCorasickBuilder::new()
59/// .ascii_case_insensitive(true)
60/// .build(patterns);
61/// let mut matches = vec![];
62/// for mat in ac.find_iter(haystack) {
63/// matches.push((mat.pattern(), mat.start(), mat.end()));
64/// }
65/// assert_eq!(matches, vec![
66/// (1, 13, 18),
67/// (0, 28, 33),
68/// (2, 43, 50),
69/// ]);
70/// ```
71///
72/// This example shows how to replace matches with some other string:
73///
74/// ```
75/// use aho_corasick::AhoCorasick;
76///
77/// let patterns = &["fox", "brown", "quick"];
78/// let haystack = "The quick brown fox.";
79/// let replace_with = &["sloth", "grey", "slow"];
80///
81/// let ac = AhoCorasick::new(patterns);
82/// let result = ac.replace_all(haystack, replace_with);
83/// assert_eq!(result, "The slow grey sloth.");
84/// ```
85#[derive(Clone, Debug)]
86pub struct AhoCorasick<S: StateID = usize> {
87 imp: Imp<S>,
88 match_kind: MatchKind,
89}
90
91impl AhoCorasick {
92 /// Create a new Aho-Corasick automaton using the default configuration.
93 ///
94 /// The default configuration optimizes for less space usage, but at the
95 /// expense of longer search times. To change the configuration, use
96 /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
97 /// for fine-grained control, or
98 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
99 /// for automatic configuration if you aren't sure which settings to pick.
100 ///
101 /// This uses the default
102 /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
103 /// match semantics, which reports a match as soon as it is found. This
104 /// corresponds to the standard match semantics supported by textbook
105 /// descriptions of the Aho-Corasick algorithm.
106 ///
107 /// # Examples
108 ///
109 /// Basic usage:
110 ///
111 /// ```
112 /// use aho_corasick::AhoCorasick;
113 ///
114 /// let ac = AhoCorasick::new(&[
115 /// "foo", "bar", "baz",
116 /// ]);
117 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
118 /// ```
119 pub fn new<I, P>(patterns: I) -> AhoCorasick
120 where
121 I: IntoIterator<Item = P>,
122 P: AsRef<[u8]>,
123 {
124 AhoCorasickBuilder::new().build(patterns)
125 }
126
127 /// Build an Aho-Corasick automaton with an automatically determined
128 /// configuration.
129 ///
130 /// Specifically, this requires a slice of patterns instead of an iterator
131 /// since the configuration is determined by looking at the patterns before
132 /// constructing the automaton. The idea here is to balance space and time
133 /// automatically. That is, when searching a small number of patterns, this
134 /// will attempt to use the fastest possible configuration since the total
135 /// space required will be small anyway. As the number of patterns grows,
136 /// this will fall back to slower configurations that use less space.
137 ///
138 /// If you want auto configuration but with match semantics different from
139 /// the default `MatchKind::Standard`, then use
140 /// [`AhoCorasickBuilder::auto_configure`](struct.AhoCorasickBuilder.html#method.auto_configure).
141 ///
142 /// # Examples
143 ///
144 /// Basic usage is just like `new`, except you must provide a slice:
145 ///
146 /// ```
147 /// use aho_corasick::AhoCorasick;
148 ///
149 /// let ac = AhoCorasick::new_auto_configured(&[
150 /// "foo", "bar", "baz",
151 /// ]);
152 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
153 /// ```
154 pub fn new_auto_configured<B>(patterns: &[B]) -> AhoCorasick
155 where
156 B: AsRef<[u8]>,
157 {
158 AhoCorasickBuilder::new().auto_configure(patterns).build(patterns)
159 }
160}
161
162impl<S: StateID> AhoCorasick<S> {
163 /// Returns true if and only if this automaton matches the haystack at any
164 /// position.
165 ///
166 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
167 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
168 /// `&[u8]` itself.
169 ///
170 /// # Examples
171 ///
172 /// Basic usage:
173 ///
174 /// ```
175 /// use aho_corasick::AhoCorasick;
176 ///
177 /// let ac = AhoCorasick::new(&[
178 /// "foo", "bar", "quux", "baz",
179 /// ]);
180 /// assert!(ac.is_match("xxx bar xxx"));
181 /// assert!(!ac.is_match("xxx qux xxx"));
182 /// ```
183 pub fn is_match<B: AsRef<[u8]>>(&self, haystack: B) -> bool {
184 self.earliest_find(haystack).is_some()
185 }
186
187 /// Returns the location of the first detected match in `haystack`.
188 ///
189 /// This method has the same behavior regardless of the
190 /// [`MatchKind`](enum.MatchKind.html)
191 /// of this automaton.
192 ///
193 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
194 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
195 /// `&[u8]` itself.
196 ///
197 /// # Examples
198 ///
199 /// Basic usage:
200 ///
201 /// ```
202 /// use aho_corasick::AhoCorasick;
203 ///
204 /// let ac = AhoCorasick::new(&[
205 /// "abc", "b",
206 /// ]);
207 /// let mat = ac.earliest_find("abcd").expect("should have match");
208 /// assert_eq!(1, mat.pattern());
209 /// assert_eq!((1, 2), (mat.start(), mat.end()));
210 /// ```
211 pub fn earliest_find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
212 let mut prestate = PrefilterState::new(self.max_pattern_len());
213 let mut start = self.imp.start_state();
214 self.imp.earliest_find_at(
215 &mut prestate,
216 haystack.as_ref(),
217 0,
218 &mut start,
219 )
220 }
221
222 /// Returns the location of the first match according to the match
223 /// semantics that this automaton was constructed with.
224 ///
225 /// When using `MatchKind::Standard`, this corresponds precisely to the
226 /// same behavior as
227 /// [`earliest_find`](struct.AhoCorasick.html#method.earliest_find).
228 /// Otherwise, match semantics correspond to either
229 /// [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst)
230 /// or
231 /// [leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest).
232 ///
233 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
234 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
235 /// `&[u8]` itself.
236 ///
237 /// # Examples
238 ///
239 /// Basic usage, with standard semantics:
240 ///
241 /// ```
242 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
243 ///
244 /// let patterns = &["b", "abc", "abcd"];
245 /// let haystack = "abcd";
246 ///
247 /// let ac = AhoCorasickBuilder::new()
248 /// .match_kind(MatchKind::Standard) // default, not necessary
249 /// .build(patterns);
250 /// let mat = ac.find(haystack).expect("should have a match");
251 /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
252 /// ```
253 ///
254 /// Now with leftmost-first semantics:
255 ///
256 /// ```
257 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
258 ///
259 /// let patterns = &["b", "abc", "abcd"];
260 /// let haystack = "abcd";
261 ///
262 /// let ac = AhoCorasickBuilder::new()
263 /// .match_kind(MatchKind::LeftmostFirst)
264 /// .build(patterns);
265 /// let mat = ac.find(haystack).expect("should have a match");
266 /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
267 /// ```
268 ///
269 /// And finally, leftmost-longest semantics:
270 ///
271 /// ```
272 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
273 ///
274 /// let patterns = &["b", "abc", "abcd"];
275 /// let haystack = "abcd";
276 ///
277 /// let ac = AhoCorasickBuilder::new()
278 /// .match_kind(MatchKind::LeftmostLongest)
279 /// .build(patterns);
280 /// let mat = ac.find(haystack).expect("should have a match");
281 /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
282 /// ```
283 pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
284 let mut prestate = PrefilterState::new(self.max_pattern_len());
285 self.imp.find_at_no_state(&mut prestate, haystack.as_ref(), 0)
286 }
287
288 /// Returns an iterator of non-overlapping matches, using the match
289 /// semantics that this automaton was constructed with.
290 ///
291 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
292 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
293 /// `&[u8]` itself.
294 ///
295 /// # Examples
296 ///
297 /// Basic usage, with standard semantics:
298 ///
299 /// ```
300 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
301 ///
302 /// let patterns = &["append", "appendage", "app"];
303 /// let haystack = "append the app to the appendage";
304 ///
305 /// let ac = AhoCorasickBuilder::new()
306 /// .match_kind(MatchKind::Standard) // default, not necessary
307 /// .build(patterns);
308 /// let matches: Vec<usize> = ac
309 /// .find_iter(haystack)
310 /// .map(|mat| mat.pattern())
311 /// .collect();
312 /// assert_eq!(vec![2, 2, 2], matches);
313 /// ```
314 ///
315 /// Now with leftmost-first semantics:
316 ///
317 /// ```
318 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
319 ///
320 /// let patterns = &["append", "appendage", "app"];
321 /// let haystack = "append the app to the appendage";
322 ///
323 /// let ac = AhoCorasickBuilder::new()
324 /// .match_kind(MatchKind::LeftmostFirst)
325 /// .build(patterns);
326 /// let matches: Vec<usize> = ac
327 /// .find_iter(haystack)
328 /// .map(|mat| mat.pattern())
329 /// .collect();
330 /// assert_eq!(vec![0, 2, 0], matches);
331 /// ```
332 ///
333 /// And finally, leftmost-longest semantics:
334 ///
335 /// ```
336 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
337 ///
338 /// let patterns = &["append", "appendage", "app"];
339 /// let haystack = "append the app to the appendage";
340 ///
341 /// let ac = AhoCorasickBuilder::new()
342 /// .match_kind(MatchKind::LeftmostLongest)
343 /// .build(patterns);
344 /// let matches: Vec<usize> = ac
345 /// .find_iter(haystack)
346 /// .map(|mat| mat.pattern())
347 /// .collect();
348 /// assert_eq!(vec![0, 2, 1], matches);
349 /// ```
350 pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
351 &'a self,
352 haystack: &'b B,
353 ) -> FindIter<'a, 'b, S> {
354 FindIter::new(self, haystack.as_ref())
355 }
356
357 /// Returns an iterator of overlapping matches in the given `haystack`.
358 ///
359 /// Overlapping matches can _only_ be detected using
360 /// `MatchKind::Standard` semantics. If this automaton was constructed with
361 /// leftmost semantics, then this method will panic. To determine whether
362 /// this will panic at runtime, use the
363 /// [`AhoCorasick::supports_overlapping`](struct.AhoCorasick.html#method.supports_overlapping)
364 /// method.
365 ///
366 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
367 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
368 /// `&[u8]` itself.
369 ///
370 /// # Panics
371 ///
372 /// This panics when `AhoCorasick::supports_overlapping` returns `false`.
373 /// That is, this panics when this automaton's match semantics are not
374 /// `MatchKind::Standard`.
375 ///
376 /// # Examples
377 ///
378 /// Basic usage, with standard semantics:
379 ///
380 /// ```
381 /// use aho_corasick::AhoCorasick;
382 ///
383 /// let patterns = &["append", "appendage", "app"];
384 /// let haystack = "append the app to the appendage";
385 ///
386 /// let ac = AhoCorasick::new(patterns);
387 /// let matches: Vec<usize> = ac
388 /// .find_overlapping_iter(haystack)
389 /// .map(|mat| mat.pattern())
390 /// .collect();
391 /// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches);
392 /// ```
393 pub fn find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
394 &'a self,
395 haystack: &'b B,
396 ) -> FindOverlappingIter<'a, 'b, S> {
397 FindOverlappingIter::new(self, haystack.as_ref())
398 }
399
400 /// Replace all matches with a corresponding value in the `replace_with`
401 /// slice given. Matches correspond to the same matches as reported by
402 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
403 ///
404 /// Replacements are determined by the index of the matching pattern.
405 /// For example, if the pattern with index `2` is found, then it is
406 /// replaced by `replace_with[2]`.
407 ///
408 /// # Panics
409 ///
410 /// This panics when `replace_with.len()` does not equal the total number
411 /// of patterns that are matched by this automaton.
412 ///
413 /// # Examples
414 ///
415 /// Basic usage:
416 ///
417 /// ```
418 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
419 ///
420 /// let patterns = &["append", "appendage", "app"];
421 /// let haystack = "append the app to the appendage";
422 ///
423 /// let ac = AhoCorasickBuilder::new()
424 /// .match_kind(MatchKind::LeftmostFirst)
425 /// .build(patterns);
426 /// let result = ac.replace_all(haystack, &["x", "y", "z"]);
427 /// assert_eq!("x the z to the xage", result);
428 /// ```
429 pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String
430 where
431 B: AsRef<str>,
432 {
433 assert_eq!(
434 replace_with.len(),
435 self.pattern_count(),
436 "replace_all requires a replacement for every pattern \
437 in the automaton"
438 );
439 let mut dst = String::with_capacity(haystack.len());
440 self.replace_all_with(haystack, &mut dst, |mat, _, dst| {
441 dst.push_str(replace_with[mat.pattern()].as_ref());
442 true
443 });
444 dst
445 }
446
447 /// Replace all matches using raw bytes with a corresponding value in the
448 /// `replace_with` slice given. Matches correspond to the same matches as
449 /// reported by [`find_iter`](struct.AhoCorasick.html#method.find_iter).
450 ///
451 /// Replacements are determined by the index of the matching pattern.
452 /// For example, if the pattern with index `2` is found, then it is
453 /// replaced by `replace_with[2]`.
454 ///
455 /// # Panics
456 ///
457 /// This panics when `replace_with.len()` does not equal the total number
458 /// of patterns that are matched by this automaton.
459 ///
460 /// # Examples
461 ///
462 /// Basic usage:
463 ///
464 /// ```
465 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
466 ///
467 /// let patterns = &["append", "appendage", "app"];
468 /// let haystack = b"append the app to the appendage";
469 ///
470 /// let ac = AhoCorasickBuilder::new()
471 /// .match_kind(MatchKind::LeftmostFirst)
472 /// .build(patterns);
473 /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]);
474 /// assert_eq!(b"x the z to the xage".to_vec(), result);
475 /// ```
476 pub fn replace_all_bytes<B>(
477 &self,
478 haystack: &[u8],
479 replace_with: &[B],
480 ) -> Vec<u8>
481 where
482 B: AsRef<[u8]>,
483 {
484 assert_eq!(
485 replace_with.len(),
486 self.pattern_count(),
487 "replace_all_bytes requires a replacement for every pattern \
488 in the automaton"
489 );
490 let mut dst = Vec::with_capacity(haystack.len());
491 self.replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| {
492 dst.extend(replace_with[mat.pattern()].as_ref());
493 true
494 });
495 dst
496 }
497
498 /// Replace all matches using a closure called on each match.
499 /// Matches correspond to the same matches as reported by
500 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
501 ///
502 /// The closure accepts three parameters: the match found, the text of
503 /// the match and a string buffer with which to write the replaced text
504 /// (if any). If the closure returns `true`, then it continues to the next
Haibo Huang5b2c4082020-07-10 20:22:37 -0700505 /// match. If the closure returns `false`, then searching is stopped.
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -0700506 ///
507 /// # Examples
508 ///
509 /// Basic usage:
510 ///
511 /// ```
512 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
513 ///
514 /// let patterns = &["append", "appendage", "app"];
515 /// let haystack = "append the app to the appendage";
516 ///
517 /// let ac = AhoCorasickBuilder::new()
518 /// .match_kind(MatchKind::LeftmostFirst)
519 /// .build(patterns);
520 /// let mut result = String::new();
521 /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
522 /// dst.push_str(&mat.pattern().to_string());
523 /// true
524 /// });
525 /// assert_eq!("0 the 2 to the 0age", result);
526 /// ```
Haibo Huang5b2c4082020-07-10 20:22:37 -0700527 ///
528 /// Stopping the replacement by returning `false` (continued from the
529 /// example above):
530 ///
531 /// ```
532 /// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
533 /// # let patterns = &["append", "appendage", "app"];
534 /// # let haystack = "append the app to the appendage";
535 /// # let ac = AhoCorasickBuilder::new()
536 /// # .match_kind(MatchKind::LeftmostFirst)
537 /// # .build(patterns);
538 /// let mut result = String::new();
539 /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
540 /// dst.push_str(&mat.pattern().to_string());
541 /// mat.pattern() != 2
542 /// });
543 /// assert_eq!("0 the 2 to the appendage", result);
544 /// ```
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -0700545 pub fn replace_all_with<F>(
546 &self,
547 haystack: &str,
548 dst: &mut String,
549 mut replace_with: F,
550 ) where
551 F: FnMut(&Match, &str, &mut String) -> bool,
552 {
553 let mut last_match = 0;
554 for mat in self.find_iter(haystack) {
555 dst.push_str(&haystack[last_match..mat.start()]);
556 last_match = mat.end();
Haibo Huang5b2c4082020-07-10 20:22:37 -0700557 if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
558 break;
559 };
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -0700560 }
561 dst.push_str(&haystack[last_match..]);
562 }
563
564 /// Replace all matches using raw bytes with a closure called on each
565 /// match. Matches correspond to the same matches as reported by
566 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
567 ///
568 /// The closure accepts three parameters: the match found, the text of
569 /// the match and a byte buffer with which to write the replaced text
570 /// (if any). If the closure returns `true`, then it continues to the next
Haibo Huang5b2c4082020-07-10 20:22:37 -0700571 /// match. If the closure returns `false`, then searching is stopped.
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -0700572 ///
573 /// # Examples
574 ///
575 /// Basic usage:
576 ///
577 /// ```
578 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
579 ///
580 /// let patterns = &["append", "appendage", "app"];
581 /// let haystack = b"append the app to the appendage";
582 ///
583 /// let ac = AhoCorasickBuilder::new()
584 /// .match_kind(MatchKind::LeftmostFirst)
585 /// .build(patterns);
586 /// let mut result = vec![];
587 /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
588 /// dst.extend(mat.pattern().to_string().bytes());
589 /// true
590 /// });
591 /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
592 /// ```
Haibo Huang5b2c4082020-07-10 20:22:37 -0700593 ///
594 /// Stopping the replacement by returning `false` (continued from the
595 /// example above):
596 ///
597 /// ```
598 /// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
599 /// # let patterns = &["append", "appendage", "app"];
600 /// # let haystack = b"append the app to the appendage";
601 /// # let ac = AhoCorasickBuilder::new()
602 /// # .match_kind(MatchKind::LeftmostFirst)
603 /// # .build(patterns);
604 /// let mut result = vec![];
605 /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
606 /// dst.extend(mat.pattern().to_string().bytes());
607 /// mat.pattern() != 2
608 /// });
609 /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result);
610 /// ```
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -0700611 pub fn replace_all_with_bytes<F>(
612 &self,
613 haystack: &[u8],
614 dst: &mut Vec<u8>,
615 mut replace_with: F,
616 ) where
617 F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
618 {
619 let mut last_match = 0;
620 for mat in self.find_iter(haystack) {
621 dst.extend(&haystack[last_match..mat.start()]);
622 last_match = mat.end();
Haibo Huang5b2c4082020-07-10 20:22:37 -0700623 if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
624 break;
625 };
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -0700626 }
627 dst.extend(&haystack[last_match..]);
628 }
629
630 /// Returns an iterator of non-overlapping matches in the given
631 /// stream. Matches correspond to the same matches as reported by
632 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
633 ///
634 /// The matches yielded by this iterator use absolute position offsets in
635 /// the stream given, where the first byte has index `0`. Matches are
636 /// yieled until the stream is exhausted.
637 ///
638 /// Each item yielded by the iterator is an `io::Result<Match>`, where an
639 /// error is yielded if there was a problem reading from the reader given.
640 ///
641 /// When searching a stream, an internal buffer is used. Therefore, callers
642 /// should avoiding providing a buffered reader, if possible.
643 ///
644 /// Searching a stream requires that the automaton was built with
645 /// `MatchKind::Standard` semantics. If this automaton was constructed
646 /// with leftmost semantics, then this method will panic. To determine
647 /// whether this will panic at runtime, use the
648 /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
649 /// method.
650 ///
651 /// # Memory usage
652 ///
653 /// In general, searching streams will use a constant amount of memory for
654 /// its internal buffer. The one requirement is that the internal buffer
655 /// must be at least the size of the longest possible match. In most use
656 /// cases, the default buffer size will be much larger than any individual
657 /// match.
658 ///
659 /// # Panics
660 ///
661 /// This panics when `AhoCorasick::supports_stream` returns `false`.
662 /// That is, this panics when this automaton's match semantics are not
663 /// `MatchKind::Standard`. This restriction may be lifted in the future.
664 ///
665 /// # Examples
666 ///
667 /// Basic usage:
668 ///
669 /// ```
670 /// use aho_corasick::AhoCorasick;
671 ///
672 /// # fn example() -> Result<(), ::std::io::Error> {
673 /// let patterns = &["append", "appendage", "app"];
674 /// let haystack = "append the app to the appendage";
675 ///
676 /// let ac = AhoCorasick::new(patterns);
677 /// let mut matches = vec![];
678 /// for result in ac.stream_find_iter(haystack.as_bytes()) {
679 /// let mat = result?;
680 /// matches.push(mat.pattern());
681 /// }
682 /// assert_eq!(vec![2, 2, 2], matches);
683 /// # Ok(()) }; example().unwrap()
684 /// ```
685 pub fn stream_find_iter<'a, R: io::Read>(
686 &'a self,
687 rdr: R,
688 ) -> StreamFindIter<'a, R, S> {
689 StreamFindIter::new(self, rdr)
690 }
691
692 /// Search for and replace all matches of this automaton in
693 /// the given reader, and write the replacements to the given
694 /// writer. Matches correspond to the same matches as reported by
695 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
696 ///
697 /// Replacements are determined by the index of the matching pattern.
698 /// For example, if the pattern with index `2` is found, then it is
699 /// replaced by `replace_with[2]`.
700 ///
701 /// After all matches are replaced, the writer is _not_ flushed.
702 ///
703 /// If there was a problem reading from the given reader or writing to the
704 /// given writer, then the corresponding `io::Error` is returned and all
705 /// replacement is stopped.
706 ///
707 /// When searching a stream, an internal buffer is used. Therefore, callers
708 /// should avoiding providing a buffered reader, if possible. However,
709 /// callers may want to provide a buffered writer.
710 ///
711 /// Searching a stream requires that the automaton was built with
712 /// `MatchKind::Standard` semantics. If this automaton was constructed
713 /// with leftmost semantics, then this method will panic. To determine
714 /// whether this will panic at runtime, use the
715 /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
716 /// method.
717 ///
718 /// # Memory usage
719 ///
720 /// In general, searching streams will use a constant amount of memory for
721 /// its internal buffer. The one requirement is that the internal buffer
722 /// must be at least the size of the longest possible match. In most use
723 /// cases, the default buffer size will be much larger than any individual
724 /// match.
725 ///
726 /// # Panics
727 ///
728 /// This panics when `AhoCorasick::supports_stream` returns `false`.
729 /// That is, this panics when this automaton's match semantics are not
730 /// `MatchKind::Standard`. This restriction may be lifted in the future.
731 ///
732 /// # Examples
733 ///
734 /// Basic usage:
735 ///
736 /// ```
737 /// use aho_corasick::AhoCorasick;
738 ///
739 /// # fn example() -> Result<(), ::std::io::Error> {
740 /// let patterns = &["fox", "brown", "quick"];
741 /// let haystack = "The quick brown fox.";
742 /// let replace_with = &["sloth", "grey", "slow"];
743 ///
744 /// let ac = AhoCorasick::new(patterns);
745 /// let mut result = vec![];
746 /// ac.stream_replace_all(haystack.as_bytes(), &mut result, replace_with)?;
747 /// assert_eq!(b"The slow grey sloth.".to_vec(), result);
748 /// # Ok(()) }; example().unwrap()
749 /// ```
750 pub fn stream_replace_all<R, W, B>(
751 &self,
752 rdr: R,
753 wtr: W,
754 replace_with: &[B],
755 ) -> io::Result<()>
756 where
757 R: io::Read,
758 W: io::Write,
759 B: AsRef<[u8]>,
760 {
761 assert_eq!(
762 replace_with.len(),
763 self.pattern_count(),
764 "stream_replace_all requires a replacement for every pattern \
765 in the automaton"
766 );
767 self.stream_replace_all_with(rdr, wtr, |mat, _, wtr| {
768 wtr.write_all(replace_with[mat.pattern()].as_ref())
769 })
770 }
771
772 /// Search the given reader and replace all matches of this automaton
773 /// using the given closure. The result is written to the given
774 /// writer. Matches correspond to the same matches as reported by
775 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
776 ///
777 /// The closure accepts three parameters: the match found, the text of
Haibo Huang5b2c4082020-07-10 20:22:37 -0700778 /// the match and the writer with which to write the replaced text (if any).
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -0700779 ///
780 /// After all matches are replaced, the writer is _not_ flushed.
781 ///
782 /// If there was a problem reading from the given reader or writing to the
783 /// given writer, then the corresponding `io::Error` is returned and all
784 /// replacement is stopped.
785 ///
786 /// When searching a stream, an internal buffer is used. Therefore, callers
787 /// should avoiding providing a buffered reader, if possible. However,
788 /// callers may want to provide a buffered writer.
789 ///
790 /// Searching a stream requires that the automaton was built with
791 /// `MatchKind::Standard` semantics. If this automaton was constructed
792 /// with leftmost semantics, then this method will panic. To determine
793 /// whether this will panic at runtime, use the
794 /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
795 /// method.
796 ///
797 /// # Memory usage
798 ///
799 /// In general, searching streams will use a constant amount of memory for
800 /// its internal buffer. The one requirement is that the internal buffer
801 /// must be at least the size of the longest possible match. In most use
802 /// cases, the default buffer size will be much larger than any individual
803 /// match.
804 ///
805 /// # Panics
806 ///
807 /// This panics when `AhoCorasick::supports_stream` returns `false`.
808 /// That is, this panics when this automaton's match semantics are not
809 /// `MatchKind::Standard`. This restriction may be lifted in the future.
810 ///
811 /// # Examples
812 ///
813 /// Basic usage:
814 ///
815 /// ```
816 /// use std::io::Write;
817 /// use aho_corasick::AhoCorasick;
818 ///
819 /// # fn example() -> Result<(), ::std::io::Error> {
820 /// let patterns = &["fox", "brown", "quick"];
821 /// let haystack = "The quick brown fox.";
822 ///
823 /// let ac = AhoCorasick::new(patterns);
824 /// let mut result = vec![];
825 /// ac.stream_replace_all_with(
826 /// haystack.as_bytes(),
827 /// &mut result,
828 /// |mat, _, wtr| {
829 /// wtr.write_all(mat.pattern().to_string().as_bytes())
830 /// },
831 /// )?;
832 /// assert_eq!(b"The 2 1 0.".to_vec(), result);
833 /// # Ok(()) }; example().unwrap()
834 /// ```
835 pub fn stream_replace_all_with<R, W, F>(
836 &self,
837 rdr: R,
838 mut wtr: W,
839 mut replace_with: F,
840 ) -> io::Result<()>
841 where
842 R: io::Read,
843 W: io::Write,
844 F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>,
845 {
846 let mut it = StreamChunkIter::new(self, rdr);
847 while let Some(result) = it.next() {
848 let chunk = result?;
849 match chunk {
850 StreamChunk::NonMatch { bytes, .. } => {
851 wtr.write_all(bytes)?;
852 }
853 StreamChunk::Match { bytes, mat } => {
854 replace_with(&mat, bytes, &mut wtr)?;
855 }
856 }
857 }
858 Ok(())
859 }
860
861 /// Returns the match kind used by this automaton.
862 ///
863 /// # Examples
864 ///
865 /// Basic usage:
866 ///
867 /// ```
868 /// use aho_corasick::{AhoCorasick, MatchKind};
869 ///
870 /// let ac = AhoCorasick::new(&[
871 /// "foo", "bar", "quux", "baz",
872 /// ]);
873 /// assert_eq!(&MatchKind::Standard, ac.match_kind());
874 /// ```
875 pub fn match_kind(&self) -> &MatchKind {
876 self.imp.match_kind()
877 }
878
879 /// Returns the length of the longest pattern matched by this automaton.
880 ///
881 /// # Examples
882 ///
883 /// Basic usage:
884 ///
885 /// ```
886 /// use aho_corasick::AhoCorasick;
887 ///
888 /// let ac = AhoCorasick::new(&[
889 /// "foo", "bar", "quux", "baz",
890 /// ]);
891 /// assert_eq!(4, ac.max_pattern_len());
892 /// ```
893 pub fn max_pattern_len(&self) -> usize {
894 self.imp.max_pattern_len()
895 }
896
897 /// Return the total number of patterns matched by this automaton.
898 ///
899 /// This includes patterns that may never participate in a match. For
900 /// example, if
901 /// [`MatchKind::LeftmostFirst`](enum.MatchKind.html#variant.LeftmostFirst)
902 /// match semantics are used, and the patterns `Sam` and `Samwise` were
903 /// used to build the automaton, then `Samwise` can never participate in a
904 /// match because `Sam` will always take priority.
905 ///
906 /// # Examples
907 ///
908 /// Basic usage:
909 ///
910 /// ```
911 /// use aho_corasick::AhoCorasick;
912 ///
913 /// let ac = AhoCorasick::new(&[
914 /// "foo", "bar", "baz",
915 /// ]);
916 /// assert_eq!(3, ac.pattern_count());
917 /// ```
918 pub fn pattern_count(&self) -> usize {
919 self.imp.pattern_count()
920 }
921
922 /// Returns true if and only if this automaton supports reporting
923 /// overlapping matches.
924 ///
925 /// If this returns false and overlapping matches are requested, then it
926 /// will result in a panic.
927 ///
928 /// Since leftmost matching is inherently incompatible with overlapping
929 /// matches, only
930 /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
931 /// supports overlapping matches. This is unlikely to change in the future.
932 ///
933 /// # Examples
934 ///
935 /// Basic usage:
936 ///
937 /// ```
938 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
939 ///
940 /// let ac = AhoCorasickBuilder::new()
941 /// .match_kind(MatchKind::Standard)
942 /// .build(&["foo", "bar", "baz"]);
943 /// assert!(ac.supports_overlapping());
944 ///
945 /// let ac = AhoCorasickBuilder::new()
946 /// .match_kind(MatchKind::LeftmostFirst)
947 /// .build(&["foo", "bar", "baz"]);
948 /// assert!(!ac.supports_overlapping());
949 /// ```
950 pub fn supports_overlapping(&self) -> bool {
951 self.match_kind.supports_overlapping()
952 }
953
954 /// Returns true if and only if this automaton supports stream searching.
955 ///
956 /// If this returns false and stream searching (or replacing) is attempted,
957 /// then it will result in a panic.
958 ///
959 /// Currently, only
960 /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
961 /// supports streaming. This may be expanded in the future.
962 ///
963 /// # Examples
964 ///
965 /// Basic usage:
966 ///
967 /// ```
968 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
969 ///
970 /// let ac = AhoCorasickBuilder::new()
971 /// .match_kind(MatchKind::Standard)
972 /// .build(&["foo", "bar", "baz"]);
973 /// assert!(ac.supports_stream());
974 ///
975 /// let ac = AhoCorasickBuilder::new()
976 /// .match_kind(MatchKind::LeftmostFirst)
977 /// .build(&["foo", "bar", "baz"]);
978 /// assert!(!ac.supports_stream());
979 /// ```
980 pub fn supports_stream(&self) -> bool {
981 self.match_kind.supports_stream()
982 }
983
984 /// Returns the approximate total amount of heap used by this automaton, in
985 /// units of bytes.
986 ///
987 /// # Examples
988 ///
989 /// This example shows the difference in heap usage between a few
990 /// configurations:
991 ///
992 /// ```ignore
993 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
994 ///
995 /// let ac = AhoCorasickBuilder::new()
996 /// .dfa(false) // default
997 /// .build(&["foo", "bar", "baz"]);
998 /// assert_eq!(10_336, ac.heap_bytes());
999 ///
1000 /// let ac = AhoCorasickBuilder::new()
1001 /// .dfa(false) // default
1002 /// .ascii_case_insensitive(true)
1003 /// .build(&["foo", "bar", "baz"]);
1004 /// assert_eq!(10_384, ac.heap_bytes());
1005 ///
1006 /// let ac = AhoCorasickBuilder::new()
1007 /// .dfa(true)
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -07001008 /// .ascii_case_insensitive(true)
1009 /// .build(&["foo", "bar", "baz"]);
1010 /// assert_eq!(1_248, ac.heap_bytes());
1011 /// ```
1012 pub fn heap_bytes(&self) -> usize {
1013 match self.imp {
1014 Imp::NFA(ref nfa) => nfa.heap_bytes(),
1015 Imp::DFA(ref dfa) => dfa.heap_bytes(),
1016 }
1017 }
1018}
1019
1020/// The internal implementation of Aho-Corasick, which is either an NFA or
1021/// a DFA. The NFA is slower but uses less memory. The DFA is faster but uses
1022/// more memory.
1023#[derive(Clone, Debug)]
1024enum Imp<S: StateID> {
1025 NFA(NFA<S>),
1026 DFA(DFA<S>),
1027}
1028
1029impl<S: StateID> Imp<S> {
1030 /// Returns the type of match semantics implemented by this automaton.
1031 fn match_kind(&self) -> &MatchKind {
1032 match *self {
1033 Imp::NFA(ref nfa) => nfa.match_kind(),
1034 Imp::DFA(ref dfa) => dfa.match_kind(),
1035 }
1036 }
1037
1038 /// Returns the identifier of the start state.
1039 fn start_state(&self) -> S {
1040 match *self {
1041 Imp::NFA(ref nfa) => nfa.start_state(),
1042 Imp::DFA(ref dfa) => dfa.start_state(),
1043 }
1044 }
1045
1046 /// The length, in bytes, of the longest pattern in this automaton. This
1047 /// information is useful for maintaining correct buffer sizes when
1048 /// searching on streams.
1049 fn max_pattern_len(&self) -> usize {
1050 match *self {
1051 Imp::NFA(ref nfa) => nfa.max_pattern_len(),
1052 Imp::DFA(ref dfa) => dfa.max_pattern_len(),
1053 }
1054 }
1055
1056 /// The total number of patterns added to this automaton. This includes
1057 /// patterns that may never match. The maximum matching pattern that can be
1058 /// reported is exactly one less than this number.
1059 fn pattern_count(&self) -> usize {
1060 match *self {
1061 Imp::NFA(ref nfa) => nfa.pattern_count(),
1062 Imp::DFA(ref dfa) => dfa.pattern_count(),
1063 }
1064 }
1065
Chih-Hung Hsieh2db690b2020-10-26 13:16:43 -07001066 /// Returns the prefilter object, if one exists, for the underlying
1067 /// automaton.
1068 fn prefilter(&self) -> Option<&dyn Prefilter> {
1069 match *self {
1070 Imp::NFA(ref nfa) => nfa.prefilter(),
1071 Imp::DFA(ref dfa) => dfa.prefilter(),
1072 }
1073 }
1074
1075 /// Returns true if and only if we should attempt to use a prefilter.
1076 fn use_prefilter(&self) -> bool {
1077 let p = match self.prefilter() {
1078 None => return false,
1079 Some(p) => p,
1080 };
1081 !p.looks_for_non_start_of_match()
1082 }
1083
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -07001084 #[inline(always)]
1085 fn overlapping_find_at(
1086 &self,
1087 prestate: &mut PrefilterState,
1088 haystack: &[u8],
1089 at: usize,
1090 state_id: &mut S,
1091 match_index: &mut usize,
1092 ) -> Option<Match> {
1093 match *self {
1094 Imp::NFA(ref nfa) => nfa.overlapping_find_at(
1095 prestate,
1096 haystack,
1097 at,
1098 state_id,
1099 match_index,
1100 ),
1101 Imp::DFA(ref dfa) => dfa.overlapping_find_at(
1102 prestate,
1103 haystack,
1104 at,
1105 state_id,
1106 match_index,
1107 ),
1108 }
1109 }
1110
1111 #[inline(always)]
1112 fn earliest_find_at(
1113 &self,
1114 prestate: &mut PrefilterState,
1115 haystack: &[u8],
1116 at: usize,
1117 state_id: &mut S,
1118 ) -> Option<Match> {
1119 match *self {
1120 Imp::NFA(ref nfa) => {
1121 nfa.earliest_find_at(prestate, haystack, at, state_id)
1122 }
1123 Imp::DFA(ref dfa) => {
1124 dfa.earliest_find_at(prestate, haystack, at, state_id)
1125 }
1126 }
1127 }
1128
1129 #[inline(always)]
1130 fn find_at_no_state(
1131 &self,
1132 prestate: &mut PrefilterState,
1133 haystack: &[u8],
1134 at: usize,
1135 ) -> Option<Match> {
1136 match *self {
1137 Imp::NFA(ref nfa) => nfa.find_at_no_state(prestate, haystack, at),
1138 Imp::DFA(ref dfa) => dfa.find_at_no_state(prestate, haystack, at),
1139 }
1140 }
1141}
1142
1143/// An iterator of non-overlapping matches in a particular haystack.
1144///
1145/// This iterator yields matches according to the
1146/// [`MatchKind`](enum.MatchKind.html)
1147/// used by this automaton.
1148///
1149/// This iterator is constructed via the
1150/// [`AhoCorasick::find_iter`](struct.AhoCorasick.html#method.find_iter)
1151/// method.
1152///
1153/// The type variable `S` refers to the representation used for state
1154/// identifiers. (By default, this is `usize`.)
1155///
1156/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1157///
1158/// The lifetime `'b` refers to the lifetime of the haystack being searched.
1159#[derive(Debug)]
Joel Galenson82068452021-05-19 14:19:20 -07001160pub struct FindIter<'a, 'b, S: StateID> {
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -07001161 fsm: &'a Imp<S>,
1162 prestate: PrefilterState,
1163 haystack: &'b [u8],
1164 pos: usize,
1165}
1166
1167impl<'a, 'b, S: StateID> FindIter<'a, 'b, S> {
1168 fn new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S> {
1169 let prestate = PrefilterState::new(ac.max_pattern_len());
1170 FindIter { fsm: &ac.imp, prestate, haystack, pos: 0 }
1171 }
1172}
1173
1174impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> {
1175 type Item = Match;
1176
1177 fn next(&mut self) -> Option<Match> {
1178 if self.pos > self.haystack.len() {
1179 return None;
1180 }
1181 let result = self.fsm.find_at_no_state(
1182 &mut self.prestate,
1183 self.haystack,
1184 self.pos,
1185 );
1186 let mat = match result {
1187 None => return None,
1188 Some(mat) => mat,
1189 };
1190 if mat.end() == self.pos {
1191 // If the automaton can match the empty string and if we found an
1192 // empty match, then we need to forcefully move the position.
1193 self.pos += 1;
1194 } else {
1195 self.pos = mat.end();
1196 }
1197 Some(mat)
1198 }
1199}
1200
1201/// An iterator of overlapping matches in a particular haystack.
1202///
1203/// This iterator will report all possible matches in a particular haystack,
1204/// even when the matches overlap.
1205///
1206/// This iterator is constructed via the
1207/// [`AhoCorasick::find_overlapping_iter`](struct.AhoCorasick.html#method.find_overlapping_iter)
1208/// method.
1209///
1210/// The type variable `S` refers to the representation used for state
1211/// identifiers. (By default, this is `usize`.)
1212///
1213/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1214///
1215/// The lifetime `'b` refers to the lifetime of the haystack being searched.
1216#[derive(Debug)]
Joel Galenson82068452021-05-19 14:19:20 -07001217pub struct FindOverlappingIter<'a, 'b, S: StateID> {
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -07001218 fsm: &'a Imp<S>,
1219 prestate: PrefilterState,
1220 haystack: &'b [u8],
1221 pos: usize,
1222 last_match_end: usize,
1223 state_id: S,
1224 match_index: usize,
1225}
1226
1227impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> {
1228 fn new(
1229 ac: &'a AhoCorasick<S>,
1230 haystack: &'b [u8],
1231 ) -> FindOverlappingIter<'a, 'b, S> {
1232 assert!(
1233 ac.supports_overlapping(),
1234 "automaton does not support overlapping searches"
1235 );
1236 let prestate = PrefilterState::new(ac.max_pattern_len());
1237 FindOverlappingIter {
1238 fsm: &ac.imp,
1239 prestate,
1240 haystack,
1241 pos: 0,
1242 last_match_end: 0,
1243 state_id: ac.imp.start_state(),
1244 match_index: 0,
1245 }
1246 }
1247}
1248
1249impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> {
1250 type Item = Match;
1251
1252 fn next(&mut self) -> Option<Match> {
1253 let result = self.fsm.overlapping_find_at(
1254 &mut self.prestate,
1255 self.haystack,
1256 self.pos,
1257 &mut self.state_id,
1258 &mut self.match_index,
1259 );
1260 match result {
1261 None => return None,
1262 Some(m) => {
1263 self.pos = m.end();
1264 Some(m)
1265 }
1266 }
1267 }
1268}
1269
1270/// An iterator that reports Aho-Corasick matches in a stream.
1271///
1272/// This iterator yields elements of type `io::Result<Match>`, where an error
1273/// is reported if there was a problem reading from the underlying stream.
1274/// The iterator terminates only when the underlying stream reaches `EOF`.
1275///
1276/// This iterator is constructed via the
1277/// [`AhoCorasick::stream_find_iter`](struct.AhoCorasick.html#method.stream_find_iter)
1278/// method.
1279///
1280/// The type variable `R` refers to the `io::Read` stream that is being read
1281/// from.
1282///
1283/// The type variable `S` refers to the representation used for state
1284/// identifiers. (By default, this is `usize`.)
1285///
1286/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1287#[derive(Debug)]
Joel Galenson82068452021-05-19 14:19:20 -07001288pub struct StreamFindIter<'a, R, S: StateID> {
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -07001289 it: StreamChunkIter<'a, R, S>,
1290}
1291
1292impl<'a, R: io::Read, S: StateID> StreamFindIter<'a, R, S> {
1293 fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S> {
1294 StreamFindIter { it: StreamChunkIter::new(ac, rdr) }
1295 }
1296}
1297
1298impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> {
1299 type Item = io::Result<Match>;
1300
1301 fn next(&mut self) -> Option<io::Result<Match>> {
1302 loop {
1303 match self.it.next() {
1304 None => return None,
1305 Some(Err(err)) => return Some(Err(err)),
1306 Some(Ok(StreamChunk::NonMatch { .. })) => {}
1307 Some(Ok(StreamChunk::Match { mat, .. })) => {
1308 return Some(Ok(mat));
1309 }
1310 }
1311 }
1312 }
1313}
1314
1315/// An iterator over chunks in an underlying reader. Each chunk either
1316/// corresponds to non-matching bytes or matching bytes, but all bytes from
1317/// the underlying reader are reported in sequence. There may be an arbitrary
1318/// number of non-matching chunks before seeing a matching chunk.
1319///
1320/// N.B. This does not actually implement Iterator because we need to borrow
1321/// from the underlying reader. But conceptually, it's still an iterator.
1322#[derive(Debug)]
Joel Galenson82068452021-05-19 14:19:20 -07001323struct StreamChunkIter<'a, R, S: StateID> {
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -07001324 /// The AC automaton.
1325 fsm: &'a Imp<S>,
1326 /// State associated with this automaton's prefilter. It is a heuristic
1327 /// for stopping the prefilter if it's deemed ineffective.
1328 prestate: PrefilterState,
1329 /// The source of bytes we read from.
1330 rdr: R,
1331 /// A fixed size buffer. This is what we actually search. There are some
1332 /// invariants around the buffer's size, namely, it must be big enough to
1333 /// contain the longest possible match.
1334 buf: Buffer,
1335 /// The ID of the FSM state we're currently in.
1336 state_id: S,
1337 /// The current position at which to start the next search in `buf`.
1338 search_pos: usize,
1339 /// The absolute position of `search_pos`, where `0` corresponds to the
1340 /// position of the first byte read from `rdr`.
1341 absolute_pos: usize,
1342 /// The ending position of the last StreamChunk that was returned to the
1343 /// caller. This position is used to determine whether we need to emit
1344 /// non-matching bytes before emitting a match.
1345 report_pos: usize,
1346 /// A match that should be reported on the next call.
1347 pending_match: Option<Match>,
1348 /// Enabled only when the automaton can match the empty string. When
1349 /// enabled, we need to execute one final search after consuming the
1350 /// reader to find the trailing empty match.
1351 has_empty_match_at_end: bool,
1352}
1353
1354/// A single chunk yielded by the stream chunk iterator.
1355///
1356/// The `'r` lifetime refers to the lifetime of the stream chunk iterator.
1357#[derive(Debug)]
1358enum StreamChunk<'r> {
1359 /// A chunk that does not contain any matches.
1360 NonMatch { bytes: &'r [u8], start: usize },
1361 /// A chunk that precisely contains a match.
1362 Match { bytes: &'r [u8], mat: Match },
1363}
1364
1365impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
1366 fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S> {
1367 assert!(
1368 ac.supports_stream(),
1369 "stream searching is only supported for Standard match semantics"
1370 );
1371
Chih-Hung Hsieh2db690b2020-10-26 13:16:43 -07001372 let prestate = if ac.imp.use_prefilter() {
1373 PrefilterState::new(ac.max_pattern_len())
1374 } else {
1375 PrefilterState::disabled()
1376 };
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -07001377 let buf = Buffer::new(ac.imp.max_pattern_len());
1378 let state_id = ac.imp.start_state();
1379 StreamChunkIter {
1380 fsm: &ac.imp,
1381 prestate,
1382 rdr,
1383 buf,
1384 state_id,
1385 absolute_pos: 0,
1386 report_pos: 0,
1387 search_pos: 0,
1388 pending_match: None,
1389 has_empty_match_at_end: ac.is_match(""),
1390 }
1391 }
1392
1393 fn next<'r>(&'r mut self) -> Option<io::Result<StreamChunk<'r>>> {
1394 loop {
1395 if let Some(mut mat) = self.pending_match.take() {
1396 let bytes = &self.buf.buffer()[mat.start()..mat.end()];
1397 self.report_pos = mat.end();
1398 mat = mat.increment(self.absolute_pos);
1399 return Some(Ok(StreamChunk::Match { bytes, mat }));
1400 }
1401 if self.search_pos >= self.buf.len() {
1402 if let Some(end) = self.unreported() {
1403 let bytes = &self.buf.buffer()[self.report_pos..end];
1404 let start = self.absolute_pos + self.report_pos;
1405 self.report_pos = end;
1406 return Some(Ok(StreamChunk::NonMatch { bytes, start }));
1407 }
1408 if self.buf.len() >= self.buf.min_buffer_len() {
1409 // This is the point at which we roll our buffer, which we
1410 // only do if our buffer has at least the minimum amount of
1411 // bytes in it. Before rolling, we update our various
1412 // positions to be consistent with the buffer after it has
1413 // been rolled.
1414
1415 self.report_pos -=
1416 self.buf.len() - self.buf.min_buffer_len();
1417 self.absolute_pos +=
1418 self.search_pos - self.buf.min_buffer_len();
1419 self.search_pos = self.buf.min_buffer_len();
1420 self.buf.roll();
1421 }
1422 match self.buf.fill(&mut self.rdr) {
1423 Err(err) => return Some(Err(err)),
1424 Ok(false) => {
1425 // We've hit EOF, but if there are still some
1426 // unreported bytes remaining, return them now.
1427 if self.report_pos < self.buf.len() {
1428 let bytes = &self.buf.buffer()[self.report_pos..];
1429 let start = self.absolute_pos + self.report_pos;
1430 self.report_pos = self.buf.len();
1431
1432 let chunk = StreamChunk::NonMatch { bytes, start };
1433 return Some(Ok(chunk));
1434 } else {
1435 // We've reported everything, but there might still
1436 // be a match at the very last position.
1437 if !self.has_empty_match_at_end {
1438 return None;
1439 }
1440 // fallthrough for another search to get trailing
1441 // empty matches
1442 self.has_empty_match_at_end = false;
1443 }
1444 }
1445 Ok(true) => {}
1446 }
1447 }
1448 let result = self.fsm.earliest_find_at(
1449 &mut self.prestate,
1450 self.buf.buffer(),
1451 self.search_pos,
1452 &mut self.state_id,
1453 );
1454 match result {
1455 None => {
1456 self.search_pos = self.buf.len();
1457 }
1458 Some(mat) => {
1459 self.state_id = self.fsm.start_state();
1460 if mat.end() == self.search_pos {
1461 // If the automaton can match the empty string and if
1462 // we found an empty match, then we need to forcefully
1463 // move the position.
1464 self.search_pos += 1;
1465 } else {
1466 self.search_pos = mat.end();
1467 }
1468 self.pending_match = Some(mat.clone());
1469 if self.report_pos < mat.start() {
1470 let bytes =
1471 &self.buf.buffer()[self.report_pos..mat.start()];
1472 let start = self.absolute_pos + self.report_pos;
1473 self.report_pos = mat.start();
1474
1475 let chunk = StreamChunk::NonMatch { bytes, start };
1476 return Some(Ok(chunk));
1477 }
1478 }
1479 }
1480 }
1481 }
1482
1483 fn unreported(&self) -> Option<usize> {
1484 let end = self.search_pos.saturating_sub(self.buf.min_buffer_len());
1485 if self.report_pos < end {
1486 Some(end)
1487 } else {
1488 None
1489 }
1490 }
1491}
1492
1493/// A builder for configuring an Aho-Corasick automaton.
1494#[derive(Clone, Debug)]
1495pub struct AhoCorasickBuilder {
1496 nfa_builder: nfa::Builder,
1497 dfa_builder: dfa::Builder,
1498 dfa: bool,
1499}
1500
1501impl Default for AhoCorasickBuilder {
1502 fn default() -> AhoCorasickBuilder {
1503 AhoCorasickBuilder::new()
1504 }
1505}
1506
1507impl AhoCorasickBuilder {
1508 /// Create a new builder for configuring an Aho-Corasick automaton.
1509 ///
1510 /// If you don't need fine grained configuration or aren't sure which knobs
1511 /// to set, try using
1512 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
1513 /// instead.
1514 pub fn new() -> AhoCorasickBuilder {
1515 AhoCorasickBuilder {
1516 nfa_builder: nfa::Builder::new(),
1517 dfa_builder: dfa::Builder::new(),
1518 dfa: false,
1519 }
1520 }
1521
1522 /// Build an Aho-Corasick automaton using the configuration set on this
1523 /// builder.
1524 ///
1525 /// A builder may be reused to create more automatons.
1526 ///
1527 /// This method will use the default for representing internal state
1528 /// identifiers, which is `usize`. This guarantees that building the
1529 /// automaton will succeed and is generally a good default, but can make
1530 /// the size of the automaton 2-8 times bigger than it needs to be,
1531 /// depending on your target platform.
1532 ///
1533 /// # Examples
1534 ///
1535 /// Basic usage:
1536 ///
1537 /// ```
1538 /// use aho_corasick::AhoCorasickBuilder;
1539 ///
1540 /// let patterns = &["foo", "bar", "baz"];
1541 /// let ac = AhoCorasickBuilder::new()
1542 /// .build(patterns);
1543 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1544 /// ```
1545 pub fn build<I, P>(&self, patterns: I) -> AhoCorasick
1546 where
1547 I: IntoIterator<Item = P>,
1548 P: AsRef<[u8]>,
1549 {
1550 // The builder only returns an error if the chosen state ID
1551 // representation is too small to fit all of the given patterns. In
1552 // this case, since we fix the representation to usize, it will always
1553 // work because it's impossible to overflow usize since the underlying
1554 // storage would OOM long before that happens.
1555 self.build_with_size::<usize, I, P>(patterns)
1556 .expect("usize state ID type should always work")
1557 }
1558
1559 /// Build an Aho-Corasick automaton using the configuration set on this
1560 /// builder with a specific state identifier representation. This only has
1561 /// an effect when the `dfa` option is enabled.
1562 ///
1563 /// Generally, the choices for a state identifier representation are
1564 /// `u8`, `u16`, `u32`, `u64` or `usize`, with `usize` being the default.
1565 /// The advantage of choosing a smaller state identifier representation
1566 /// is that the automaton produced will be smaller. This might be
1567 /// beneficial for just generally using less space, or might even allow it
1568 /// to fit more of the automaton in your CPU's cache, leading to overall
1569 /// better search performance.
1570 ///
1571 /// Unlike the standard `build` method, this can report an error if the
1572 /// state identifier representation cannot support the size of the
1573 /// automaton.
1574 ///
1575 /// Note that the state identifier representation is determined by the
1576 /// `S` type variable. This requires a type hint of some sort, either
1577 /// by specifying the return type or using the turbofish, e.g.,
1578 /// `build_with_size::<u16, _, _>(...)`.
1579 ///
1580 /// # Examples
1581 ///
1582 /// Basic usage:
1583 ///
1584 /// ```
1585 /// use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
1586 ///
1587 /// # fn example() -> Result<(), ::aho_corasick::Error> {
1588 /// let patterns = &["foo", "bar", "baz"];
1589 /// let ac: AhoCorasick<u8> = AhoCorasickBuilder::new()
1590 /// .build_with_size(patterns)?;
1591 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1592 /// # Ok(()) }; example().unwrap()
1593 /// ```
1594 ///
1595 /// Or alternatively, with turbofish:
1596 ///
1597 /// ```
1598 /// use aho_corasick::AhoCorasickBuilder;
1599 ///
1600 /// # fn example() -> Result<(), ::aho_corasick::Error> {
1601 /// let patterns = &["foo", "bar", "baz"];
1602 /// let ac = AhoCorasickBuilder::new()
1603 /// .build_with_size::<u8, _, _>(patterns)?;
1604 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1605 /// # Ok(()) }; example().unwrap()
1606 /// ```
1607 pub fn build_with_size<S, I, P>(
1608 &self,
1609 patterns: I,
1610 ) -> Result<AhoCorasick<S>>
1611 where
1612 S: StateID,
1613 I: IntoIterator<Item = P>,
1614 P: AsRef<[u8]>,
1615 {
1616 let nfa = self.nfa_builder.build(patterns)?;
1617 let match_kind = nfa.match_kind().clone();
1618 let imp = if self.dfa {
1619 let dfa = self.dfa_builder.build(&nfa)?;
1620 Imp::DFA(dfa)
1621 } else {
1622 Imp::NFA(nfa)
1623 };
1624 Ok(AhoCorasick { imp, match_kind })
1625 }
1626
1627 /// Automatically configure the settings on this builder according to the
1628 /// patterns that will be used to construct the automaton.
1629 ///
1630 /// The idea here is to balance space and time automatically. That is, when
1631 /// searching a small number of patterns, this will attempt to use the
1632 /// fastest possible configuration since the total space required will be
1633 /// small anyway. As the number of patterns grows, this will fall back to
1634 /// slower configurations that use less space.
1635 ///
1636 /// This is guaranteed to never set `match_kind`, but any other option may
1637 /// be overridden.
1638 ///
1639 /// # Examples
1640 ///
1641 /// Basic usage:
1642 ///
1643 /// ```
1644 /// use aho_corasick::AhoCorasickBuilder;
1645 ///
1646 /// let patterns = &["foo", "bar", "baz"];
1647 /// let ac = AhoCorasickBuilder::new()
1648 /// .auto_configure(patterns)
1649 /// .build(patterns);
1650 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1651 /// ```
1652 pub fn auto_configure<B: AsRef<[u8]>>(
1653 &mut self,
1654 patterns: &[B],
1655 ) -> &mut AhoCorasickBuilder {
1656 // N.B. Currently we only use the length of `patterns` to make a
1657 // decision here, and could therefore ask for an `ExactSizeIterator`
1658 // instead. But it's conceivable that we might adapt this to look at
1659 // the total number of bytes, which would requires a second pass.
1660 //
1661 // The logic here is fairly rudimentary at the moment, but probably
1662 // OK. The idea here is to use the fastest thing possible for a small
1663 // number of patterns. That is, a DFA with no byte classes, since byte
1664 // classes require an extra indirection for every byte searched. With a
1665 // moderate number of patterns, we still want a DFA, but save on both
1666 // space and compilation time by enabling byte classes. Finally, fall
1667 // back to the slower but smaller NFA.
1668 if patterns.len() <= 100 {
1669 // N.B. Using byte classes can actually be faster by improving
1670 // locality, but this only really applies for multi-megabyte
1671 // automata (i.e., automata that don't fit in your CPU's cache).
Joel Galenson82068452021-05-19 14:19:20 -07001672 self.dfa(true);
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -07001673 } else if patterns.len() <= 5000 {
1674 self.dfa(true);
1675 }
1676 self
1677 }
1678
1679 /// Set the desired match semantics.
1680 ///
1681 /// The default is `MatchKind::Standard`, which corresponds to the match
1682 /// semantics supported by the standard textbook description of the
1683 /// Aho-Corasick algorithm. Namely, matches are reported as soon as they
1684 /// are found. Moreover, this is the only way to get overlapping matches
1685 /// or do stream searching.
1686 ///
1687 /// The other kinds of match semantics that are supported are
1688 /// `MatchKind::LeftmostFirst` and `MatchKind::LeftmostLongest`. The former
1689 /// corresponds to the match you would get if you were to try to match
1690 /// each pattern at each position in the haystack in the same order that
1691 /// you give to the automaton. That is, it returns the leftmost match
1692 /// corresponding the earliest pattern given to the automaton. The latter
1693 /// corresponds to finding the longest possible match among all leftmost
1694 /// matches.
1695 ///
1696 /// For more details on match semantics, see the
1697 /// [documentation for `MatchKind`](enum.MatchKind.html).
1698 ///
1699 /// # Examples
1700 ///
1701 /// In these examples, we demonstrate the differences between match
1702 /// semantics for a particular set of patterns in a specific order:
1703 /// `b`, `abc`, `abcd`.
1704 ///
1705 /// Standard semantics:
1706 ///
1707 /// ```
1708 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1709 ///
1710 /// let patterns = &["b", "abc", "abcd"];
1711 /// let haystack = "abcd";
1712 ///
1713 /// let ac = AhoCorasickBuilder::new()
1714 /// .match_kind(MatchKind::Standard) // default, not necessary
1715 /// .build(patterns);
1716 /// let mat = ac.find(haystack).expect("should have a match");
1717 /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
1718 /// ```
1719 ///
1720 /// Leftmost-first semantics:
1721 ///
1722 /// ```
1723 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1724 ///
1725 /// let patterns = &["b", "abc", "abcd"];
1726 /// let haystack = "abcd";
1727 ///
1728 /// let ac = AhoCorasickBuilder::new()
1729 /// .match_kind(MatchKind::LeftmostFirst)
1730 /// .build(patterns);
1731 /// let mat = ac.find(haystack).expect("should have a match");
1732 /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
1733 /// ```
1734 ///
1735 /// Leftmost-longest semantics:
1736 ///
1737 /// ```
1738 /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1739 ///
1740 /// let patterns = &["b", "abc", "abcd"];
1741 /// let haystack = "abcd";
1742 ///
1743 /// let ac = AhoCorasickBuilder::new()
1744 /// .match_kind(MatchKind::LeftmostLongest)
1745 /// .build(patterns);
1746 /// let mat = ac.find(haystack).expect("should have a match");
1747 /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
1748 /// ```
1749 pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder {
1750 self.nfa_builder.match_kind(kind);
1751 self
1752 }
1753
1754 /// Enable anchored mode, which requires all matches to start at the
1755 /// first position in a haystack.
1756 ///
1757 /// This option is disabled by default.
1758 ///
1759 /// # Examples
1760 ///
1761 /// Basic usage:
1762 ///
1763 /// ```
1764 /// use aho_corasick::AhoCorasickBuilder;
1765 ///
1766 /// let patterns = &["foo", "bar"];
1767 /// let haystack = "foobar";
1768 ///
1769 /// let ac = AhoCorasickBuilder::new()
1770 /// .anchored(true)
1771 /// .build(patterns);
1772 /// assert_eq!(1, ac.find_iter(haystack).count());
1773 /// ```
1774 ///
1775 /// When searching for overlapping matches, all matches that start at
1776 /// the beginning of a haystack will be reported:
1777 ///
1778 /// ```
1779 /// use aho_corasick::AhoCorasickBuilder;
1780 ///
1781 /// let patterns = &["foo", "foofoo"];
1782 /// let haystack = "foofoo";
1783 ///
1784 /// let ac = AhoCorasickBuilder::new()
1785 /// .anchored(true)
1786 /// .build(patterns);
1787 /// assert_eq!(2, ac.find_overlapping_iter(haystack).count());
1788 /// // A non-anchored search would return 3 matches.
1789 /// ```
1790 pub fn anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1791 self.nfa_builder.anchored(yes);
1792 self
1793 }
1794
1795 /// Enable ASCII-aware case insensitive matching.
1796 ///
1797 /// When this option is enabled, searching will be performed without
1798 /// respect to case for ASCII letters (`a-z` and `A-Z`) only.
1799 ///
1800 /// Enabling this option does not change the search algorithm, but it may
1801 /// increase the size of the automaton.
1802 ///
1803 /// **NOTE:** In the future, support for full Unicode case insensitivity
1804 /// may be added, but ASCII case insensitivity is comparatively much
1805 /// simpler to add.
1806 ///
1807 /// # Examples
1808 ///
1809 /// Basic usage:
1810 ///
1811 /// ```
1812 /// use aho_corasick::AhoCorasickBuilder;
1813 ///
1814 /// let patterns = &["FOO", "bAr", "BaZ"];
1815 /// let haystack = "foo bar baz";
1816 ///
1817 /// let ac = AhoCorasickBuilder::new()
1818 /// .ascii_case_insensitive(true)
1819 /// .build(patterns);
1820 /// assert_eq!(3, ac.find_iter(haystack).count());
1821 /// ```
1822 pub fn ascii_case_insensitive(
1823 &mut self,
1824 yes: bool,
1825 ) -> &mut AhoCorasickBuilder {
1826 self.nfa_builder.ascii_case_insensitive(yes);
1827 self
1828 }
1829
1830 /// Set the limit on how many NFA states use a dense representation for
1831 /// their transitions.
1832 ///
1833 /// A dense representation uses more space, but supports faster access to
1834 /// transitions at search time. Thus, this setting permits the control of a
1835 /// space vs time trade off when using the NFA variant of Aho-Corasick.
1836 ///
1837 /// This limit is expressed in terms of the depth of a state, i.e., the
1838 /// number of transitions from the starting state of the NFA. The idea is
1839 /// that most of the time searching will be spent near the starting state
1840 /// of the automaton, so states near the start state should use a dense
1841 /// representation. States further away from the start state would then use
1842 /// a sparse representation, which uses less space but is slower to access
1843 /// transitions at search time.
1844 ///
1845 /// By default, this is set to a low but non-zero number.
1846 ///
1847 /// This setting has no effect if the `dfa` option is enabled.
1848 pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder {
1849 self.nfa_builder.dense_depth(depth);
1850 self
1851 }
1852
1853 /// Compile the standard Aho-Corasick automaton into a deterministic finite
1854 /// automaton (DFA).
1855 ///
1856 /// When this is disabled (which is the default), then a non-deterministic
1857 /// finite automaton (NFA) is used instead.
1858 ///
1859 /// The main benefit to a DFA is that it can execute searches more quickly
Chih-Hung Hsieh2db690b2020-10-26 13:16:43 -07001860 /// than a NFA (perhaps 2-4 times as fast). The main drawback is that the
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -07001861 /// DFA uses more space and can take much longer to build.
1862 ///
1863 /// Enabling this option does not change the time complexity for
1864 /// constructing the Aho-Corasick automaton (which is `O(p)` where
1865 /// `p` is the total number of patterns being compiled). Enabling this
1866 /// option does however reduce the time complexity of non-overlapping
1867 /// searches from `O(n + p)` to `O(n)`, where `n` is the length of the
1868 /// haystack.
1869 ///
1870 /// In general, it's a good idea to enable this if you're searching a
1871 /// small number of fairly short patterns (~1000), or if you want the
1872 /// fastest possible search without regard to compilation time or space
1873 /// usage.
1874 pub fn dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1875 self.dfa = yes;
1876 self
1877 }
1878
1879 /// Enable heuristic prefilter optimizations.
1880 ///
1881 /// When enabled, searching will attempt to quickly skip to match
1882 /// candidates using specialized literal search routines. A prefilter
1883 /// cannot always be used, and is generally treated as a heuristic. It
1884 /// can be useful to disable this if the prefilter is observed to be
1885 /// sub-optimal for a particular workload.
1886 ///
1887 /// This is enabled by default.
1888 pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1889 self.nfa_builder.prefilter(yes);
1890 self
1891 }
1892
1893 /// Shrink the size of the transition alphabet by mapping bytes to their
1894 /// equivalence classes. This only has an effect when the `dfa` option is
1895 /// enabled.
1896 ///
1897 /// When enabled, each a DFA will use a map from all possible bytes
1898 /// to their corresponding equivalence class. Each equivalence class
1899 /// represents a set of bytes that does not discriminate between a match
1900 /// and a non-match in the DFA. For example, the patterns `bar` and `baz`
1901 /// have at least five equivalence classes: singleton sets of `b`, `a`, `r`
1902 /// and `z`, and a final set that contains every other byte.
1903 ///
1904 /// The advantage of this map is that the size of the transition table can
1905 /// be reduced drastically from `#states * 256 * sizeof(id)` to
1906 /// `#states * k * sizeof(id)` where `k` is the number of equivalence
1907 /// classes. As a result, total space usage can decrease substantially.
1908 /// Moreover, since a smaller alphabet is used, compilation becomes faster
1909 /// as well.
1910 ///
1911 /// The disadvantage of this map is that every byte searched must be
1912 /// passed through this map before it can be used to determine the next
1913 /// transition. This has a small match time performance cost. However, if
1914 /// the DFA is otherwise very large without byte classes, then using byte
1915 /// classes can greatly improve memory locality and thus lead to better
1916 /// overall performance.
1917 ///
1918 /// This option is enabled by default.
Joel Galenson82068452021-05-19 14:19:20 -07001919 #[deprecated(
1920 since = "0.7.16",
1921 note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57"
1922 )]
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -07001923 pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1924 self.dfa_builder.byte_classes(yes);
1925 self
1926 }
1927
1928 /// Premultiply state identifiers in the transition table. This only has
1929 /// an effect when the `dfa` option is enabled.
1930 ///
1931 /// When enabled, state identifiers are premultiplied to point to their
1932 /// corresponding row in the transition table. That is, given the `i`th
1933 /// state, its corresponding premultiplied identifier is `i * k` where `k`
1934 /// is the alphabet size of the automaton. (The alphabet size is at most
1935 /// 256, but is in practice smaller if byte classes is enabled.)
1936 ///
1937 /// When state identifiers are not premultiplied, then the identifier of
1938 /// the `i`th state is `i`.
1939 ///
1940 /// The advantage of premultiplying state identifiers is that is saves a
1941 /// multiplication instruction per byte when searching with a DFA. This has
1942 /// been observed to lead to a 20% performance benefit in micro-benchmarks.
1943 ///
1944 /// The primary disadvantage of premultiplying state identifiers is
1945 /// that they require a larger integer size to represent. For example,
1946 /// if the DFA has 200 states, then its premultiplied form requires 16
1947 /// bits to represent every possible state identifier, where as its
1948 /// non-premultiplied form only requires 8 bits.
1949 ///
1950 /// This option is enabled by default.
Joel Galenson82068452021-05-19 14:19:20 -07001951 #[deprecated(
1952 since = "0.7.16",
1953 note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57"
1954 )]
Chih-Hung Hsieh0a0edd52020-04-07 14:24:00 -07001955 pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1956 self.dfa_builder.premultiply(yes);
1957 self
1958 }
1959}
1960
1961/// A knob for controlling the match semantics of an Aho-Corasick automaton.
1962///
1963/// There are two generally different ways that Aho-Corasick automatons can
1964/// report matches. The first way is the "standard" approach that results from
1965/// implementing most textbook explanations of Aho-Corasick. The second way is
1966/// to report only the leftmost non-overlapping matches. The leftmost approach
1967/// is in turn split into two different ways of resolving ambiguous matches:
1968/// leftmost-first and leftmost-longest.
1969///
1970/// The `Standard` match kind is the default and is the only one that supports
1971/// overlapping matches and stream searching. (Trying to find overlapping
1972/// or streaming matches using leftmost match semantics will result in a
1973/// panic.) The `Standard` match kind will report matches as they are seen.
1974/// When searching for overlapping matches, then all possible matches are
1975/// reported. When searching for non-overlapping matches, the first match seen
1976/// is reported. For example, for non-overlapping matches, given the patterns
1977/// `abcd` and `b` and the subject string `abcdef`, only a match for `b` is
1978/// reported since it is detected first. The `abcd` match is never reported
1979/// since it overlaps with the `b` match.
1980///
1981/// In contrast, the leftmost match kind always prefers the leftmost match
1982/// among all possible matches. Given the same example as above with `abcd` and
1983/// `b` as patterns and `abcdef` as the subject string, the leftmost match is
1984/// `abcd` since it begins before the `b` match, even though the `b` match is
1985/// detected before the `abcd` match. In this case, the `b` match is not
1986/// reported at all since it overlaps with the `abcd` match.
1987///
1988/// The difference between leftmost-first and leftmost-longest is in how they
1989/// resolve ambiguous matches when there are multiple leftmost matches to
1990/// choose from. Leftmost-first always chooses the pattern that was provided
1991/// earliest, where as leftmost-longest always chooses the longest matching
1992/// pattern. For example, given the patterns `a` and `ab` and the subject
1993/// string `ab`, the leftmost-first match is `a` but the leftmost-longest match
1994/// is `ab`. Conversely, if the patterns were given in reverse order, i.e.,
1995/// `ab` and `a`, then both the leftmost-first and leftmost-longest matches
1996/// would be `ab`. Stated differently, the leftmost-first match depends on the
1997/// order in which the patterns were given to the Aho-Corasick automaton.
1998/// Because of that, when leftmost-first matching is used, if a pattern `A`
1999/// that appears before a pattern `B` is a prefix of `B`, then it is impossible
2000/// to ever observe a match of `B`.
2001///
2002/// If you're not sure which match kind to pick, then stick with the standard
2003/// kind, which is the default. In particular, if you need overlapping or
2004/// streaming matches, then you _must_ use the standard kind. The leftmost
2005/// kinds are useful in specific circumstances. For example, leftmost-first can
2006/// be very useful as a way to implement match priority based on the order of
2007/// patterns given and leftmost-longest can be useful for dictionary searching
2008/// such that only the longest matching words are reported.
2009///
2010/// # Relationship with regular expression alternations
2011///
2012/// Understanding match semantics can be a little tricky, and one easy way
2013/// to conceptualize non-overlapping matches from an Aho-Corasick automaton
2014/// is to think about them as a simple alternation of literals in a regular
2015/// expression. For example, let's say we wanted to match the strings
2016/// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It
2017/// turns out that regular expression engines have two different ways of
2018/// matching this alternation. The first way, leftmost-longest, is commonly
2019/// found in POSIX compatible implementations of regular expressions (such as
2020/// `grep`). The second way, leftmost-first, is commonly found in backtracking
2021/// implementations such as Perl. (Some regex engines, such as RE2 and Rust's
2022/// regex engine do not use backtracking, but still implement leftmost-first
2023/// semantics in an effort to match the behavior of dominant backtracking
2024/// regex engines such as those found in Perl, Ruby, Python, Javascript and
2025/// PHP.)
2026///
2027/// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex
2028/// will match `Samwise` because it is the longest possible match, but a
2029/// Perl-like regex will match `Sam` since it appears earlier in the
2030/// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine
2031/// will never match `Samwise` since `Sam` will always have higher priority.
2032/// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to
2033/// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is
2034/// still longest match, but it also appears earlier than `Sam`.
2035///
2036/// The "standard" match semantics of Aho-Corasick generally don't correspond
2037/// to the match semantics of any large group of regex implementations, so
2038/// there's no direct analogy that can be made here. Standard match semantics
2039/// are generally useful for overlapping matches, or if you just want to see
2040/// matches as they are detected.
2041///
2042/// The main conclusion to draw from this section is that the match semantics
2043/// can be tweaked to precisely match either Perl-like regex alternations or
2044/// POSIX regex alternations.
2045#[derive(Clone, Copy, Debug, Eq, PartialEq)]
2046pub enum MatchKind {
2047 /// Use standard match semantics, which support overlapping matches. When
2048 /// used with non-overlapping matches, matches are reported as they are
2049 /// seen.
2050 Standard,
2051 /// Use leftmost-first match semantics, which reports leftmost matches.
2052 /// When there are multiple possible leftmost matches, the match
2053 /// corresponding to the pattern that appeared earlier when constructing
2054 /// the automaton is reported.
2055 ///
2056 /// This does **not** support overlapping matches or stream searching. If
2057 /// this match kind is used, attempting to find overlapping matches or
2058 /// stream matches will panic.
2059 LeftmostFirst,
2060 /// Use leftmost-longest match semantics, which reports leftmost matches.
2061 /// When there are multiple possible leftmost matches, the longest match
2062 /// is chosen.
2063 ///
2064 /// This does **not** support overlapping matches or stream searching. If
2065 /// this match kind is used, attempting to find overlapping matches or
2066 /// stream matches will panic.
2067 LeftmostLongest,
2068 /// Hints that destructuring should not be exhaustive.
2069 ///
2070 /// This enum may grow additional variants, so this makes sure clients
2071 /// don't count on exhaustive matching. (Otherwise, adding a new variant
2072 /// could break existing code.)
2073 #[doc(hidden)]
2074 __Nonexhaustive,
2075}
2076
2077/// The default match kind is `MatchKind::Standard`.
2078impl Default for MatchKind {
2079 fn default() -> MatchKind {
2080 MatchKind::Standard
2081 }
2082}
2083
2084impl MatchKind {
2085 fn supports_overlapping(&self) -> bool {
2086 self.is_standard()
2087 }
2088
2089 fn supports_stream(&self) -> bool {
2090 // TODO: It may be possible to support this. It's hard.
2091 //
2092 // See: https://github.com/rust-lang/regex/issues/425#issuecomment-471367838
2093 self.is_standard()
2094 }
2095
2096 pub(crate) fn is_standard(&self) -> bool {
2097 *self == MatchKind::Standard
2098 }
2099
2100 pub(crate) fn is_leftmost(&self) -> bool {
2101 *self == MatchKind::LeftmostFirst
2102 || *self == MatchKind::LeftmostLongest
2103 }
2104
2105 pub(crate) fn is_leftmost_first(&self) -> bool {
2106 *self == MatchKind::LeftmostFirst
2107 }
2108
2109 /// Convert this match kind into a packed match kind. If this match kind
2110 /// corresponds to standard semantics, then this returns None, since
2111 /// packed searching does not support standard semantics.
2112 pub(crate) fn as_packed(&self) -> Option<packed::MatchKind> {
2113 match *self {
2114 MatchKind::Standard => None,
2115 MatchKind::LeftmostFirst => Some(packed::MatchKind::LeftmostFirst),
2116 MatchKind::LeftmostLongest => {
2117 Some(packed::MatchKind::LeftmostLongest)
2118 }
2119 MatchKind::__Nonexhaustive => unreachable!(),
2120 }
2121 }
2122}
2123
2124#[cfg(test)]
2125mod tests {
2126 use super::*;
2127
2128 #[test]
2129 fn oibits() {
2130 use std::panic::{RefUnwindSafe, UnwindSafe};
2131
2132 fn assert_send<T: Send>() {}
2133 fn assert_sync<T: Sync>() {}
2134 fn assert_unwind_safe<T: RefUnwindSafe + UnwindSafe>() {}
2135
2136 assert_send::<AhoCorasick>();
2137 assert_sync::<AhoCorasick>();
2138 assert_unwind_safe::<AhoCorasick>();
2139 assert_send::<AhoCorasickBuilder>();
2140 assert_sync::<AhoCorasickBuilder>();
2141 assert_unwind_safe::<AhoCorasickBuilder>();
2142 }
2143}