Upgrade rust/crates/regex to 1.5.4 Test: make Change-Id: I0eab39246dc2aea41a62c15661e350b490f06c1d

commit: 3874808a33005b78d4dcbc84fc41afe856c2904c [log] [tgz]
author: Joel Galenson <jgalenson@google.com> Wed May 19 16:51:51 2021 -0700
committer: Joel Galenson <jgalenson@google.com> Wed May 19 16:51:51 2021 -0700
tree: 02c0b3035a3cf71f10f04c344fb3e636a1090409
parent: ffb6030a530f824e597fae242b0c77c0743fc545 [diff] [blame]
diff --git a/src/literal/imp.rs b/src/literal/imp.rs
index e4d04ed..82f050a 100644
--- a/src/literal/imp.rs
+++ b/src/literal/imp.rs

@@ -1,11 +1,8 @@
-use std::cmp;
 use std::mem;
 
 use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
-use memchr::{memchr, memchr2, memchr3};
-use syntax::hir::literal::{Literal, Literals};
-
-use freqs::BYTE_FREQUENCIES;
+use memchr::{memchr, memchr2, memchr3, memmem};
+use regex_syntax::hir::literal::{Literal, Literals};
 
 /// A prefix extracted from a compiled regular expression.
 ///
@@ -15,8 +12,8 @@
 #[derive(Clone, Debug)]
 pub struct LiteralSearcher {
     complete: bool,
-    lcp: FreqyPacked,
-    lcs: FreqyPacked,
+    lcp: Memmem,
+    lcs: Memmem,
     matcher: Matcher,
 }
 
@@ -26,10 +23,8 @@
     Empty,
     /// A set of four or more single byte literals.
     Bytes(SingleByteSet),
-    /// A single substring, find using memchr and frequency analysis.
-    FreqyPacked(FreqyPacked),
-    /// A single substring, find using Boyer-Moore.
-    BoyerMoore(BoyerMooreSearch),
+    /// A single substring, using vector accelerated routines when available.
+    Memmem(Memmem),
     /// An Aho-Corasick automaton.
     AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
     /// A packed multiple substring searcher, using SIMD.
@@ -63,8 +58,8 @@
         let complete = lits.all_complete();
         LiteralSearcher {
             complete: complete,
-            lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()),
-            lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()),
+            lcp: Memmem::new(lits.longest_common_prefix()),
+            lcs: Memmem::new(lits.longest_common_suffix()),
             matcher: matcher,
         }
     }
@@ -86,8 +81,7 @@
         match self.matcher {
             Empty => Some((0, 0)),
             Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
-            FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
-            BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
+            Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
             AC { ref ac, .. } => {
                 ac.find(haystack).map(|m| (m.start(), m.end()))
             }
@@ -124,24 +118,23 @@
     }
 
     /// Returns an iterator over all literals to be matched.
-    pub fn iter(&self) -> LiteralIter {
+    pub fn iter(&self) -> LiteralIter<'_> {
         match self.matcher {
             Matcher::Empty => LiteralIter::Empty,
             Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
-            Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat),
-            Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
+            Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
             Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
             Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
         }
     }
 
     /// Returns a matcher for the longest common prefix of this matcher.
-    pub fn lcp(&self) -> &FreqyPacked {
+    pub fn lcp(&self) -> &Memmem {
         &self.lcp
     }
 
     /// Returns a matcher for the longest common suffix of this matcher.
-    pub fn lcs(&self) -> &FreqyPacked {
+    pub fn lcs(&self) -> &Memmem {
         &self.lcs
     }
 
@@ -156,8 +149,7 @@
         match self.matcher {
             Empty => 0,
             Bytes(ref sset) => sset.dense.len(),
-            FreqyPacked(_) => 1,
-            BoyerMoore(_) => 1,
+            Memmem(_) => 1,
             AC { ref ac, .. } => ac.pattern_count(),
             Packed { ref lits, .. } => lits.len(),
         }
@@ -169,8 +161,7 @@
         match self.matcher {
             Empty => 0,
             Bytes(ref sset) => sset.approximate_size(),
-            FreqyPacked(ref single) => single.approximate_size(),
-            BoyerMoore(ref single) => single.approximate_size(),
+            Memmem(ref single) => single.approximate_size(),
             AC { ref ac, .. } => ac.heap_bytes(),
             Packed { ref s, .. } => s.heap_bytes(),
         }
@@ -205,12 +196,7 @@
             return Matcher::Bytes(sset);
         }
         if lits.literals().len() == 1 {
-            let lit = lits.literals()[0].to_vec();
-            if BoyerMooreSearch::should_use(lit.as_slice()) {
-                return Matcher::BoyerMoore(BoyerMooreSearch::new(lit));
-            } else {
-                return Matcher::FreqyPacked(FreqyPacked::new(lit));
-            }
+            return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
         }
 
         let pats = lits.literals().to_owned();
@@ -367,116 +353,27 @@
     }
 }
 
-/// Provides an implementation of fast subtring search using frequency
-/// analysis.
+/// A simple wrapper around the memchr crate's memmem implementation.
 ///
-/// memchr is so fast that we do everything we can to keep the loop in memchr
-/// for as long as possible. The easiest way to do this is to intelligently
-/// pick the byte to send to memchr. The best byte is the byte that occurs
-/// least frequently in the haystack. Since doing frequency analysis on the
-/// haystack is far too expensive, we compute a set of fixed frequencies up
-/// front and hard code them in src/freqs.rs. Frequency analysis is done via
-/// scripts/frequencies.py.
+/// The API this exposes mirrors the API of previous substring searchers that
+/// this supplanted.
 #[derive(Clone, Debug)]
-pub struct FreqyPacked {
-    /// The pattern.
-    pat: Vec<u8>,
-    /// The number of Unicode characters in the pattern. This is useful for
-    /// determining the effective length of a pattern when deciding which
-    /// optimizations to perform. A trailing incomplete UTF-8 sequence counts
-    /// as one character.
+pub struct Memmem {
+    finder: memmem::Finder<'static>,
     char_len: usize,
-    /// The rarest byte in the pattern, according to pre-computed frequency
-    /// analysis.
-    rare1: u8,
-    /// The offset of the rarest byte in `pat`.
-    rare1i: usize,
-    /// The second rarest byte in the pattern, according to pre-computed
-    /// frequency analysis. (This may be equivalent to the rarest byte.)
-    ///
-    /// The second rarest byte is used as a type of guard for quickly detecting
-    /// a mismatch after memchr locates an instance of the rarest byte. This
-    /// is a hedge against pathological cases where the pre-computed frequency
-    /// analysis may be off. (But of course, does not prevent *all*
-    /// pathological cases.)
-    rare2: u8,
-    /// The offset of the second rarest byte in `pat`.
-    rare2i: usize,
 }
 
-impl FreqyPacked {
-    fn new(pat: Vec<u8>) -> FreqyPacked {
-        if pat.is_empty() {
-            return FreqyPacked::empty();
-        }
-
-        // Find the rarest two bytes. Try to make them distinct (but it's not
-        // required).
-        let mut rare1 = pat[0];
-        let mut rare2 = pat[0];
-        for b in pat[1..].iter().cloned() {
-            if freq_rank(b) < freq_rank(rare1) {
-                rare1 = b;
-            }
-        }
-        for &b in &pat {
-            if rare1 == rare2 {
-                rare2 = b
-            } else if b != rare1 && freq_rank(b) < freq_rank(rare2) {
-                rare2 = b;
-            }
-        }
-
-        // And find the offsets of their last occurrences.
-        let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap();
-        let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap();
-
-        let char_len = char_len_lossy(&pat);
-        FreqyPacked {
-            pat: pat,
-            char_len: char_len,
-            rare1: rare1,
-            rare1i: rare1i,
-            rare2: rare2,
-            rare2i: rare2i,
-        }
-    }
-
-    fn empty() -> FreqyPacked {
-        FreqyPacked {
-            pat: vec![],
-            char_len: 0,
-            rare1: 0,
-            rare1i: 0,
-            rare2: 0,
-            rare2i: 0,
+impl Memmem {
+    fn new(pat: &[u8]) -> Memmem {
+        Memmem {
+            finder: memmem::Finder::new(pat).into_owned(),
+            char_len: char_len_lossy(pat),
         }
     }
 
     #[cfg_attr(feature = "perf-inline", inline(always))]
     pub fn find(&self, haystack: &[u8]) -> Option<usize> {
-        let pat = &*self.pat;
-        if haystack.len() < pat.len() || pat.is_empty() {
-            return None;
-        }
-        let mut i = self.rare1i;
-        while i < haystack.len() {
-            i += match memchr(self.rare1, &haystack[i..]) {
-                None => return None,
-                Some(i) => i,
-            };
-            let start = i - self.rare1i;
-            let end = start + pat.len();
-            if end > haystack.len() {
-                return None;
-            }
-            let aligned = &haystack[start..end];
-            if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat {
-                return Some(start);
-            }
-            i += 1;
-        }
-        None
+        self.finder.find(haystack)
     }
 
     #[cfg_attr(feature = "perf-inline", inline(always))]
@@ -484,11 +381,11 @@
         if text.len() < self.len() {
             return false;
         }
-        text[text.len() - self.len()..] == *self.pat
+        &text[text.len() - self.len()..] == self.finder.needle()
     }
 
     pub fn len(&self) -> usize {
-        self.pat.len()
+        self.finder.needle().len()
     }
 
     pub fn char_len(&self) -> usize {
@@ -496,627 +393,10 @@
     }
 
     fn approximate_size(&self) -> usize {
-        self.pat.len() * mem::size_of::<u8>()
+        self.finder.needle().len() * mem::size_of::<u8>()
     }
 }
 
 fn char_len_lossy(bytes: &[u8]) -> usize {
     String::from_utf8_lossy(bytes).chars().count()
 }
-
-/// An implementation of Tuned Boyer-Moore as laid out by
-/// Andrew Hume and Daniel Sunday in "Fast String Searching".
-/// O(n) in the size of the input.
-///
-/// Fast string searching algorithms come in many variations,
-/// but they can generally be described in terms of three main
-/// components.
-///
-/// The skip loop is where the string searcher wants to spend
-/// as much time as possible. Exactly which character in the
-/// pattern the skip loop examines varies from algorithm to
-/// algorithm, but in the simplest case this loop repeated
-/// looks at the last character in the pattern and jumps
-/// forward in the input if it is not in the pattern.
-/// Robert Boyer and J Moore called this the "fast" loop in
-/// their original paper.
-///
-/// The match loop is responsible for actually examining the
-/// whole potentially matching substring. In order to fail
-/// faster, the match loop sometimes has a guard test attached.
-/// The guard test uses frequency analysis of the different
-/// characters in the pattern to choose the least frequency
-/// occurring character and use it to find match failures
-/// as quickly as possible.
-///
-/// The shift rule governs how the algorithm will shuffle its
-/// test window in the event of a failure during the match loop.
-/// Certain shift rules allow the worst-case run time of the
-/// algorithm to be shown to be O(n) in the size of the input
-/// rather than O(nm) in the size of the input and the size
-/// of the pattern (as naive Boyer-Moore is).
-///
-/// "Fast String Searching", in addition to presenting a tuned
-/// algorithm, provides a comprehensive taxonomy of the many
-/// different flavors of string searchers. Under that taxonomy
-/// TBM, the algorithm implemented here, uses an unrolled fast
-/// skip loop with memchr fallback, a forward match loop with guard,
-/// and the mini Sunday's delta shift rule. To unpack that you'll have to
-/// read the paper.
-#[derive(Clone, Debug)]
-pub struct BoyerMooreSearch {
-    /// The pattern we are going to look for in the haystack.
-    pattern: Vec<u8>,
-
-    /// The skip table for the skip loop.
-    ///
-    /// Maps the character at the end of the input
-    /// to a shift.
-    skip_table: Vec<usize>,
-
-    /// The guard character (least frequently occurring char).
-    guard: u8,
-    /// The reverse-index of the guard character in the pattern.
-    guard_reverse_idx: usize,
-
-    /// Daniel Sunday's mini generalized delta2 shift table.
-    ///
-    /// We use a skip loop, so we only have to provide a shift
-    /// for the skip char (last char). This is why it is a mini
-    /// shift rule.
-    md2_shift: usize,
-}
-
-impl BoyerMooreSearch {
-    /// Create a new string searcher, performing whatever
-    /// compilation steps are required.
-    fn new(pattern: Vec<u8>) -> Self {
-        debug_assert!(!pattern.is_empty());
-
-        let (g, gi) = Self::select_guard(pattern.as_slice());
-        let skip_table = Self::compile_skip_table(pattern.as_slice());
-        let md2_shift = Self::compile_md2_shift(pattern.as_slice());
-        BoyerMooreSearch {
-            pattern: pattern,
-            skip_table: skip_table,
-            guard: g,
-            guard_reverse_idx: gi,
-            md2_shift: md2_shift,
-        }
-    }
-
-    /// Find the pattern in `haystack`, returning the offset
-    /// of the start of the first occurrence of the pattern
-    /// in `haystack`.
-    #[inline]
-    fn find(&self, haystack: &[u8]) -> Option<usize> {
-        if haystack.len() < self.pattern.len() {
-            return None;
-        }
-
-        let mut window_end = self.pattern.len() - 1;
-
-        // Inspired by the grep source. It is a way
-        // to do correct loop unrolling without having to place
-        // a crashpad of terminating charicters at the end in
-        // the way described in the Fast String Searching paper.
-        const NUM_UNROLL: usize = 10;
-        // 1 for the initial position, and 1 for the md2 shift
-        let short_circut = (NUM_UNROLL + 2) * self.pattern.len();
-
-        if haystack.len() > short_circut {
-            // just 1 for the md2 shift
-            let backstop =
-                haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len());
-            loop {
-                window_end =
-                    match self.skip_loop(haystack, window_end, backstop) {
-                        Some(i) => i,
-                        None => return None,
-                    };
-                if window_end >= backstop {
-                    break;
-                }
-
-                if self.check_match(haystack, window_end) {
-                    return Some(window_end - (self.pattern.len() - 1));
-                } else {
-                    let skip = self.skip_table[haystack[window_end] as usize];
-                    window_end +=
-                        if skip == 0 { self.md2_shift } else { skip };
-                    continue;
-                }
-            }
-        }
-
-        // now process the input after the backstop
-        while window_end < haystack.len() {
-            let mut skip = self.skip_table[haystack[window_end] as usize];
-            if skip == 0 {
-                if self.check_match(haystack, window_end) {
-                    return Some(window_end - (self.pattern.len() - 1));
-                } else {
-                    skip = self.md2_shift;
-                }
-            }
-            window_end += skip;
-        }
-
-        None
-    }
-
-    fn len(&self) -> usize {
-        return self.pattern.len();
-    }
-
-    /// The key heuristic behind which the BoyerMooreSearch lives.
-    ///
-    /// See `rust-lang/regex/issues/408`.
-    ///
-    /// Tuned Boyer-Moore is actually pretty slow! It turns out a handrolled
-    /// platform-specific memchr routine with a bit of frequency
-    /// analysis sprinkled on top actually wins most of the time.
-    /// However, there are a few cases where Tuned Boyer-Moore still
-    /// wins.
-    ///
-    /// If the haystack is random, frequency analysis doesn't help us,
-    /// so Boyer-Moore will win for sufficiently large needles.
-    /// Unfortunately, there is no obvious way to determine this
-    /// ahead of time.
-    ///
-    /// If the pattern itself consists of very common characters,
-    /// frequency analysis won't get us anywhere. The most extreme
-    /// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately,
-    /// this case is wholly determined by the pattern, so we can actually
-    /// implement the heuristic.
-    ///
-    /// A third case is if the pattern is sufficiently long. The idea
-    /// here is that once the pattern gets long enough the Tuned
-    /// Boyer-Moore skip loop will start making strides long enough
-    /// to beat the asm deep magic that is memchr.
-    fn should_use(pattern: &[u8]) -> bool {
-        // The minimum pattern length required to use TBM.
-        const MIN_LEN: usize = 9;
-        // The minimum frequency rank (lower is rarer) that every byte in the
-        // pattern must have in order to use TBM. That is, if the pattern
-        // contains _any_ byte with a lower rank, then TBM won't be used.
-        const MIN_CUTOFF: usize = 150;
-        // The maximum frequency rank for any byte.
-        const MAX_CUTOFF: usize = 255;
-        // The scaling factor used to determine the actual cutoff frequency
-        // to use (keeping in mind that the minimum frequency rank is bounded
-        // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
-        // likely to be used as the pattern grows longer. That is, longer
-        // patterns permit somewhat less frequent bytes than shorter patterns,
-        // under the assumption that TBM gets better as the pattern gets
-        // longer.
-        const LEN_CUTOFF_PROPORTION: usize = 4;
-
-        let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION);
-        let cutoff = cmp::max(
-            MIN_CUTOFF,
-            MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank),
-        );
-        // The pattern must be long enough to be worthwhile. e.g., memchr will
-        // be faster on `e` because it is short even though e is quite common.
-        pattern.len() > MIN_LEN
-            // all the bytes must be more common than the cutoff.
-            && pattern.iter().all(|c| freq_rank(*c) >= cutoff)
-    }
-
-    /// Check to see if there is a match at the given position
-    #[inline]
-    fn check_match(&self, haystack: &[u8], window_end: usize) -> bool {
-        // guard test
-        if haystack[window_end - self.guard_reverse_idx] != self.guard {
-            return false;
-        }
-
-        // match loop
-        let window_start = window_end - (self.pattern.len() - 1);
-        for i in 0..self.pattern.len() {
-            if self.pattern[i] != haystack[window_start + i] {
-                return false;
-            }
-        }
-
-        true
-    }
-
-    /// Skip forward according to the shift table.
-    ///
-    /// Returns the offset of the next occurrence
-    /// of the last char in the pattern, or the none
-    /// if it never reappears. If `skip_loop` hits the backstop
-    /// it will leave early.
-    #[inline]
-    fn skip_loop(
-        &self,
-        haystack: &[u8],
-        mut window_end: usize,
-        backstop: usize,
-    ) -> Option<usize> {
-        let window_end_snapshot = window_end;
-        let skip_of = |we: usize| -> usize {
-            // Unsafe might make this faster, but the benchmarks
-            // were hard to interpret.
-            self.skip_table[haystack[we] as usize]
-        };
-
-        loop {
-            let mut skip = skip_of(window_end);
-            window_end += skip;
-            skip = skip_of(window_end);
-            window_end += skip;
-            if skip != 0 {
-                skip = skip_of(window_end);
-                window_end += skip;
-                skip = skip_of(window_end);
-                window_end += skip;
-                skip = skip_of(window_end);
-                window_end += skip;
-                if skip != 0 {
-                    skip = skip_of(window_end);
-                    window_end += skip;
-                    skip = skip_of(window_end);
-                    window_end += skip;
-                    skip = skip_of(window_end);
-                    window_end += skip;
-                    if skip != 0 {
-                        skip = skip_of(window_end);
-                        window_end += skip;
-                        skip = skip_of(window_end);
-                        window_end += skip;
-
-                        // If ten iterations did not make at least 16 words
-                        // worth of progress, we just fall back on memchr.
-                        if window_end - window_end_snapshot
-                            > 16 * mem::size_of::<usize>()
-                        {
-                            // Returning a window_end >= backstop will
-                            // immediatly break us out of the inner loop in
-                            // `find`.
-                            if window_end >= backstop {
-                                return Some(window_end);
-                            }
-
-                            continue; // we made enough progress
-                        } else {
-                            // In case we are already there, and so that
-                            // we will catch the guard char.
-                            window_end = window_end
-                                .checked_sub(1 + self.guard_reverse_idx)
-                                .unwrap_or(0);
-
-                            match memchr(self.guard, &haystack[window_end..]) {
-                                None => return None,
-                                Some(g_idx) => {
-                                    return Some(
-                                        window_end
-                                            + g_idx
-                                            + self.guard_reverse_idx,
-                                    );
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-
-            return Some(window_end);
-        }
-    }
-
-    /// Compute the ufast skip table.
-    fn compile_skip_table(pattern: &[u8]) -> Vec<usize> {
-        let mut tab = vec![pattern.len(); 256];
-
-        // For every char in the pattern, we write a skip
-        // that will line us up with the rightmost occurrence.
-        //
-        // N.B. the sentinel (0) is written by the last
-        // loop iteration.
-        for (i, c) in pattern.iter().enumerate() {
-            tab[*c as usize] = (pattern.len() - 1) - i;
-        }
-
-        tab
-    }
-
-    /// Select the guard character based off of the precomputed
-    /// frequency table.
-    fn select_guard(pattern: &[u8]) -> (u8, usize) {
-        let mut rarest = pattern[0];
-        let mut rarest_rev_idx = pattern.len() - 1;
-        for (i, c) in pattern.iter().enumerate() {
-            if freq_rank(*c) < freq_rank(rarest) {
-                rarest = *c;
-                rarest_rev_idx = (pattern.len() - 1) - i;
-            }
-        }
-
-        (rarest, rarest_rev_idx)
-    }
-
-    /// If there is another occurrence of the skip
-    /// char, shift to it, otherwise just shift to
-    /// the next window.
-    fn compile_md2_shift(pattern: &[u8]) -> usize {
-        let shiftc = *pattern.last().unwrap();
-
-        // For a pattern of length 1 we will never apply the
-        // shift rule, so we use a poison value on the principle
-        // that failing fast is a good thing.
-        if pattern.len() == 1 {
-            return 0xDEADBEAF;
-        }
-
-        let mut i = pattern.len() - 2;
-        while i > 0 {
-            if pattern[i] == shiftc {
-                return (pattern.len() - 1) - i;
-            }
-            i -= 1;
-        }
-
-        // The skip char never re-occurs in the pattern, so
-        // we can just shift the whole window length.
-        pattern.len() - 1
-    }
-
-    fn approximate_size(&self) -> usize {
-        (self.pattern.len() * mem::size_of::<u8>())
-            + (256 * mem::size_of::<usize>()) // skip table
-    }
-}
-
-fn freq_rank(b: u8) -> usize {
-    BYTE_FREQUENCIES[b as usize] as usize
-}
-
-#[cfg(test)]
-mod tests {
-    use super::{BoyerMooreSearch, FreqyPacked};
-
-    //
-    // Unit Tests
-    //
-
-    // The "hello, world" of string searching
-    #[test]
-    fn bm_find_subs() {
-        let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
-        let haystack = b"I keep seeing patterns in this text";
-        assert_eq!(14, searcher.find(haystack).unwrap());
-    }
-
-    #[test]
-    fn bm_find_no_subs() {
-        let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
-        let haystack = b"I keep seeing needles in this text";
-        assert_eq!(None, searcher.find(haystack));
-    }
-
-    //
-    // Regression Tests
-    //
-
-    #[test]
-    fn bm_skip_reset_bug() {
-        let haystack = vec![0, 0, 0, 0, 0, 1, 1, 0];
-        let needle = vec![0, 1, 1, 0];
-
-        let searcher = BoyerMooreSearch::new(needle);
-        let offset = searcher.find(haystack.as_slice()).unwrap();
-        assert_eq!(4, offset);
-    }
-
-    #[test]
-    fn bm_backstop_underflow_bug() {
-        let haystack = vec![0, 0];
-        let needle = vec![0, 0];
-
-        let searcher = BoyerMooreSearch::new(needle);
-        let offset = searcher.find(haystack.as_slice()).unwrap();
-        assert_eq!(0, offset);
-    }
-
-    #[test]
-    fn bm_naive_off_by_one_bug() {
-        let haystack = vec![91];
-        let needle = vec![91];
-
-        let naive_offset = naive_find(&needle, &haystack).unwrap();
-        assert_eq!(0, naive_offset);
-    }
-
-    #[test]
-    fn bm_memchr_fallback_indexing_bug() {
-        let mut haystack = vec![
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        ];
-        let needle = vec![1, 1, 1, 1, 32, 32, 87];
-        let needle_start = haystack.len();
-        haystack.extend(needle.clone());
-
-        let searcher = BoyerMooreSearch::new(needle);
-        assert_eq!(needle_start, searcher.find(haystack.as_slice()).unwrap());
-    }
-
-    #[test]
-    fn bm_backstop_boundary() {
-        let haystack = b"\
-// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-e_data.clone_created(entity_id, entity_to_add.entity_id);
-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-"
-        .to_vec();
-        let needle = b"clone_created".to_vec();
-
-        let searcher = BoyerMooreSearch::new(needle);
-        let result = searcher.find(&haystack);
-        assert_eq!(Some(43), result);
-    }
-
-    #[test]
-    fn bm_win_gnu_indexing_bug() {
-        let haystack_raw = vec![
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        ];
-        let needle = vec![1, 1, 1, 1, 1, 1, 1];
-        let haystack = haystack_raw.as_slice();
-
-        BoyerMooreSearch::new(needle.clone()).find(haystack);
-    }
-
-    //
-    // QuickCheck Properties
-    //
-
-    use quickcheck::TestResult;
-
-    fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
-        assert!(needle.len() <= haystack.len());
-
-        for i in 0..(haystack.len() - (needle.len() - 1)) {
-            if haystack[i] == needle[0]
-                && &haystack[i..(i + needle.len())] == needle
-            {
-                return Some(i);
-            }
-        }
-
-        None
-    }
-
-    quickcheck! {
-        fn qc_bm_equals_nieve_find(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
-            if pile1.len() == 0 || pile2.len() == 0 {
-                return TestResult::discard();
-            }
-
-            let (needle, haystack) = if pile1.len() < pile2.len() {
-                (pile1, pile2.as_slice())
-            } else {
-                (pile2, pile1.as_slice())
-            };
-
-            let searcher = BoyerMooreSearch::new(needle.clone());
-            TestResult::from_bool(
-                searcher.find(haystack) == naive_find(&needle, haystack))
-        }
-
-        fn qc_bm_equals_single(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
-            if pile1.len() == 0 || pile2.len() == 0 {
-                return TestResult::discard();
-            }
-
-            let (needle, haystack) = if pile1.len() < pile2.len() {
-                (pile1, pile2.as_slice())
-            } else {
-                (pile2, pile1.as_slice())
-            };
-
-            let bm_searcher = BoyerMooreSearch::new(needle.clone());
-            let freqy_memchr = FreqyPacked::new(needle);
-            TestResult::from_bool(
-                bm_searcher.find(haystack) == freqy_memchr.find(haystack))
-        }
-
-        fn qc_bm_finds_trailing_needle(
-            haystack_pre: Vec<u8>,
-            needle: Vec<u8>
-        ) -> TestResult {
-            if needle.len() == 0 {
-                return TestResult::discard();
-            }
-
-            let mut haystack = haystack_pre.clone();
-            let searcher = BoyerMooreSearch::new(needle.clone());
-
-            if haystack.len() >= needle.len() &&
-                searcher.find(haystack.as_slice()).is_some() {
-                return TestResult::discard();
-            }
-
-            haystack.extend(needle.clone());
-
-            // What if the the tail of the haystack can start the
-            // needle?
-            let start = haystack_pre.len()
-                .checked_sub(needle.len())
-                .unwrap_or(0);
-            for i in 0..(needle.len() - 1) {
-                if searcher.find(&haystack[(i + start)..]).is_some() {
-                    return TestResult::discard();
-                }
-            }
-
-            TestResult::from_bool(
-                searcher.find(haystack.as_slice())
-                        .map(|x| x == haystack_pre.len())
-                        .unwrap_or(false))
-        }
-
-        // qc_equals_* is only testing the negative case as @burntsushi
-        // pointed out in https://github.com/rust-lang/regex/issues/446.
-        // This quickcheck prop represents an effort to force testing of
-        // the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle
-        // already check some of the positive cases, but they don't cover
-        // cases where the needle is in the middle of haystack. This prop
-        // fills that hole.
-        fn qc_bm_finds_subslice(
-            haystack: Vec<u8>,
-            needle_start: usize,
-            needle_length: usize
-        ) -> TestResult {
-            if haystack.len() == 0 {
-                return TestResult::discard();
-            }
-
-            let needle_start = needle_start % haystack.len();
-            let needle_length = needle_length % (haystack.len() - needle_start);
-
-            if needle_length == 0 {
-                return TestResult::discard();
-            }
-
-            let needle = &haystack[needle_start..(needle_start + needle_length)];
-
-            let bm_searcher = BoyerMooreSearch::new(needle.to_vec());
-
-            let start = naive_find(&needle, &haystack);
-            match start {
-                None => TestResult::from_bool(false),
-                Some(nf_start) =>
-                    TestResult::from_bool(
-                        nf_start <= needle_start
-                            && bm_searcher.find(&haystack) == start
-                    )
-            }
-        }
-
-        fn qc_bm_finds_first(needle: Vec<u8>) -> TestResult {
-            if needle.len() == 0 {
-                return TestResult::discard();
-            }
-
-            let mut haystack = needle.clone();
-            let searcher = BoyerMooreSearch::new(needle.clone());
-            haystack.extend(needle);
-
-            TestResult::from_bool(
-                searcher.find(haystack.as_slice())
-                        .map(|x| x == 0)
-                        .unwrap_or(false))
-        }
-    }
-}
commit	3874808a33005b78d4dcbc84fc41afe856c2904c	[log] [tgz]
author	Joel Galenson <jgalenson@google.com>	Wed May 19 16:51:51 2021 -0700
committer	Joel Galenson <jgalenson@google.com>	Wed May 19 16:51:51 2021 -0700
tree	02c0b3035a3cf71f10f04c344fb3e636a1090409
parent	ffb6030a530f824e597fae242b0c77c0743fc545 [diff] [blame]